Example #1
    async def build_profile(self, device, headless):
        scenario = self.scenario
        profile = self.env.profile
        customization_data = self.customization_data

        scenario_func = scenarii[scenario]
        if scenario in customization_data.get("scenario", {}):
            options = customization_data["scenario"][scenario]
            LOG("Loaded options for that scenario %s" % str(options))
        else:
            options = {}

        # Adding general options
        options["platform"] = self.env.target_platform

        if not self.force_new:
            try:
                custom_name = customization_data["name"]
                get_profile(profile, self.env.target_platform, scenario, custom_name)
            except ProfileNotFoundError:
                # XXX we'll use a fresh profile for now
                fresh_profile(profile, customization_data)
        else:
            fresh_profile(profile, customization_data)

        LOG("Updating profile located at %r" % profile)
        metadata = Metadata(profile)

        LOG("Starting the Gecko app...")
        self.env.prepare(logfile=self._log_filename("adb"))
        geckodriver_logs = self._log_filename("geckodriver")
        LOG("Writing geckodriver logs in %s" % geckodriver_logs)
        try:
            firefox_instance = Firefox(**self.env.get_browser_args(headless))
            with open(geckodriver_logs, "w") as glog:
                async with get_session(
                    self.env.get_geckodriver(log_file=glog), firefox_instance
                ) as session:
                    self.env.check_session(session)
                    LOG("Running the %s scenario" % scenario)
                    metadata.update(await scenario_func(session, options))
                    LOG("%s scenario done." % scenario)

        except Exception:
            ERROR("%s scenario broke!" % scenario)

        self.env.stop_browser()
        self.env.collect_profile()

        # writing metadata
        metadata.write(
            name=self.scenario,
            customization=self.customization_data["name"],
            version=self.env.get_browser_version(),
            platform=self.env.target_platform,
        )

        LOG("Profile at %s" % profile)
        LOG("Done.")
        return metadata
Example #2
async def scraper(url: str, i=-1, timeout: int = 60, start=None) -> list:
    """Scrapes the HTML of the passed URL using arsenic webdriver.
    Returns a list of dictionaries, with product id, slug, link."""

    service = services.Chromedriver()
    browser = browsers.Chrome()
    browser.capabilities = {
        'goog:chromeOptions': {'args': ['--headless', '--disable-gpu']}
    }

    async with get_session(service, browser) as session:
        # if the page doesn't respond, return an empty URLs list.
        try:
            await asyncio.wait_for(session.get(url), timeout=timeout)
        except asyncio.TimeoutError:
            return []

        body = await session.get_page_source()
        links = await get_fabric_links(body)

        if start is not None:
            end = time.time() - start
            print(f"{i} took {end} seconds")

        return links
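The i and start arguments above exist only for timing diagnostics. A minimal sketch of how this coroutine might be fanned out over several URLs with asyncio.gather, assuming the scraper coroutine (and its get_fabric_links helper) from the example above is importable; the URLs are placeholders:

import asyncio
import time

async def scrape_all(urls, timeout=60):
    start = time.time()
    # one task per URL; they share the event loop, so the pages are fetched concurrently
    tasks = [scraper(url, i=i, timeout=timeout, start=start)
             for i, url in enumerate(urls)]
    results = await asyncio.gather(*tasks)
    # pair each URL with the list of links returned for it
    return dict(zip(urls, results))

if __name__ == "__main__":
    links_by_url = asyncio.run(
        scrape_all(["https://example.com/page-1", "https://example.com/page-2"]))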
Example #3
async def scraper(url: str, i=-1, timeout: int = 60, start=None) -> dict:
    """Scrapes the HTML of the passed URL using arsenic webdriver.
    Returns a list of dictionaries, with product id, slug, link."""

    service = services.Chromedriver()
    browser = browsers.Chrome()
    browser.capabilities = {
        'goog:chromeOptions': {
            'args': ['--headless', '--disable-gpu']
        }
    }

    async with get_session(service, browser) as session:
        # if the page doesn't respond, return an empty URLs list.
        try:
            await asyncio.wait_for(session.get(url), timeout=timeout)
        except asyncio.TimeoutError:
            return []

        await asyncio.sleep(10)
        body = await session.get_page_source()  # getting raw HTML
        html_r = await get_parsable_html(body)  # converting to parsable HTML
        links = await get_fabric_links(html_r)  # getting relative links
        product_data = await get_product_data(url, html_r)

        dataset = {"links": links, "product_data": product_data}
        #_____________printing time consumption_________________#
        if start is not None:
            end = time.time() - start
            print(f"{i} took {end} seconds")

        return dataset
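The get_parsable_html, get_fabric_links and get_product_data helpers are not part of this example. Purely as an illustration, the first two might look roughly like the hypothetical sketch below, built on BeautifulSoup; the product-link prefix is an assumption, not taken from the original code:

from bs4 import BeautifulSoup

async def get_parsable_html(body: str) -> BeautifulSoup:
    # hypothetical helper: wrap the raw page source in a parse tree
    return BeautifulSoup(body, "html.parser")

async def get_fabric_links(html_r: BeautifulSoup) -> list:
    # hypothetical helper: collect relative product links from the page
    return [a["href"] for a in html_r.find_all("a", href=True)
            if a["href"].startswith("/products/")]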
Example #4
	async def google(self, ctx, *, query):
		await ctx.channel.trigger_typing()
		try:
			await self.bot.loop.run_in_executor(None, func=functools.partial(self.assist, ctx.author.id, query))
		except Exception as e:
			if 'text_query too long.' in str(e):
				return await ctx.error(f'That query is too long. Try something shorter')
			return await ctx.error(f'Something went wrong.')
		if ctx.author.id not in self.responses:
			return await ctx.send(f'<a:okaygoogle:661951491082551306> Something went wrong. Try again later')
		async with get_session(self.service, self.browser) as session:
			await session.set_window_size(1920, 1080)
			sub = 'devapi' if self.bot.dev else 'api'
			await session.get(f'https://{sub}.gaminggeek.dev/assist/{ctx.author.id}')
			try:
				await session.execute_script('document.body.style.backgroundImage = \'url("https://picsum.photos/1920/1080")\';')
				namere = r'<div class="show_text_content">Your name is .*\.<\/div>'
				namesub = f'<div class=\'show_text_content\'>Your name is {ctx.author.name}.</div>'
				await session.execute_script(f'document.body.innerHTML = document.body.innerHTML.replace(/{namere}/gm, "{namesub}");')
				namere = r'<div class="show_text_content">I remember you telling me your name was .*\.<\/div>'
				namesub = f'<div class=\'show_text_content\'>I remember you telling me your name was {ctx.author.name}.</div>'
				await session.execute_script(f'document.body.innerHTML = document.body.innerHTML.replace(/{namere}/gm, "{namesub}");')
			except Exception:
				pass
				# await ctx.error('script did an oopsie')
			await asyncio.sleep(1.5)
			await ctx.send(file=discord.File((await session.get_screenshot()), filename='google.png'))
			return await session.close()
		return await ctx.error('If you\'re seeing this, something went wrong I guess ¯\_(ツ)_/¯')
Example #5
    async def make_snapshot(self, website: str):

        if self.session is None:
            await self.init_session()

        while self.busy:
            await asyncio.sleep(1)

        async with get_session(self.service, self.browser) as session:

            self.busy = True

            await session.get(website)
            image = await session.get_screenshot()
            image.seek(0)

            # the "async with" block closes the WebDriver session on exit

            headers = {"Authorization": "Client-ID 6656d64547a5031"}
            data = {"image": image}

            async with self.session.post("https://api.imgur.com/3/image",
                                         data=data,
                                         headers=headers) as r:

                link = (await r.json())["data"]["link"]
                r.close()

                del image

                self.busy = False
                return link
Example #6
async def visit(config):
    service = services.Geckodriver()
    browser = browsers.Firefox(
        **{"moz:firefoxOptions": {
            "args": ["-headless"]
        }})

    logging.info("Hitting url " + config["url"])
    try:
        async with get_session(service, browser) as session:
            await session.delete_all_cookies()
            await session.get(config["url"])

            for k, c in config.get("cookies", {}).items():
                value = c.get("value", "")
                domain = c.get("domain", None)
                path = c.get("path", "/")
                secure = c.get("secure", False)
                await session.add_cookie(k,
                                         value,
                                         path=path,
                                         domain=domain,
                                         secure=secure)

            await session.get(config["url"])
    except Exception as e:
        logging.info("Exception hitting url " + str(config) +
                     " with exception " + e.message)
Example #7
async def hello_world():
    service = services.Chromedriver()
    browser = browsers.Chrome()
    async with get_session(service, browser) as session:
        await session.get('http://www.baidu.com/')
        search_box = await session.wait_for_element(5, '#kw')
        await search_box.send_keys('arsenic')
        await search_box.send_keys(keys.ENTER)
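Like every coroutine on this page, hello_world has to be scheduled on an event loop. A minimal entry point with the standard library, assuming the coroutine above is defined in the same module:

import asyncio

if __name__ == "__main__":
    # asyncio.run creates an event loop, runs the coroutine to completion and closes the loop
    asyncio.run(hello_world())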
Example #8
async def build_profile(args):
    scenarii = scenario[args.scenarii]

    # getting the latest archive from the server
    if TASK_CLUSTER:
        url = TC_LINK % args.scenarii
        basename = 'today-%s.tgz' % args.scenarii
    else:
        basename = '%s-latest.tar.gz' % args.scenarii
        url = args.archives_server + '/%s' % basename

    exists, headers = check_exists(url)
    metadata = {}

    if exists:
        target = os.path.join(args.archives_dir, basename)
        archive = download_file(url, target=target, check_file=False)
        with tarfile.open(archive, "r:gz") as tar:
            logger.msg("Checking the tarball content...")
            size = len(list(tar))
            with progress.Bar(expected_size=size) as bar:
                def _extract(self, *args, **kw):
                    if not TASK_CLUSTER:
                        bar.show(bar.last_progress + 1)
                    try:
                        return self.old(*args, **kw)
                    finally:
                        pass
                        # if args[0].name == ".hp.json":
                        #   import pdb; pdb.set_trace()

                tar.old = tar.extract
                tar.extract = functools.partial(_extract, tar)
                tar.extractall(args.profile)

    logger.msg("Updating profile located at %r" % args.profile)

    f_args = ["-profile", args.profile]
    if platform.system() != 'Darwin':
        f_args.append('-headless')

    caps = {"moz:firefoxOptions": {"args": f_args}}
    if args.firefox is not None:
        caps['moz:firefoxOptions']['binary'] = args.firefox

    logger.msg("Starting the Fox...")
    with open('gecko.log', 'a+') as glog:
        async with get_session(CustomGeckodriver(log_file=glog),
                               Firefox(**caps)) as session:
            metadata = await scenarii(session, args)

    # writing metadata
    logger.msg("Creating metadata...")
    metadata['name'] = args.scenarii
    with open(os.path.join(args.profile, '.hp.json'), 'w') as f:
        f.write(json.dumps(metadata))

    logger.msg("Done.")
Example #9
    async def fetch_content(self, queue=queue):
        """Coroutine that collects data from a vacancy page.

        The coroutine receives a queue with vacancy data as input.
        Purpose: asynchronously collect vacancy information and write it to the database.
        The method first creates a database table for storing the information for the current date, then starts
        an infinite loop in which it waits for an item (a list of data) from the queue. When an item is received,
        it loads the vacancy page from the link contained in the item and asynchronously retrieves the company
        name, the required candidate experience, the employment type and the full vacancy description, then
        writes the collected data to the database. The loop exits when a None item is received from the queue.
        This method works together with the get_links method: having obtained a link from the search results
        page, get_links passes it to fetch_content. fetch_content starts loading the vacancy page and collecting
        its data; without waiting for that to finish, on receiving the next item from the queue it opens the next
        page and starts collecting data from it. This continues while the queue has items, so data is collected
        from all the links in the list concurrently, which cuts the time needed for the collection task many
        times over.

        """
        await engine.execute(
            CreateTable(HeadHunter_db)
        )  # create a table for storing vacancy data for the current date
        while True:
            item = await queue.get()
            if item is None:  # a None item means the queue is exhausted
                break
            async with get_session(self.service, self.browser) as web_session:
                # load the page from the link in the item and query its elements
                await web_session.get(item[0])
                company_object = await web_session.get_element(
                    'span[itemprop=name]')
                # get the company name for the current link
                company = await company_object.get_text()
                experience_object = await web_session.get_element(
                    'span[data-qa=vacancy-experience]')
                # get the required candidate experience for the current link
                experience = await experience_object.get_text()
                employment_mode_object = await web_session.get_element(
                    'p[data-qa=vacancy-view-employment-mode]')
                # get the employment type for the current link
                employment_mode = await employment_mode_object.get_text()
                description_object = await web_session.get_element(
                    'div[data-qa=vacancy-description]')
                # get the full vacancy description for the current link
                description = await description_object.get_text()
                async with engine.connect() as conn:
                    async with conn.begin() as trans:  # write the data to the database
                        await conn.execute(HeadHunter_db.insert().values(
                            link=item[0],
                            title=item[1],
                            salary=item[2],
                            responsibilites_short=item[3],
                            requirements_short=item[4],
                            company=company,
                            experience=experience,
                            employment_mode=employment_mode,
                            description=description))
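The docstring refers to a get_links producer that is not shown in this example. A minimal sketch of how such a producer and fetch_content could be wired together through an asyncio.Queue; the producer body, the list of search-result items and the parser object are assumptions made for illustration:

import asyncio

async def get_links(queue: asyncio.Queue, search_result_items: list):
    # hypothetical producer: push one item per vacancy, then signal the end of the queue
    for item in search_result_items:
        await queue.put(item)
    await queue.put(None)  # None tells fetch_content to stop

async def collect(parser, search_result_items: list):
    queue = asyncio.Queue()
    # run producer and consumer concurrently on the same queue
    await asyncio.gather(
        get_links(queue, search_result_items),
        parser.fetch_content(queue=queue),
    )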
Example #10
async def scraper(url):
    service = services.Chromedriver()
    browser = browsers.Chrome(
        chromeOptions={'args': ['--headless', '--disable-gpu']})
    async with get_session(service, browser) as session:
        await session.get(url)
        body = await session.get_page_source()
        print(body)
        return body
Example #11
async def hello_world():
    service = services.Geckodriver(binary=GECKODRIVER)
    browser = browsers.Firefox()
    async with get_session(service, browser) as session:
        await session.get("https://images.google.com/")
        search_box = await session.wait_for_element(5, "input[name=q]")
        await search_box.send_keys("Cats")
        await search_box.send_keys(keys.ENTER)
        await asyncio.sleep(10)
Example #12
    async def _fetch_articles(self, urls):
        async with get_session(self.service, self.browser) as session:
            tasks = []
            for url in urls:
                tasks.append(
                    self._request_url(url, session)
                )
            htmls = await asyncio.gather(*tasks)
        return htmls
Example #13
    async def crawl_news(self, news_site_url):
        async with get_session(self.service, self.browser) as session:
            html, _ = await self._request_url(news_site_url, session)

        tree = self._parse_etree_from_html(html)
        newslinks = self._parse_interesting_links_from_tree(tree)
        validated_newslinks = self._validate_links('https://yle.fi', newslinks)
        article_htmls = await self._fetch_articles(validated_newslinks)
        news_articles = self._parse_articles_from_htmls(article_htmls)
        return news_articles
Example #14
async def create_source_selenium(url: str, proxy_list: list = None) -> str:
    service = services.Chromedriver(binary="./chromedriver")
    if proxy_list is not None:
        browser = browsers.Chrome(chromeOptions={
            'args':
            ['--headless', f"--proxy-server={random.choice(proxy_list)}"]
        })
    else:
        browser = browsers.Chrome(chromeOptions={'args': ['--headless']})
    async with get_session(service, browser) as session:
        await session.get(url)
        return await session.get_page_source()
Example #15
async def scraper(url, i=-1, timeout=60, start=None, body_delay=10):
    service = services.Chromedriver()
    browser = browsers.Chrome(chromeOptions={
        'args': ['--headless', '--disable-gpu']
    })
    async with get_session(service, browser) as session:
        try:
            await asyncio.wait_for(session.get(url), timeout=timeout)
        except asyncio.TimeoutError:
            return []
        if body_delay > 0:
            await asyncio.sleep(body_delay)
        body = await session.get_page_source()
        return body
Example #16
async def scraper(url: str) -> str:
    """Scrapes the HTML of the passed URL using arsenic webdriver."""

    service = services.Chromedriver()
    browser = browsers.Chrome()
    browser.capabilities = {
        'goog:chromeOptions': {
            'args': ['--headless', '--disable-gpu']
        }
    }

    async with get_session(service, browser) as session:
        await session.get(url)
        body = await session.get_page_source()
        return body
Example #17
async def scraper(url: str):
    """Returns the HTML of the passed URL using arsenic webdriver."""

    service = services.Chromedriver()
    browser = browsers.Chrome()
    browser.capabilities = {
        'goog:chromeOptions': {
            'args': ['--headless', '--disable-gpu']
        }
    }

    # creating an arsenic session and running it inside of a context manager.
    async with get_session(service, browser) as session:
        await session.get(url)
        body = await session.get_page_source()
        return body
Example #18
    async def search(self, quest):
        async with get_session(self.service, self.browser) as session:
            await session.get(self.link + quote(quest))
            await session.wait_for_element(3, '.sg-layout__box')

            source = await session.get_page_source()
            soup = BeautifulSoup(source, features="lxml")
            links = [
                link.get('href') for link in soup.find_all('a')
                if '/task/' in link.get('href')
            ]

            tasks = [links[n] for n in range(0, len(links), 2)]
            screen = await session.get_screenshot()

            await session.close()
            return tasks, self.crop(screen)
Example #19
async def scraper_all(url):
    service = services.Chromedriver()
    browser = browsers.Chrome()
    browser.capabilities = {
        "goog:chromeOptions": {
            "args": ["--headless", "--disable-gpu"]
        }
    }
    async with get_session(service, browser) as session:
        await asyncio.wait_for(session.get(url), timeout=100)

        body = await session.get_page_source()
        soup = BeautifulSoup(body, 'html.parser')
        products = []
        box = soup.findAll("article", {"class": "serp-item list"})
        for l in box:
            try:
                link = l.find('a', href=True)['href']
                img = l.find('img')['src']
                name = l.find('h2').find('a').getText()
                time = l.find('time').getText()
                price = l.find('strong', {'class': 'item-price'}).getText()

            except:
                link = ''
                img = ''
                name = ''
                price = ''
                time = ''

            try:
                region = l.find('div', {
                    'class': 'content'
                }).findAll('p')[1].getText()
            except:
                region = ''

            products.append({
                'link': link,
                'img': img,
                'name': name,
                'price': price,
                'time': time,
                'region': region
            })
        return products
Example #20
async def get_google_answer_text(url_text):
    msg = None
    service = services.Chromedriver()
    browser = browsers.Chrome(**{"goog:chromeOptions": CHROME_OPTIONS})
    try:
        async with get_session(service, browser) as session:
            await session.get(
                f"https://www.google.com/search?hl=en&gl=UK&q={url_text}")
            msg = await get_financial_box_text(session)
            if not msg:
                msg = await get_kp_box_text(session)
            if not msg:
                msg = await get_kc_box_text(session)
    except:
        msg = None
        traceback.print_exc()
    return msg
Example #21
async def get_remote_session(root_url: str):
    if "REMOTE_BROWSER" not in os.environ:
        raise pytest.skip("No remote browser configured (REMOTE_BROWSER)")
    if "REMOTE_SERVICE" not in os.environ:
        raise pytest.skip("No remote service configured (REMOTE_SERVICE)")
    if "BROWSERSTACK_API_KEY" not in os.environ:
        raise pytest.skip(
            "No browserstack api key configured (BROWSERSTACK_API_KEY)")
    remote_browser = json.loads(os.environ["REMOTE_BROWSER"])
    browser_cls = getattr(browsers, remote_browser["browserName"])
    with bsl_context():
        async with get_session(
                services.Remote(url=os.environ["REMOTE_SERVICE"]),
                browser_cls(**remote_browser),
                root_url,
        ) as session:
            yield session
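The fixture reads the remote browser description as JSON from REMOTE_BROWSER and the WebDriver endpoint from REMOTE_SERVICE. The values below only illustrate the shape the code expects (browserName is looked up on arsenic.browsers); the endpoint URL and key are placeholders, not real credentials:

import os

os.environ["REMOTE_SERVICE"] = "https://user:key@hub.example-grid.com/wd/hub"  # placeholder endpoint
os.environ["REMOTE_BROWSER"] = '{"browserName": "Chrome"}'
os.environ["BROWSERSTACK_API_KEY"] = "placeholder-key"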
Example #22
async def incredible(url):
    service = services.Chromedriver()
    browser = browsers.Chrome()
    browser.capabilities = {
        "goog:chromeOptions": {
            "args": ["--headless", "--disable-gpu"]
        }
    }
    async with get_session(service, browser) as session:
        await asyncio.wait_for(session.get(url), timeout=100)

        body = await session.get_page_source()
        soup = BeautifulSoup(body, 'html.parser')
        products = []
        box = soup.find("div",
                        {"class": "flash-product-wrapper bani-container"})
        li = box.findAll("a", {'class': 'flash-product'})
        for l in li:
            link = l['href']
            img = l.find('div', {'class': 'img-box'}).find('img')['src']
            name = l.find('p', {'class': 'f-p-name'}).getText()
            price = l.find('span', {'class': 'specific-price price'}).getText()
            brand = l.find('p', {'class': 'f-p-logo'}).getText()

            try:
                discount = l.find('div', {
                    'class': 'discount'
                }).find('p').getText()
                last_price = l.find('span', {
                    'class': 'old-price price'
                }).getText()
            except:
                discount = ''
                last_price = ''

            products.append({
                'link': link,
                'img': img,
                'name': name,
                'discount': discount,
                'last_price': last_price,
                'price': price,
                'brand': brand
            })
        return products
Example #23
async def get_remote_session(root_url: str):
    if "REMOTE_BROWSER" not in os.environ:
        raise pytest.skip("No remote browser configured (REMOTE_BROWSER)")
    if "REMOTE_SERVICE" not in os.environ:
        raise pytest.skip("No remote service configured (REMOTE_SERVICE)")
    if "BROWSERSTACK_API_KEY" in os.environ:
        context = bsl_context
    else:
        context = null_context
    remote_browser = json.loads(os.environ["REMOTE_BROWSER"])
    browser_cls = getattr(browsers, remote_browser.pop("type"))
    with context():
        async with get_session(
                services.Remote(url=os.environ["REMOTE_SERVICE"]),
                browser_cls(**remote_browser),
                root_url,
        ) as session:
            yield session
Example #24
	async def google(self, ctx, *, query):
		await ctx.channel.trigger_typing()
		await self.bot.loop.run_in_executor(None, func=functools.partial(self.assist, ctx.author.id, query))
		if ctx.author.id not in self.responses:
			return await ctx.send(f'<a:okaygoogle:661951491082551306> Something went wrong. Try again later')
		async with get_session(self.service, self.browser) as session:
			await session.set_window_size(1366, 768)
			sub = 'devapi' if self.bot.dev else 'api'
			await session.get(f'https://{sub}.gaminggeek.dev/assist/{ctx.author.id}')
			try:
				await session.execute_script('document.body.style.backgroundImage = \'url("https://picsum.photos/1366/768")\';')
			except Exception:
				pass
				# await ctx.error('script did an oopsie')
			await asyncio.sleep(1.5)
			await ctx.send(file=discord.File((await session.get_screenshot()), filename='google.png'))
			return await session.close()
		return await ctx.error('If you\'re seeing this, something went wrong I guess ¯\_(ツ)_/¯')
Example #25
    async def fetch_content(self):
        """Coroutine that collects data from a vacancy page.

        The coroutine receives a queue with vacancy data as input.
        Purpose: asynchronously collect vacancy information and write it to the database.
        The method first creates a database table for storing the information for the current date, then starts
        an infinite loop in which it waits for an item (a list of data) from the queue. When an item is received,
        it loads the vacancy page from the link contained in the item and asynchronously retrieves the full
        vacancy description. The collected data (vacancy link, vacancy title, salary, candidate requirements,
        company name, employment type, full vacancy description) is written to the database.
        The loop exits when a None item is received from the queue.
        This method works together with the get_links method: having obtained a link from the search results
        page, get_links passes it to fetch_content. fetch_content starts loading the vacancy page and collecting
        its data; without waiting for that to finish, on receiving the next item from the queue it opens the next
        page and starts collecting data from it. This continues while the queue has items, so data is collected
        from all the links in the list concurrently, which cuts the time needed for the collection task many
        times over.

        """
        await engine.execute(
            CreateTable(MoiKrug_db)
        )  # create a table for storing vacancy data for the current date
        while True:
            item = await self.queue.get()  # wait for the next item to appear in the queue
            if item is None:  # a None item means the end of the queue
                break
            async with get_session(self.service, self.browser) as web_session:
                await web_session.get(item[0])  # load the vacancy page
                description_object = await web_session.get_element(
                    'div[class=vacancy_description]')
                description = await description_object.get_text()  # get the vacancy description
                async with engine.connect() as conn:
                    async with conn.begin() as trans:
                        # write the data to the database
                        await conn.execute(MoiKrug_db.insert().values(
                            link=item[0],
                            title=item[1],
                            salary=item[2],
                            skills=item[3],
                            company=item[4],
                            occupation=item[5],
                            description=description))
Example #26
async def build_profile(args):
    scenarii = scenario[args.scenarii]
    if not args.force_new:
        get_profile(args)
    logger.msg("Updating profile located at %r" % args.profile)
    metadata_file = os.path.join(args.profile, ".hp.json")

    with open(metadata_file) as f:
        metadata = json.loads(f.read())

    f_args = ["-profile", args.profile]
    if platform.system() != "Darwin":
        f_args.append("-headless")

    caps = {"moz:firefoxOptions": {"args": f_args}}
    if args.firefox is not None:
        caps["moz:firefoxOptions"]["binary"] = args.firefox

    logger.msg("Starting the Fox...")
    with open("gecko.log", "a+") as glog:
        async with get_session(CustomGeckodriver(log_file=glog),
                               Firefox(**caps)) as session:
            logger.msg("Running the %s scenario" % args.scenarii)
            metadata.update(await scenarii(session, args))

    # writing metadata
    logger.msg("Creating metadata...")
    ts = str(datetime.datetime.now())
    if "created" not in metadata:
        metadata["created"] = ts
    metadata["updated"] = ts
    metadata["name"] = args.scenarii
    metadata["platform"] = sys.platform
    metadata["age"] = get_age(metadata)
    metadata["version"] = "69.0a1"  # add the build id XXX
    metadata["customization"] = "vanilla"  # add themes

    with open(metadata_file, "w") as f:
        f.write(json.dumps(metadata))

    logger.msg("Profile at %s" % args.profile)
    logger.msg("Done.")
Example #27
async def scraper(url, i=-1, timeout=60, start=None):
    service = services.Chromedriver()
    browser = browsers.Chrome(
        chromeOptions={'args': ['--headless', '--disable-gpu']})
    async with get_session(service, browser) as session:
        try:
            await asyncio.wait_for(session.get(url), timeout=timeout)
        except asyncio.TimeoutError:
            return []
        await asyncio.sleep(10)
        body = await session.get_page_source()  # save this locally??
        content = await get_parsable_html(body)
        links = await get_links(content)
        product_data = await get_product_data(url, content)
        if start is not None:
            end = time.time() - start
            print(f'{i} took {end} seconds')
        # print(body)
        dataset = {"links": links, "product_data": product_data}
        return dataset
Example #28
async def main():
    load_dotenv()

    service = services.Chromedriver(log_file=os.devnull)
    browser = browsers.Chrome()

    telegram_client = TelegramClient(token=os.getenv("TG_TOKEN"),
                                     default_channel=os.getenv("TG_CHANNEL"))
    loguru_client = LoguruClient()
    messenger = Messenger([loguru_client, telegram_client])

    async with get_session(service, browser) as session:
        extractor = CUExtractor(session)
        memory = Memory()

        while True:
            items = await extractor.extract()
            added: List[Entry] = memory.update(items)[0]
            for entry in added:
                messenger.send(entry.to_markdown())
            await asyncio.sleep(DELAY)
Example #29
    async def qdocs(self, ctx, arg):
        if len(arg) > self.limit:
            return await ctx.send(f'`Query length greater than {self.limit}`')
        query_url = f'https://qiskit.org/documentation/search.html?q={arg}&check_keywords=yes&area=default#'

        service = services.Chromedriver()
        browser = browsers.Chrome()
        browser.capabilities = {
            "goog:chromeOptions": {
                "args": ["--headless", "--disable-gpu"]
            }
        }
        async with get_session(service, browser) as session:
            try:
                await session.get(query_url)
            except asyncio.TimeoutError:
                return await ctx.send('`Failed | Time Limit Exceeded`')
            else:
                source = None
                try:
                    source = await asyncio.wait_for(session.get_page_source(),
                                                    timeout=10)
                except asyncio.TimeoutError:
                    return await ctx.send('`Failed | Time Limit Exceeded`')
                else:
                    soup = BeautifulSoup(source, 'html.parser')
                    summary = soup.select('.search')
                    res = []
                    description = f''
                    for li in summary[0].find_all('li'):
                        link = li.find('a', href=True)
                        res.append(
                            f'[`{link.contents[0]}`]({self.render_link + link["href"]})'
                        )

                    embed = discord.Embed(title=f'`Results for: {arg}`',
                                          description='\n'.join(res),
                                          color=0xe8e3e3)

                    return await ctx.send(embed=embed)
Example #30
async def scraper(url):
    service = services.Chromedriver()
    browser = browsers.Chrome()
    browser.capabilities = {
        "goog:chromeOptions": {"args": ["--headless", "--disable-gpu"]}
    }
    async with get_session(service, browser) as session:
        await asyncio.wait_for(session.get(url), timeout=100)

        body = await session.get_page_source()
        soup = BeautifulSoup(body, 'html.parser')
        products = []
        box = soup.find("ul", {"class": "category_styles_product_card_list__1Xocv"})
        li = box.findAll("li")
        for l in li:
            try:
                link = 'https://timcheh.com' + l.find('a', href=True)['href']
                img = l.find('img')['src']
                name = l.find('h3').getText()
                price = l.find('div', {'class': 'styles_price__cldWW'}).getText()
            except:
                link = ''
                img = ''
                name = ''
                price = ''
            try:
                discount = l.find('div', {'class': 'styles_discount_number__39goM'}).find('span').getText()
                old_price = l.find('div', {'class': 'styles_old_price__35bDJ'}).getText()
            except:
                discount = ''
                old_price = ''
            try:
                bonous = l.find('span', {'class': 'styles_caption__3SE4x'}).getText()
            except:
                bonous = ''

            products.append({
                'link': link,
                'img': img,
                'name': name,
                'discount': discount,
                'last_price': old_price,
                'price': price,
                'bonous': bonous
            })
        return products