import logging

from bs4 import BeautifulSoup


class ProxyPoolScraper:
    def __init__(self, url, bs_parser="lxml"):
        self.parser = WebParser(url)  # WebParser is a project helper defined elsewhere
        self.bs_parser = bs_parser
        self.logger = logging.getLogger(__name__)  # assumed; the project may wire logging differently

    def get_proxy_stream(self, limit):
        raw_records = self.extract_table_raw_records()
        clean_records = list(
            map(self._clear_up_record, raw_records)
        )
        for record in clean_records[:limit]:
            self.logger.info(f"Proxy record: {record}")
            # Rows without <td> cells (e.g. the header row) clean up to an
            # empty list; skip them instead of building a ProxyRecord.
            if record:
                yield ProxyRecord(*record)

    def extract_table_raw_records(self):
        # Fetch the page and return every <tr> row inside the element with id="list".
        content = self.parser.get_content()
        soup_object = BeautifulSoup(content, self.bs_parser)
        return (
            soup_object
            .find(id="list")
            .find_all("tr")
        )

    def _clear_up_record(self, raw_record):
        # Reduce a table row to the plain text of its <td> cells.
        return [
            val.text for val
            in raw_record.find_all("td")
        ]
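
A minimal usage sketch, assuming the class above is in scope and that the target page exposes its proxy table under id="list"; the URL and the limit are illustrative, not part of the snippet above:

# Hypothetical usage of ProxyPoolScraper; the URL is an assumption.
scraper = ProxyPoolScraper("https://free-proxy-list.net")
for proxy_record in scraper.get_proxy_stream(limit=20):
    print(proxy_record)
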
class ProxyPoolValidator:
    def __init__(self, url, timeout=10):
        self.timeout = timeout
        self.parser = WebParser(url, rotate_header=True)
        self.logger = logging.getLogger(__name__)  # assumed, as in the scraper above

    def validate_proxy(self, proxy_record):
        content = self.parser.get_content(timeout=self.timeout,
                                          proxies=proxy_record.proxy)
        proxy_status = ProxyStatus(proxy_record.proxy, content is not None)
        self.logger.info(f"Proxy status: {proxy_status}")
        return proxy_status
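
A short wiring sketch for this single-check validator, reusing ProxyPoolScraper from the previous snippet; the health-check URL and the limit are assumptions:

# Hypothetical wiring of scraper and validator.
validator = ProxyPoolValidator("https://www.google.com", timeout=5)
scraper = ProxyPoolScraper("https://free-proxy-list.net")
for record in scraper.get_proxy_stream(limit=10):
    status = validator.validate_proxy(record)
    print(status)
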
Example #3
import atoma  # RSS/Atom feed parser


class NewsProducer:
    def __init__(self, rss_feed):
        self.parser = WebParser(rss_feed, rotate_header=True)  # WebParser and NewsFormatter come from the surrounding project
        self.formatter = NewsFormatter()

    def _extract_news_feed_items(self, proxies):
        content = self.parser.get_content(proxies=proxies)
        news_feed = atoma.parse_rss_bytes(content)
        return news_feed.items

    def get_news_stream(self, proxies):
        news_feed_items = self._extract_news_feed_items(proxies)
        for entry in news_feed_items:
            formatted_entry = self.formatter.format_entry(entry)
            yield formatted_entry
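
A hedged usage sketch for NewsProducer; the feed URL is illustrative, and the proxies mapping follows the requests convention, which is an assumption about what WebParser.get_content expects:

# Hypothetical usage; feed URL and proxy address are placeholders.
producer = NewsProducer("http://feeds.bbci.co.uk/news/rss.xml")
proxies = {"http": "http://1.2.3.4:8080", "https": "http://1.2.3.4:8080"}
for formatted_entry in producer.get_news_stream(proxies):
    print(formatted_entry)
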
import logging
import time


class ProxyPoolValidator:
    def __init__(self, url, timeout=10, checks=3, sleep_interval=0.1):
        self.timeout = timeout
        self.checks = checks
        self.sleep_interval = sleep_interval
        self.parser = WebParser(url, rotate_header=True)
        self.logger = logging.getLogger(__name__)  # assumed, as above

    def validate_proxy(self, proxy_record):
        # Probe the target URL several times through the same proxy and score it.
        consecutive_checks = []
        for _ in range(self.checks):
            content = self.parser.get_content(timeout=self.timeout,
                                              proxies=proxy_record.proxy)
            time.sleep(self.sleep_interval)
            consecutive_checks.append(int(content is not None))

        # health is the fraction of successful probes; > 0.66 means at least
        # two of the default three checks returned a response.
        health = sum(consecutive_checks) / self.checks
        proxy_status = ProxyStatus(proxy=proxy_record.proxy,
                                   health=health,
                                   is_valid=health > 0.66)
        self.logger.info(f"Proxy status: {proxy_status}")
        return proxy_status
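
A sketch of how this multi-check validator could filter and rank scraped proxies, keeping only those that pass at least two of the three probes; the URLs, the limit, and the sort by health are illustrative, not part of the snippet above:

# Hypothetical end-to-end filter built on the classes above.
scraper = ProxyPoolScraper("https://free-proxy-list.net")
validator = ProxyPoolValidator("https://www.google.com", timeout=5)
statuses = [
    validator.validate_proxy(record)
    for record in scraper.get_proxy_stream(limit=20)
]
healthy = sorted(
    (s for s in statuses if s.is_valid),
    key=lambda s: s.health,
    reverse=True,
)
print([s.proxy for s in healthy])
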