import logging
from collections import deque

from bs4 import BeautifulSoup


def crawl(links):
    blacklist = Blacklist.factory("url", list(links))
    links_to_process = deque(blacklist.remove_blacklisted())
    email_blacklist = Blacklist(scrub_words=[
        'example', 'email', 'support', 'domain', 'orders', 'info',
        'github', 'registration', 'mozilla', 'donate', 'feedback',
        'newsletter', 'name'
    ])
    email_writer = EmailWriter(email_blacklist)
    processed_urls = set()
    emails = set()
    logger = logging.getLogger()

    while links_to_process:
        url = links_to_process.pop()
        # add to processed immediately, so a failure below cannot re-enqueue it
        processed_urls.add(url)
        url_extras = get_url_extras(url)
        response = get_url_response(url)
        if not response.ok:
            continue
        try:
            new_emails = get_email_set_from_response(response)
        except TimeoutError:
            continue
        # track the emails so crawl() actually returns everything it found
        emails.update(new_emails)
        email_writer.add_emails(new_emails)

        # create a BeautifulSoup tree for the HTML document
        soup = BeautifulSoup(response.text, "html.parser")

        # find and process all the anchors in the document
        for anchor in soup.find_all("a"):
            # extract the link URL from the anchor, if it has one
            link = anchor.attrs.get("href", '')
            # resolve relative links against the host base or the current path
            if link.startswith('/'):
                link = url_extras[1] + link
            elif not link.startswith('http'):
                link = url_extras[2] + link
            # enqueue the new URL only if it is neither queued nor processed yet
            if link not in links_to_process and link not in processed_urls:
                if not blacklist.is_blacklisted(link):
                    links_to_process.appendleft(link)

        # scrub the link set to ensure the crawler doesn't waste time on one site
        scrubbed = scrub(list(links_to_process), 4)
        logger.debug(scrubbed)
        links_to_process = deque(scrubbed)

    return emails
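# The relative-link resolution above assumes get_url_extras(url) returns a
# tuple whose element [1] is the scheme-plus-host base (used for root-relative
# links) and element [2] is the current page's directory URL (used for
# path-relative links). The real helper is not shown in this section; this is
# only a minimal sketch under that assumption, built on urllib.parse.
from urllib.parse import urlparse


def get_url_extras(url):
    """Hypothetical helper: (full url, scheme://host, current directory URL)."""
    parts = urlparse(url)
    base = f"{parts.scheme}://{parts.netloc}"
    # directory of the current page: everything up to the last '/' in the path
    directory = parts.path.rsplit('/', 1)[0] + '/'
    return url, base, base + directory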
def test_email_blacklist(self):
    blacklist = Blacklist.factory("emails")
    # an address containing a blacklisted word should be rejected
    self.assertTrue(blacklist.is_blacklisted("*****@*****.**"))
    # a clean address should pass through
    self.assertFalse(blacklist.is_blacklisted("*****@*****.**"))
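# The test above exercises Blacklist.factory and is_blacklisted, which are
# defined elsewhere in the project. A minimal sketch consistent with how
# crawl() uses the class: factory() loads a default scrub-word list for the
# given kind, is_blacklisted() does a substring check, and
# remove_blacklisted() filters the items handed to the factory. The class
# body and the default word lists here are illustrative assumptions, not the
# project's actual implementation.
class Blacklist:
    DEFAULT_SCRUB_WORDS = {
        "url": ["facebook", "twitter", "linkedin"],    # assumed defaults
        "emails": ["example", "support", "noreply"],   # assumed defaults
    }

    def __init__(self, scrub_words=None, items=None):
        self.scrub_words = scrub_words or []
        self.items = items or []

    @classmethod
    def factory(cls, kind, items=None):
        return cls(scrub_words=cls.DEFAULT_SCRUB_WORDS.get(kind, []),
                   items=items)

    def is_blacklisted(self, value):
        # a value is blacklisted if any scrub word appears anywhere in it
        return any(word in value.lower() for word in self.scrub_words)

    def remove_blacklisted(self):
        return [item for item in self.items if not self.is_blacklisted(item)]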