Пример #1
0
    def __init__(self):
        """Set the start URL and the single crawl rule, then initialise the parent.

        The rule is built with ``follow=True`` and uses the ``parse`` callable
        that is in scope as its callback.
        """
        self.start_urls = ['https://dashboard.ed.gov/']

        # Gather the keys of every extension mapping exposed by ``h`` and drop
        # the first character of each key (presumably a leading dot -- confirm
        # against ``h``); the result is passed as ``deny_extensions``.
        extension_maps = (
            h.get_data_extensions(),
            h.get_document_extensions(),
            h.get_avoidable_extensions(),
        )
        denied_extensions = [
            key[1:]
            for mapping in extension_maps
            for key in mapping.keys()
        ]

        # NOTE(review): judging by the helper's name, these are the domains
        # handled by the other crawlers (everything except 'edgov') -- confirm.
        foreign_domains = h.retrieve_crawlers_allowed_domains(
            except_crawlers=['edgov'])

        link_extractor = LinkExtractor(
            allow=self.allowed_regex,
            deny_extensions=denied_extensions,
            process_value=self.process_value,
            unique=True,
            deny_domains=foreign_domains,
        )
        self.rules = [Rule(link_extractor, callback=parse, follow=True)]

        # The rules are assigned first; the parent initialiser runs afterwards.
        super(Crawler, self).__init__()
Пример #2
0
    def __init__(self):
        """Set the start URLs and the single crawl rule, then initialise the parent.

        The rule is built with ``follow=True`` and uses the ``parse`` callable
        that is in scope as its callback.
        """
        self.start_urls = [
            'https://www2.ed.gov/finaid/prof/resources/data/teach-institution.html',
            'https://www2.ed.gov/',
            'https://www2.ed.gov/about/offices/list/index.html'
        ]

        # Keys of all extension mappings from ``h``, each with its first
        # character removed (presumably a leading dot -- confirm against ``h``).
        denied_extensions = [
            key[1:]
            for mapping in (
                h.get_data_extensions(),
                h.get_document_extensions(),
                h.get_avoidable_extensions(),
            )
            for key in mapping.keys()
        ]

        # NOTE(review): judging by the helper's name, these are the domains
        # handled by the other crawlers (everything except 'edgov') -- confirm.
        foreign_domains = h.retrieve_crawlers_allowed_domains(
            except_crawlers=['edgov'])

        link_extractor = LinkExtractor(
            allow=self.allowed_regex,
            deny_extensions=denied_extensions,
            deny_domains=foreign_domains,
        )
        self.rules = [Rule(link_extractor, callback=parse, follow=True)]

        # The rules are assigned first; the parent initialiser runs afterwards.
        super(Crawler, self).__init__()
Пример #3
0
    def __init__(self):
        """Set the start URLs and the single crawl rule, then initialise the parent.

        The rule is built with ``follow=True`` and uses the ``parse`` callable
        that is in scope as its callback. ``process_links`` and
        ``process_request`` are given as strings naming the hooks to use.
        """
        self.start_urls = [
            'http://rems.ed.gov/',
            'https://rems.ed.gov/REMSPublications.aspx',
            'https://rems.ed.gov/#resources',
        ]

        # NOTE(review): the extractor is deliberately created without
        # ``allow_domains``, ``allow`` or ``deny_extensions`` (they were
        # disabled in the original code). The extension list that only fed
        # ``deny_extensions`` was therefore dead work and is no longer built.
        # Presumably filtering is left to the hooks below -- confirm.
        link_extractor = LinkExtractor(
            process_value=self.process_value,
            unique=True,
        )

        # Make rules
        self.rules = [
            Rule(
                link_extractor,
                callback=parse,
                follow=True,
                process_links='process_links',
                process_request='process_request',
            ),
        ]

        # The rules are assigned first; the parent initialiser runs afterwards.
        super(Crawler, self).__init__()
Пример #4
0
def document_checker(tag_attr: str):
    """Tell whether a tag attribute value points at a document file.

    Intended as a filter for BeautifulSoup. ``tag_attr`` matches when it
    ends with ``.`` followed by any key of ``h.get_document_extensions()``.

    Args:
        tag_attr: attribute value to inspect; may be empty or ``None``.

    Returns:
        ``True`` on a match, ``False`` otherwise (including for an empty
        or ``None`` value).
    """
    # An absent or empty attribute never matches.
    if tag_attr is None or tag_attr == '':
        return False

    # Stops at the first extension that matches.
    return any(
        tag_attr.endswith(f'.{extension}')
        for extension in h.get_document_extensions().keys()
    )
Пример #5
0
    def __init__(self):
        """Set the start URL and the single crawl rule, then initialise the parent.

        The rule is built with ``follow=True`` and uses the ``parse`` callable
        that is in scope as its callback.
        """
        self.start_urls = ['https://sites.ed.gov/']

        # Keys of all extension mappings from ``h``, each with its first
        # character removed (presumably a leading dot -- confirm against ``h``).
        extension_maps = (
            h.get_data_extensions(),
            h.get_document_extensions(),
            h.get_avoidable_extensions(),
        )
        denied_extensions = [
            key[1:]
            for mapping in extension_maps
            for key in mapping.keys()
        ]

        link_extractor = LinkExtractor(
            allow=self.allowed_regex,
            deny_extensions=denied_extensions,
        )
        self.rules = [Rule(link_extractor, callback=parse, follow=True)]

        # The rules are assigned first; the parent initialiser runs afterwards.
        super(Crawler, self).__init__()