示例#1
0
import urllib.parse
import urllib.robotparser as robotparser


try:
    from base import logger_setup
    from settings import CRAWL_DELAY
    import sitemap
except ImportError:
    from crawler.base import logger_setup
    from crawler.settings import CRAWL_DELAY
    import crawler.sitemap as sitemap


# Module-level logger, built by the project's logger_setup helper and named
# after this module.
logger = logger_setup(__name__)


class Txt(robotparser.RobotFileParser):
    """
    Extension of robotparser, adds sitemap functionality, mainly a copy.

    Additions:
    - sitemaps
    - logging
    """

    def __init__(self, url, base_url):
        """
        Set up the crawler-specific attributes of the parser.

        :param url: NOTE(review): accepted but not referenced by the
            assignments below -- confirm whether it is meant to be handed
            to the parent initialiser.
        :param base_url: stored unchanged on ``self.base_url``.
        """
        self.base_url = base_url
        # Starts out as an empty tuple; presumably replaced with sitemap
        # data once robots.txt has been read -- TODO confirm.
        self.sitemap = ()
        # Initial delay comes from the project settings (CRAWL_DELAY).
        # NOTE(review): RobotFileParser exposes a crawl_delay() method on
        # Python >= 3.6; this instance attribute shadows it -- confirm that
        # this is intended.
        self.crawl_delay = CRAWL_DELAY
示例#2
0
    from settings import *
    import validate
    import webpage
    from webpage import remove_file
except ImportError:
    import crawler.base as base_
    from crawler.filequeue import Empty
    import crawler.model as model
    import crawler.robot as robot
    from crawler.settings import *
    import crawler.validate as validate
    import crawler.webpage as webpage
    from crawler.webpage import remove_file

# Configure the root logger: write to the file named by LOG_FILENAME and let
# records of level DEBUG and above through.
logging.basicConfig(level=logging.DEBUG, filename=LOG_FILENAME)

# Logger for this module, created through the project's logger_setup helper.
logger = base_.logger_setup(__name__)

# Write a marker with the current time and date so that separate runs can be
# told apart in the log.
logger.debug(
    "NEW_CRAWL_RUN | "
    + dt.now().strftime('%H:%M | %d-%m-%Y |')
)


ENCODINGS = ['utf_8', 'latin_1', 'utf_16', 'utf_16_be', 'utf_16_le', 'utf_32',
             'utf_32_be', 'utf_32_le', 'utf_7', 'base64_codec', 'big5',
             'big5hkscs', 'bz2_codec', 'cp037', 'cp1026', 'cp1125', 'cp1140',
             'cp1250', 'cp1251', 'cp1252', 'cp1253', 'cp1254', 'cp1255',
             'cp1256', 'cp1257', 'cp1258', 'cp273', 'cp424', 'cp437',
             'cp500',  'cp775', 'cp850', 'cp852', 'cp855', 'cp857', 'cp858',
             'cp860', 'cp861', 'cp862', 'cp863', 'cp864', 'cp865', 'cp866',
             'cp869', 'cp932', 'cp949', 'cp950', 'euc_jis_2004', 'euc_jisx0213',
             'euc_jp', 'euc_kr', 'gb18030', 'gb2312', 'gbk', 'hex_codec',
             'hp_roman8', 'hz', 'iso2022_jp', 'iso2022_jp_1', 'iso2022_jp_2',
             'iso2022_jp_2004', 'iso2022_jp_3', 'iso2022_jp_ext', 'iso2022_kr',
             'iso8859_10', 'iso8859_11', 'iso8859_13', 'iso8859_14',