Example #1
    # Fragment of UrlRecordMiddleware.__init__ (shown in full in Example #5).
    def __init__(self, settings):
        self.host = settings.get('HBASE_HOST')
        self.port = settings.get('HBASE_PORT')
        self.table = settings.get('HISTORY_TABLE')

        self.hbase = HbaseWrapper(self.host, self.port, self.table)
        # Keep one version per cell and expire rows after a day (86400 s).
        column_families = (Hbase.ColumnDescriptor(name=self.column_family,
                                                  maxVersions=1,
                                                  timeToLive=86400), )
        self.hbase.create_table_if_not_exists(column_families)
Example #2
    # Fragment of HousePipeline.__init__ (shown in full in Example #3).
    def __init__(self, stats, host, port, table):
        self.host = host
        self.port = port
        self.table = table
        self.created_at = datetime.datetime.now().strftime('%y%m%d')
        self.reversed_day = 991231 - int(self.created_at)
        self.hbase = HbaseWrapper(self.host, self.port, self.table)
        self.ctime = self.hbase.mutation(self.cf_basic, 'ctime',
                                         self.created_at)
        self.stats = stats
        self.invalid_items = []
Example #3
import datetime

# HbaseWrapper and Hbase come from the project's own HBase wrapper module.
class HousePipeline(object):
    cf_basic = 'basic'

    def __init__(self, stats, host, port, table):
        self.host = host
        self.port = port
        self.table = table
        self.created_at = datetime.datetime.now().strftime('%y%m%d')
        # Subtracting from 991231 reverses the YYMMDD date so that newer
        # days produce smaller row keys, which sort first in HBase.
        self.reversed_day = 991231 - int(self.created_at)
        self.hbase = HbaseWrapper(self.host, self.port, self.table)
        self.ctime = self.hbase.mutation(self.cf_basic, 'ctime',
                                         self.created_at)
        self.stats = stats
        self.invalid_items = []

    def open_spider(self, spider):
        # Create the table with a single 'basic' column family when the
        # spider starts.
        column_families = (Hbase.ColumnDescriptor(name=self.cf_basic,
                                                  maxVersions=1), )
        self.hbase.create_table_if_not_exists(column_families)

    def close_spider(self, spider):
        self.hbase.close()
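
HousePipeline is built with explicit arguments; in a Scrapy project the usual entry point is a from_crawler factory. A minimal sketch, assuming the HBASE_HOST/HBASE_PORT setting names used elsewhere in these examples and a hypothetical HOUSE_TABLE key:

    @classmethod
    def from_crawler(cls, crawler):
        # HOUSE_TABLE is a hypothetical settings key; the host and port keys
        # match the names used in the other examples.
        return cls(crawler.stats,
                   crawler.settings.get('HBASE_HOST'),
                   crawler.settings.get('HBASE_PORT'),
                   crawler.settings.get('HOUSE_TABLE'))

The pipeline itself would then be enabled through ITEM_PIPELINES in settings.py.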
Example #4
    # Fragment of a proxy middleware factory; 'cls' as the first parameter
    # implies this is a @classmethod.
    @classmethod
    def from_crawler(cls, crawler):
        # cls.http_proxies = crawler.settings.get('HTTP_PROXIES', False)
        # if not cls.http_proxies:
        #     raise NotConfigured
        host = crawler.settings.get('HBASE_HOST')
        port = crawler.settings.get('HBASE_PORT')
        table = crawler.settings.get('PROXY_TABLE')
        # cls.stats = crawler.stats
        cls.hbase = HbaseWrapper(host, port, table)
        # 'thread' is the Python 2 module name; Python 3 renamed it '_thread'.
        cls.mutex = thread.allocate_lock()
        cls.timeout = crawler.settings.get('PROXIES_TIMEOUT')
        cls.tscan = Hbase.TScan(columns=['cf:0'], caching=True, batchSize=20)
        cls._get_proxies()
        s = cls()
        crawler.signals.connect(s.spider_closed, signal=signals.spider_closed)
        return s
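
The settings this factory reads would live in the project's settings.py; a sketch with placeholder values:

# settings.py (values are placeholders)
HBASE_HOST = 'localhost'
HBASE_PORT = 9090
PROXY_TABLE = 'proxies'
PROXIES_TIMEOUT = 30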
Example #5
import hashlib

from scrapy import signals


# HbaseWrapper and Hbase come from the project's own HBase wrapper module.
class UrlRecordMiddleware(object):
    column_family = 'cf'
    qualifier = '0'

    @classmethod
    def from_crawler(cls, crawler):
        s = cls(crawler.settings)
        crawler.signals.connect(s.spider_closed, signal=signals.spider_closed)
        return s

    def __init__(self, settings):
        self.host = settings.get('HBASE_HOST')
        self.port = settings.get('HBASE_PORT')
        self.table = settings.get('HISTORY_TABLE')

        self.hbase = HbaseWrapper(self.host, self.port, self.table)
        column_families = (Hbase.ColumnDescriptor(name=self.column_family,
                                                  maxVersions=1,
                                                  timeToLive=86400), )
        self.hbase.create_table_if_not_exists(column_families)

    def spider_closed(self):
        self.hbase.close()

    def _record(self, row_key):
        # Only the row's existence matters here; the cell value is unused.
        mutations = (self.hbase.mutation(self.column_family, self.qualifier), )
        self.hbase.put(row_key, mutations)

    def process_spider_input(self, response, spider):
        if response.meta.get('check_crawled',
                             False) and 200 <= response.status < 300:
            spider.log('Recording <%s> into request history.' % response.url)
            # Python 2 str; Python 3 would require response.url.encode().
            m = hashlib.md5(response.url)
            if 'suffix' in response.meta:
                m.update(response.meta['suffix'])
            self._record(m.hexdigest())
            return
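
UrlRecordMiddleware is a spider middleware (it implements process_spider_input), so it is registered under SPIDER_MIDDLEWARES. A sketch, where the module path and priority are assumptions:

# settings.py (module path and priority are assumptions)
SPIDER_MIDDLEWARES = {
    'myproject.middlewares.UrlRecordMiddleware': 543,
}
HBASE_HOST = 'localhost'
HBASE_PORT = 9090
HISTORY_TABLE = 'request_history'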
Example #6
import hashlib

from scrapy import signals
from scrapy.exceptions import IgnoreRequest


# HbaseWrapper and Hbase come from the project's own HBase wrapper module.
class IgnoreRequestMiddleware(object):
    column_family = 'cf'
    qualifier = '0'

    def __init__(self, settings):
        self.host = settings.get('HBASE_HOST')
        self.port = settings.get('HBASE_PORT')
        self.table = settings.get('HISTORY_TABLE')

        self.hbase = HbaseWrapper(self.host, self.port, self.table)
        column_families = (Hbase.ColumnDescriptor(name=self.column_family,
                                                  maxVersions=1,
                                                  timeToLive=86400), )
        self.hbase.create_table_if_not_exists(column_families)

    def _is_crawled(self, row_key):
        columns = (self.hbase.column(self.column_family, self.qualifier), )
        rows = self.hbase.get(row_key, columns)
        return len(rows) > 0

    @classmethod
    def from_crawler(cls, crawler):
        s = cls(crawler.settings)
        crawler.signals.connect(s.spider_closed, signal=signals.spider_closed)
        return s

    def spider_closed(self):
        self.hbase.close()

    def process_request(self, request, spider):
        if request.meta.get('check_crawled', False):
            spider.log('Checking history for <%s>.' % request.url)
            m = hashlib.md5(request.url)
            if 'suffix' in request.meta:
                m.update(request.meta['suffix'])
            if self._is_crawled(m.hexdigest()):
                spider.log('Request crawled: <%s>, ignore it.' % request.url)
                raise IgnoreRequest('request crawled')
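
The two history middlewares cooperate through the 'check_crawled' meta key: IgnoreRequestMiddleware (a downloader middleware, since it implements process_request) drops requests whose MD5 row key already exists, while UrlRecordMiddleware records each successful response. A sketch of a spider opting a request in; the module path, URL, and suffix value are placeholders:

# settings.py (module path and priority are assumptions)
DOWNLOADER_MIDDLEWARES = {
    'myproject.middlewares.IgnoreRequestMiddleware': 543,
}

# In the spider module:
import scrapy

class HouseSpider(scrapy.Spider):
    name = 'house'

    def start_requests(self):
        # 'suffix' is mixed into the MD5 row key by both middlewares.
        yield scrapy.Request('http://example.com/listing/1',
                             meta={'check_crawled': True, 'suffix': 'page-1'},
                             callback=self.parse)

    def parse(self, response):
        pass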