import datetime
import hashlib
import thread  # Python 2 stdlib module (renamed _thread in Python 3)

from scrapy import signals
from scrapy.exceptions import IgnoreRequest

from hbase import Hbase  # Thrift-generated HBase bindings; import path assumed
# HbaseWrapper is a project helper around the Thrift client; a sketch of it
# appears after HousePipeline below.


class HousePipeline(object):
    cf_basic = 'basic'

    def __init__(self, stats, host, port, table):
        self.host = host
        self.port = port
        self.table = table
        # Crawl date as YYMMDD, plus a reversed day (991231 - YYMMDD) so that
        # newer crawls sort first under HBase's lexicographic row ordering.
        self.created_at = datetime.datetime.now().strftime('%y%m%d')
        self.reversed_day = 991231 - int(self.created_at)
        self.hbase = HbaseWrapper(self.host, self.port, self.table)
        # Pre-built mutation that stamps the crawl date into basic:ctime.
        self.ctime = self.hbase.mutation(self.cf_basic, 'ctime', self.created_at)
        self.stats = stats
        self.invalid_items = []

    def open_spider(self, spider):
        column_families = (Hbase.ColumnDescriptor(name=self.cf_basic, maxVersions=1), )
        self.hbase.create_table_if_not_exists(column_families)

    def close_spider(self, spider):
        self.hbase.close()
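# All of the classes in this excerpt delegate to an HbaseWrapper helper whose
# source is not shown. What follows is a minimal sketch of what it plausibly
# looks like, assuming it is a thin layer over the Thrift-generated HBase
# client; every method is reverse-engineered from the call sites above and
# below, not taken from the real project. Note that newer versions of the
# HBase Thrift IDL add an extra attributes map to mutateRow and
# getRowWithColumns.
from thrift.transport import TSocket, TTransport
from thrift.protocol import TBinaryProtocol


class HbaseWrapper(object):

    def __init__(self, host, port, table):
        self.table = table
        socket = TSocket.TSocket(host, int(port))
        self.transport = TTransport.TBufferedTransport(socket)
        self.client = Hbase.Client(TBinaryProtocol.TBinaryProtocol(self.transport))
        self.transport.open()

    def create_table_if_not_exists(self, column_families):
        if self.table not in self.client.getTableNames():
            self.client.createTable(self.table, list(column_families))

    def mutation(self, column_family, qualifier, value=''):
        # Thrift mutations address cells as 'family:qualifier'.
        return Hbase.Mutation(column='%s:%s' % (column_family, qualifier), value=value)

    def column(self, column_family, qualifier):
        return '%s:%s' % (column_family, qualifier)

    def put(self, row_key, mutations):
        self.client.mutateRow(self.table, row_key, list(mutations))

    def get(self, row_key, columns):
        # Returns a (possibly empty) list of TRowResult objects.
        return self.client.getRowWithColumns(self.table, row_key, list(columns))

    def close(self):
        self.transport.close()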
# from_crawler of the proxy-rotation middleware (the rest of the class is not
# part of this excerpt). State is stored on the class, so every instance
# shares one HBase connection, one proxy list and one lock. A sketch of the
# _get_proxies helper it calls follows below.
@classmethod
def from_crawler(cls, crawler):
    # cls.http_proxies = crawler.settings.get('HTTP_PROXIES', False)
    # if not cls.http_proxies:
    #     raise NotConfigured
    host = crawler.settings.get('HBASE_HOST')
    port = crawler.settings.get('HBASE_PORT')
    table = crawler.settings.get('PROXY_TABLE')
    # cls.stats = crawler.stats
    cls.hbase = HbaseWrapper(host, port, table)
    cls.mutex = thread.allocate_lock()  # guards the shared proxy state
    cls.timeout = crawler.settings.get('PROXIES_TIMEOUT')
    # Scan spec for the proxy table: single column cf:0, server-side caching,
    # fetched in batches of 20.
    cls.tscan = Hbase.TScan(columns=['cf:0'], caching=True, batchSize=20)
    cls._get_proxies()
    s = cls()
    crawler.signals.connect(s.spider_closed, signal=signals.spider_closed)
    return s
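# _get_proxies is not shown in this excerpt. One plausible implementation,
# assuming the HbaseWrapper sketched above exposes the raw Thrift client as
# .client and that proxies live one per row under cf:0 (both assumptions).
# scannerOpenWithScan / scannerGetList / scannerClose are standard HBase
# Thrift calls, though newer IDL versions add an attributes argument.
@classmethod
def _get_proxies(cls):
    cls.proxies = []
    scanner_id = cls.hbase.client.scannerOpenWithScan(cls.hbase.table, cls.tscan)
    try:
        while True:
            rows = cls.hbase.client.scannerGetList(scanner_id, 20)
            if not rows:
                break
            for row in rows:
                # Each TRowResult maps 'family:qualifier' to a TCell.
                cls.proxies.append(row.columns['cf:0'].value)
    finally:
        cls.hbase.client.scannerClose(scanner_id)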
class UrlRecordMiddleware(object):
    """Spider middleware that records successfully crawled URLs in HBase."""

    column_family = 'cf'
    qualifier = '0'

    @classmethod
    def from_crawler(cls, crawler):
        s = cls(crawler.settings)
        crawler.signals.connect(s.spider_closed, signal=signals.spider_closed)
        return s

    def __init__(self, settings):
        self.host = settings.get('HBASE_HOST')
        self.port = settings.get('HBASE_PORT')
        self.table = settings.get('HISTORY_TABLE')
        self.hbase = HbaseWrapper(self.host, self.port, self.table)
        # History rows keep a single version and expire after one day (TTL 86400 s).
        column_families = (Hbase.ColumnDescriptor(name=self.column_family,
                                                  maxVersions=1,
                                                  timeToLive=86400), )
        self.hbase.create_table_if_not_exists(column_families)

    def spider_closed(self):
        self.hbase.close()

    def _record(self, row_key):
        # An empty cf:0 cell suffices; only the row key (the URL hash) matters.
        mutations = (self.hbase.mutation(self.column_family, self.qualifier), )
        self.hbase.put(row_key, mutations)

    def process_spider_input(self, response, spider):
        if response.meta.get('check_crawled', False) and 200 <= response.status < 300:
            spider.log('Recording <%s> into request history.' % response.url)
            # Row key is md5(url [+ optional meta 'suffix']), matching the
            # lookup performed by IgnoreRequestMiddleware below.
            m = hashlib.md5(response.url)
            if 'suffix' in response.meta:
                m.update(response.meta['suffix'])
            self._record(m.hexdigest())
        return
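# How a spider opts into the history machinery: requests flagged with
# check_crawled are hashed (URL plus the optional 'suffix' discriminator) and
# recorded once a 2xx response comes back. The spider name and URL below are
# illustrative only, not taken from the project.
import scrapy


class ExampleSpider(scrapy.Spider):
    name = 'example'

    def start_requests(self):
        yield scrapy.Request(
            'http://example.com/listing/1',
            meta={'check_crawled': True, 'suffix': 'page-1'},  # 'suffix' is optional
            callback=self.parse)

    def parse(self, response):
        pass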
class IgnoreRequestMiddleware(object):
    """Downloader middleware that drops requests whose URL hash is already in
    the history table written by UrlRecordMiddleware."""

    column_family = 'cf'
    qualifier = '0'

    def __init__(self, settings):
        self.host = settings.get('HBASE_HOST')
        self.port = settings.get('HBASE_PORT')
        self.table = settings.get('HISTORY_TABLE')
        self.hbase = HbaseWrapper(self.host, self.port, self.table)
        # Same schema as UrlRecordMiddleware: one version, one-day TTL, so
        # history entries expire on their own.
        column_families = (Hbase.ColumnDescriptor(name=self.column_family,
                                                  maxVersions=1,
                                                  timeToLive=86400), )
        self.hbase.create_table_if_not_exists(column_families)

    def _is_crawled(self, row_key):
        columns = (self.hbase.column(self.column_family, self.qualifier), )
        rows = self.hbase.get(row_key, columns)
        return len(rows) > 0

    @classmethod
    def from_crawler(cls, crawler):
        s = cls(crawler.settings)
        crawler.signals.connect(s.spider_closed, signal=signals.spider_closed)
        return s

    def spider_closed(self):
        self.hbase.close()

    def process_request(self, request, spider):
        if request.meta.get('check_crawled', False):
            spider.log('Checking history for <%s>.' % request.url)
            m = hashlib.md5(request.url)
            if 'suffix' in request.meta:
                m.update(request.meta['suffix'])
            if self._is_crawled(m.hexdigest()):
                spider.log('Request crawled: <%s>, ignore it.' % request.url)
                raise IgnoreRequest('request crawled')
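# The settings these components read, and how they might be wired into a
# Scrapy project. The module paths and middleware orders are placeholder
# choices, not taken from the source; 9090 is the default HBase Thrift port.
# Note that HousePipeline, as excerpted, would also need a from_crawler (not
# shown above) to receive its stats/host/port/table arguments.
HBASE_HOST = 'localhost'
HBASE_PORT = 9090
HISTORY_TABLE = 'crawl_history'
PROXY_TABLE = 'proxies'
PROXIES_TIMEOUT = 30

SPIDER_MIDDLEWARES = {
    'myproject.middlewares.UrlRecordMiddleware': 800,
}
DOWNLOADER_MIDDLEWARES = {
    'myproject.middlewares.IgnoreRequestMiddleware': 500,
}
ITEM_PIPELINES = {
    'myproject.pipelines.HousePipeline': 300,
}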