def __init__(self, manager, source_file):
    self.manager = manager
    self.source_file = source_file
    with io.open(source_file, encoding="utf-8") as fh:
        self.config_yaml = fh.read()
    self.config = yaml.safe_load(self.config_yaml)
    self.name = os.path.basename(source_file)
    # YAML keys with undefined values are parsed as `None`.
    # E.g. with the YAML definition `name: `, `config.get("name", "default_value")`
    # will evaluate to `None` instead of `"default_value"`.
    # So, to avoid setting `self.name` to `None`, we use `or` to
    # apply the default instead of passing it to `config.get()`.
    self.name = self.config.get("name") or self.name
    self.validate_name()
    self.description = self.config.get("description") or self.name
    self.category = self.config.get("category") or "scrape"
    self.init_stage = self.config.get("init") or "init"
    self.delay = int(self.config.get("delay") or 0)
    self.expire = int(self.config.get("expire") or settings.EXPIRE) * 84600
    self.stealthy = self.config.get("stealthy") or False
    self.queue = Dataset(conn, self.name)
    self.aggregator_config = self.config.get("aggregator") or {}
    self.stages = {}
    for name, stage in self.config.get("pipeline", {}).items():
        self.stages[name] = CrawlerStage(self, name, stage)
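# A minimal, self-contained sketch of the YAML pitfall described in the
# comment above (the config keys here are illustrative only):
import yaml

config = yaml.safe_load("name:\ncategory: scrape\n")
assert config == {"name": None, "category": "scrape"}
assert config.get("name", "fallback") is None              # default is ignored
assert (config.get("name") or "fallback") == "fallback"    # `or` applies it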
def test_active_dataset_status(self):
    job = Job.create(self.conn, self.dataset)
    stage = job.get_stage("ingest")
    stage.queue({"test": "foo"}, {})
    stage.queue({"test": "bar"}, {})
    status = Dataset.get_active_dataset_status(self.conn)
    assert len(status["datasets"]) == 1
    assert status["total"] == 1
    assert status["datasets"]["test_1"]["pending"] == 2
    job.dataset.cancel()
    status = Dataset.get_active_dataset_status(self.conn)
    assert status["datasets"] == {}
    assert status["total"] == 0
def test_active_dataset_status(self):
    job = Job.create(self.conn, self.dataset)
    stage = job.get_stage(Stage.INGEST)
    stage.queue({'test': 'foo'}, {})
    stage.queue({'test': 'bar'}, {})
    status = Dataset.get_active_dataset_status(self.conn)
    assert len(status['datasets']) == 1
    assert status['total'] == 1
    assert status['datasets']['test_1']['pending'] == 2
    job.dataset.cancel()
    status = Dataset.get_active_dataset_status(self.conn)
    assert status['datasets'] == {}
    assert status['total'] == 0
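# Shape of the status payload implied by the assertions above (a sketch
# inferred from these tests, not a documented schema): while jobs are
# pending, `Dataset.get_active_dataset_status(conn)` appears to return
# something like
#
#   {
#       "total": 1,
#       "datasets": {
#           "test_1": {"pending": 2, ...},
#       },
#   }
#
# and after `job.dataset.cancel()` the dataset drops out of "datasets" and
# "total" falls back to 0.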
def __init__(self, manager, source_file):
    self.manager = manager
    self.source_file = source_file
    with io.open(source_file, encoding="utf-8") as fh:
        self.config_yaml = fh.read()
    self.config = yaml.safe_load(self.config_yaml)
    self.name = os.path.basename(source_file)
    self.name = self.config.get("name", self.name)
    self.validate_name()
    self.description = self.config.get("description", self.name)
    self.category = self.config.get("category", "scrape")
    self.init_stage = self.config.get("init", "init")
    self.delay = int(self.config.get("delay", 0))
    self.expire = int(self.config.get("expire", settings.EXPIRE)) * 84600
    self.stealthy = self.config.get("stealthy", False)
    self.queue = Dataset(conn, self.name)
    self.aggregator_config = self.config.get("aggregator", {})
    self.stages = {}
    for name, stage in self.config.get("pipeline", {}).items():
        self.stages[name] = CrawlerStage(self, name, stage)
def __init__(self, manager, source_file):
    self.manager = manager
    self.source_file = source_file
    with io.open(source_file, encoding='utf-8') as fh:
        self.config_yaml = fh.read()
    self.config = yaml.safe_load(self.config_yaml)
    self.name = os.path.basename(source_file)
    self.name = self.config.get('name', self.name)
    self.description = self.config.get('description', self.name)
    self.category = self.config.get('category', 'scrape')
    self.schedule = self.config.get('schedule', 'disabled')
    self.init_stage = self.config.get('init', 'init')
    self.delta = Crawler.SCHEDULES.get(self.schedule)
    self.delay = int(self.config.get('delay', 0))
    self.expire = int(self.config.get('expire', settings.EXPIRE)) * 84600
    self.stealthy = self.config.get('stealthy', False)
    self.queue = Dataset(conn, self.name)
    self.aggregator_config = self.config.get('aggregator', {})
    self.stages = {}
    for name, stage in self.config.get('pipeline', {}).items():
        self.stages[name] = CrawlerStage(self, name, stage)
class Crawler(object):
    """A processing graph that constitutes a crawler."""

    SCHEDULES = {
        "disabled": None,
        "hourly": timedelta(hours=1),
        "daily": timedelta(days=1),
        "weekly": timedelta(weeks=1),
        "monthly": timedelta(weeks=4),
    }

    def __init__(self, manager, source_file):
        self.manager = manager
        self.source_file = source_file
        with io.open(source_file, encoding="utf-8") as fh:
            self.config_yaml = fh.read()
        self.config = yaml.safe_load(self.config_yaml)
        self.name = os.path.basename(source_file)
        # YAML keys with undefined values are parsed as `None`.
        # E.g. with the YAML definition `name: `, `config.get("name", "default_value")`
        # will evaluate to `None` instead of `"default_value"`.
        # So, to avoid setting `self.name` to `None`, we use `or` to
        # apply the default instead of passing it to `config.get()`.
        self.name = self.config.get("name") or self.name
        self.validate_name()
        self.description = self.config.get("description") or self.name
        self.category = self.config.get("category") or "scrape"
        self.init_stage = self.config.get("init") or "init"
        self.delay = int(self.config.get("delay") or 0)
        self.expire = int(self.config.get("expire") or settings.EXPIRE) * 84600
        self.stealthy = self.config.get("stealthy") or False
        self.queue = Dataset(conn, self.name)
        self.aggregator_config = self.config.get("aggregator") or {}
        self.stages = {}
        for name, stage in self.config.get("pipeline", {}).items():
            self.stages[name] = CrawlerStage(self, name, stage)

    def validate_name(self):
        if not re.match(r"^[A-Za-z0-9_-]+$", self.name):
            raise ValueError("Invalid crawler name: %s. "
                             "Allowed characters: A-Za-z0-9_-" % self.name)

    @property
    def aggregator_method(self):
        if self.aggregator_config:
            method = self.aggregator_config.get("method")
            if not method:
                return
            # method A: via a named Python entry point
            func = get_entry_point("memorious.operations", method)
            if func is not None:
                return func
            # method B: direct import from a module
            if ":" in method:
                package, method = method.rsplit(":", 1)
                module = import_module(package)
                return getattr(module, method)
            raise ValueError("Unknown method: %s" % method)

    def aggregate(self, context):
        if self.aggregator_method:
            log.info("Running aggregator for %s" % self.name)
            params = self.aggregator_config.get("params", {})
            self.aggregator_method(context, params)

    def flush(self):
        """Delete all run-time data generated by this crawler."""
        self.queue.cancel()
        Crawl.flush(self)
        self.flush_tags()

    def flush_tags(self):
        tags.delete(prefix=make_key(self, "tag"))

    def cancel(self):
        Crawl.abort_all(self)
        self.queue.cancel()

    def run(self, incremental=None, run_id=None):
        """Queue the execution of a particular crawler."""
        state = {
            "crawler": self.name,
            "run_id": run_id or Job.random_id(),
            "incremental": settings.INCREMENTAL,
            "continue_on_error": settings.CONTINUE_ON_ERROR,
        }
        if incremental is not None:
            state["incremental"] = incremental
        # Cancel previous runs:
        self.cancel()
        init_stage = self.get(self.init_stage)
        Queue.queue(init_stage, state, {})

    @property
    def is_running(self):
        """Is the crawler currently running?"""
        for job in self.queue.get_jobs():
            if not job.is_done():
                return True
        return False

    @property
    def last_run(self):
        return Crawl.last_run(self)

    @property
    def op_count(self):
        """Total operations performed for this crawler"""
        return Crawl.op_count(self)

    @property
    def runs(self):
        return Crawl.runs(self)

    @property
    def latest_runid(self):
        return Crawl.latest_runid(self)

    @property
    def pending(self):
        status = self.queue.get_status()
        return status.get("pending")

    def get(self, name):
        return self.stages.get(name)

    def __str__(self):
        return self.name

    def __iter__(self):
        return iter(self.stages.values())

    def __repr__(self):
        return "<Crawler(%s)>" % self.name
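# A sketch of the two `aggregator` configuration forms that the
# `aggregator_method` property above resolves; the method names and
# parameters below are illustrative assumptions, not real operations:
#
#   aggregator:
#     method: cleanup                 # form A: an entry point registered
#     params:                         #         under "memorious.operations"
#       threshold: 10
#
#   aggregator:
#     method: mypackage.ops:cleanup   # form B: "package.module:function",
#                                     #         resolved via import_module()
#
# Either way, `aggregate(context)` invokes the resolved callable as
# `method(context, params)`.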
class Crawler(object):
    """A processing graph that constitutes a crawler."""

    SCHEDULES = {
        'disabled': None,
        'hourly': timedelta(hours=1),
        'daily': timedelta(days=1),
        'weekly': timedelta(weeks=1),
        'monthly': timedelta(weeks=4)
    }

    def __init__(self, manager, source_file):
        self.manager = manager
        self.source_file = source_file
        with io.open(source_file, encoding='utf-8') as fh:
            self.config_yaml = fh.read()
        self.config = yaml.safe_load(self.config_yaml)
        self.name = os.path.basename(source_file)
        self.name = self.config.get('name', self.name)
        self.description = self.config.get('description', self.name)
        self.category = self.config.get('category', 'scrape')
        self.schedule = self.config.get('schedule', 'disabled')
        self.init_stage = self.config.get('init', 'init')
        self.delta = Crawler.SCHEDULES.get(self.schedule)
        self.delay = int(self.config.get('delay', 0))
        self.expire = int(self.config.get('expire', settings.EXPIRE)) * 84600
        self.stealthy = self.config.get('stealthy', False)
        self.queue = Dataset(conn, self.name)
        self.aggregator_config = self.config.get('aggregator', {})
        self.stages = {}
        for name, stage in self.config.get('pipeline', {}).items():
            self.stages[name] = CrawlerStage(self, name, stage)

    def check_due(self):
        """Check if the last execution of this crawler is older than
        the scheduled interval."""
        if self.is_running:
            return False
        if self.delta is None:
            return False
        last_run = self.last_run
        if last_run is None:
            return True
        now = datetime.utcnow()
        if now > last_run + self.delta:
            return True
        return False

    @property
    def aggregator_method(self):
        if self.aggregator_config:
            method = self.aggregator_config.get("method")
            if not method:
                return
            if ':' in method:
                package, method = method.rsplit(':', 1)
                module = import_module(package)
                return getattr(module, method)

    def aggregate(self, context):
        if self.aggregator_method:
            log.info("Running aggregator for %s" % self.name)
            params = self.aggregator_config.get("params", {})
            self.aggregator_method(context, params)

    def flush(self):
        """Delete all run-time data generated by this crawler."""
        self.queue.cancel()
        Event.delete(self)
        Crawl.flush(self)

    def flush_events(self):
        Event.delete(self)

    def cancel(self):
        Crawl.abort_all(self)
        self.queue.cancel()

    @property
    def should_timeout(self):
        if self.last_run is None:
            return False
        now = datetime.utcnow()
        return self.last_run < now - timedelta(seconds=settings.CRAWLER_TIMEOUT)  # noqa

    def timeout(self):
        log.warning("Crawler timed out: %s. Aggregator won't be run", self.name)  # noqa
        self.cancel()

    def run(self, incremental=None, run_id=None):
        """Queue the execution of a particular crawler."""
        state = {
            'crawler': self.name,
            'run_id': run_id or Job.random_id(),
            'incremental': settings.INCREMENTAL
        }
        if incremental is not None:
            state['incremental'] = incremental
        # Cancel previous runs:
        self.cancel()
        # Flush out previous events data but keep the counts:
        Event.delete_data(self)
        Queue.queue(self.init_stage, state, {})

    @property
    def is_running(self):
        """Is the crawler currently running?"""
        for job in self.queue.get_jobs():
            if not job.is_done():
                return True
        return False

    @property
    def last_run(self):
        return Crawl.last_run(self)

    @property
    def op_count(self):
        """Total operations performed for this crawler"""
        return Crawl.op_count(self)

    @property
    def runs(self):
        return Crawl.runs(self)

    @property
    def latest_runid(self):
        return Crawl.latest_runid(self)

    @property
    def pending(self):
        status = self.queue.get_status()
        return status.get('pending')

    def flush_tags(self):
        pipe = conn.pipeline()
        count = 0
        for key in conn.scan_iter(make_key(self, 'tag', '*')):
            pipe.delete(key)
            count += 1
        pipe.execute()
        log.info("Deleted %d tags", count)

    def get(self, name):
        return self.stages.get(name)

    def __str__(self):
        return self.name

    def __iter__(self):
        return iter(self.stages.values())

    def __repr__(self):
        return '<Crawler(%s)>' % self.name
def cancel_queue(collection):
    Dataset(kv, collection.foreign_id).cancel()
def get_active_collection_status():
    data = Dataset.get_active_dataset_status(kv)
    return data
def get_status(collection):
    return Dataset(kv, collection.foreign_id).get_status()
def cancel(dataset):
    """Delete scheduled tasks for the given dataset."""
    conn = get_redis()
    Dataset(conn, dataset).cancel()
def cleanup_jobs(self):
    for dataset in Dataset.get_active_datasets(kv):
        for job in dataset.get_jobs():
            self.cleanup_job(job)
def cancel_queue(collection):
    dataset = dataset_from_collection(collection)
    Dataset(kv, dataset).cancel()
def get_status(collection):
    dataset = dataset_from_collection(collection)
    return Dataset(kv, dataset).get_status()
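# `dataset_from_collection` is not defined in these snippets. A minimal
# sketch, assuming it simply maps a collection to the name of its task
# queue dataset (the earlier variants passed `collection.foreign_id`
# straight to `Dataset`); the actual implementation may differ:
def dataset_from_collection(collection):
    if collection is None:
        return None
    return collection.foreign_id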
class Crawler(object):
    """A processing graph that constitutes a crawler."""

    SCHEDULES = {
        "disabled": None,
        "hourly": timedelta(hours=1),
        "daily": timedelta(days=1),
        "weekly": timedelta(weeks=1),
        "monthly": timedelta(weeks=4),
    }

    def __init__(self, manager, source_file):
        self.manager = manager
        self.source_file = source_file
        with io.open(source_file, encoding="utf-8") as fh:
            self.config_yaml = fh.read()
        self.config = yaml.safe_load(self.config_yaml)
        self.name = os.path.basename(source_file)
        self.name = self.config.get("name", self.name)
        self.validate_name()
        self.description = self.config.get("description", self.name)
        self.category = self.config.get("category", "scrape")
        self._schedule = self.config.get("schedule", "disabled")
        self.init_stage = self.config.get("init", "init")
        self.delta = Crawler.SCHEDULES.get(self.schedule)
        self.delay = int(self.config.get("delay", 0))
        self.expire = int(self.config.get("expire", settings.EXPIRE)) * 84600
        self.stealthy = self.config.get("stealthy", False)
        self.queue = Dataset(conn, self.name)
        self.aggregator_config = self.config.get("aggregator", {})
        self.stages = {}
        for name, stage in self.config.get("pipeline", {}).items():
            self.stages[name] = CrawlerStage(self, name, stage)

    def check_due(self):
        """Check if the last execution of this crawler is older than
        the scheduled interval."""
        if self.is_running:
            return False
        if self.delta is None:
            return False
        last_run = self.last_run
        if last_run is None:
            return True
        now = datetime.utcnow()
        if now > last_run + self.delta:
            return True
        return False

    def validate_name(self):
        if not re.match(r"^[A-Za-z0-9_-]+$", self.name):
            raise ValueError("Invalid crawler name: %s. "
                             "Allowed characters: A-Za-z0-9_-" % self.name)

    @property
    def schedule(self):
        schedule = Crawl.get_schedule(self) or self._schedule
        return schedule if schedule in self.SCHEDULES else "disabled"

    @property
    def aggregator_method(self):
        if self.aggregator_config:
            method = self.aggregator_config.get("method")
            if not method:
                return
            # method A: via a named Python entry point
            func = get_entry_point("memorious.operations", method)
            if func is not None:
                return func
            # method B: direct import from a module
            if ":" in method:
                package, method = method.rsplit(":", 1)
                module = import_module(package)
                return getattr(module, method)
            raise ValueError("Unknown method: %s" % method)

    def aggregate(self, context):
        if self.aggregator_method:
            log.info("Running aggregator for %s" % self.name)
            params = self.aggregator_config.get("params", {})
            self.aggregator_method(context, params)

    def flush(self):
        """Delete all run-time data generated by this crawler."""
        self.queue.cancel()
        Event.delete(self)
        Crawl.flush(self)
        self.flush_tags()

    def flush_tags(self):
        tags.delete(prefix=make_key(self, "tag"))

    def flush_events(self):
        Event.delete(self)

    def cancel(self):
        Crawl.abort_all(self)
        self.queue.cancel()

    @property
    def should_timeout(self):
        if self.last_run is None:
            return False
        now = datetime.utcnow()
        return self.last_run < now - timedelta(seconds=settings.CRAWLER_TIMEOUT)  # noqa

    def timeout(self):
        log.warning("Crawler timed out: %s. Aggregator won't be run", self.name)  # noqa
        self.cancel()

    def run(self, incremental=None, run_id=None):
        """Queue the execution of a particular crawler."""
        state = {
            "crawler": self.name,
            "run_id": run_id or Job.random_id(),
            "incremental": settings.INCREMENTAL,
        }
        if incremental is not None:
            state["incremental"] = incremental
        # Cancel previous runs:
        self.cancel()
        # Flush out previous events data but keep the counts:
        Event.delete_data(self)
        init_stage = self.get(self.init_stage)
        Queue.queue(init_stage, state, {})

    @property
    def is_running(self):
        """Is the crawler currently running?"""
        for job in self.queue.get_jobs():
            if not job.is_done():
                return True
        return False

    @property
    def last_run(self):
        return Crawl.last_run(self)

    @property
    def op_count(self):
        """Total operations performed for this crawler"""
        return Crawl.op_count(self)

    @property
    def runs(self):
        return Crawl.runs(self)

    @property
    def latest_runid(self):
        return Crawl.latest_runid(self)

    @property
    def pending(self):
        status = self.queue.get_status()
        return status.get("pending")

    def get(self, name):
        return self.stages.get(name)

    def __str__(self):
        return self.name

    def __iter__(self):
        return iter(self.stages.values())

    def __repr__(self):
        return "<Crawler(%s)>" % self.name
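# A self-contained sketch of how the scheduling pieces above interact: a
# crawler configured with `schedule: daily` maps to `timedelta(days=1)` via
# SCHEDULES, and `check_due()` reports True once the last run is older than
# that delta and the crawler is idle (the values below are illustrative).
from datetime import datetime, timedelta

SCHEDULES = {
    "disabled": None,
    "hourly": timedelta(hours=1),
    "daily": timedelta(days=1),
    "weekly": timedelta(weeks=1),
    "monthly": timedelta(weeks=4),
}

delta = SCHEDULES["daily"]
last_run = datetime.utcnow() - timedelta(days=2)
assert datetime.utcnow() > last_run + delta  # the crawler is due to run again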