def dispatch(cls, collection_id, meta):
    """Ingest the archived file described by ``meta``.

    The file is loaded from the archive, ``cls.auction_file`` selects the
    class that will handle it, and that class is instantiated with
    ``collection_id`` and asked to ``ingest`` the file. On success an OK
    crawler state is stored and the database session is committed.

    Any ``Exception`` raised inside the ``try`` body (including by the
    state update or the commit) is passed to ``cls.handle_exception``
    instead of propagating. The archive's ``cleanup_file`` runs in either
    case.
    """
    file_path = get_archive().load_file(meta)
    try:
        ingestor_cls = cls.auction_file(meta, file_path)
        log.debug("Dispatching %r to %r", meta.file_name, ingestor_cls)
        ingestor = ingestor_cls(collection_id)
        ingestor.ingest(meta, file_path)
        # Record success only after ingest returned without raising.
        CrawlerState.store_ok(meta, collection_id)
        db.session.commit()
    except Exception as error:
        cls.handle_exception(meta, collection_id, error)
    finally:
        get_archive().cleanup_file(meta)
def dispatch(cls, source_id, meta):
    """Ingest the archived file described by ``meta`` for a source.

    Loads the file from the archive, lets ``cls.auction_file`` choose the
    handling class, instantiates it with ``source_id`` and calls its
    ``ingest``. After a successful ingest an OK crawler state is stored
    and the session is committed.

    Every ``Exception`` raised in the ``try`` body is forwarded to
    ``cls.handle_exception``; the archive's ``cleanup_file`` is always
    called afterwards.
    """
    file_path = get_archive().load_file(meta)
    try:
        chosen_cls = cls.auction_file(meta, file_path)
        log.debug("Dispatching %r to %r", meta.file_name, chosen_cls)
        handler = chosen_cls(source_id)
        handler.ingest(meta, file_path)
        # Success bookkeeping stays inside the try so its failures are
        # reported through handle_exception as well.
        CrawlerState.store_ok(meta, source_id)
        db.session.commit()
    except Exception as error:
        cls.handle_exception(meta, source_id, error)
    finally:
        get_archive().cleanup_file(meta)
def test_crawler_execute(self):
    """Run the test document crawler once and check the recorded state."""
    crawler = TDocumentCrawler()

    # No crawler states may exist before the crawler runs.
    initial_count = CrawlerState.all().count()
    assert initial_count == 0, initial_count

    crawler.execute()

    states = CrawlerState.all().all()
    state_count = len(states)
    assert state_count == 2, state_count

    # The second state carries the metadata of the crawled document.
    demo = states[1]
    assert 'kitty' in demo.meta['title'], demo.meta
    assert 'demo.pdf' in demo.meta['source_path'], demo.meta

    collection = Collection.by_foreign_id('test')
    assert collection is not None, collection
    documents = list(collection.documents)
    assert len(documents) == 1, documents
def handle_exception(cls, meta, source_id, exception):
    """Roll back the session and persist a failure state for ``meta``.

    The message and traceback come from the exception currently being
    handled (``sys.exc_info()``) when there is one; otherwise the message
    is taken from the ``exception`` argument. The stored error type is
    always the class name of ``exception``.
    """
    db.session.rollback()
    db.session.close()

    active_type, active_value, details = sys.exc_info()
    if active_type is None:
        # Not inside an except block: only the passed-in exception is
        # available, and ``details`` keeps the value from exc_info().
        message = unicode(exception)
    else:
        message = unicode(active_value)
        details = traceback.format_exc()

    type_name = exception.__class__.__name__
    log.warning(message)
    CrawlerState.store_fail(
        meta,
        source_id,
        error_type=type_name,
        error_message=message,
        error_details=details,
    )
    db.session.commit()
def handle_exception(cls, meta, collection_id, exception):
    """Roll back the session and persist a failure state for ``meta``.

    ``SQLAlchemyError`` instances are only logged (with traceback) and no
    failure state is written. For every other exception the message and
    traceback come from the exception currently being handled
    (``sys.exc_info()``) when there is one, otherwise from the
    ``exception`` argument. The stored error type is always the class
    name of ``exception``.
    """
    db.session.rollback()
    db.session.close()

    if isinstance(exception, SQLAlchemyError):
        log.exception(exception)
        return

    active_type, active_value, details = sys.exc_info()
    if active_type is None:
        # Not inside an except block: only the passed-in exception is
        # available, and ``details`` keeps the value from exc_info().
        message = unicode(exception)
    else:
        message = unicode(active_value)
        details = traceback.format_exc()

    type_name = exception.__class__.__name__
    log.warning(message)
    CrawlerState.store_fail(
        meta,
        collection_id,
        error_type=type_name,
        error_message=message,
        error_details=details,
    )
    db.session.commit()
def to_dict(self):
    """Return the crawler's stats merged with its descriptive attributes.

    Starts from ``CrawlerState.crawler_stats`` for this crawler's id and
    adds the collection, its id and label (falling back to the id when no
    label is set), the crawler name, schedule and id.
    """
    stats = CrawlerState.crawler_stats(self.get_id())
    description = {
        'collection': self.collection,
        'collection_id': self.COLLECTION_ID,
        'collection_label': self.COLLECTION_LABEL or self.COLLECTION_ID,
        'name': self.CRAWLER_NAME,
        'schedule': self.SCHEDULE,
        'id': self.get_id(),
    }
    stats.update(description)
    return stats
def to_dict(self):
    """Return the crawler's stats merged with its descriptive attributes.

    Starts from ``CrawlerState.crawler_stats`` for this crawler's id and
    adds the source, its id and label (falling back to the id when no
    label is set), the crawler name, schedule and id.
    """
    stats = CrawlerState.crawler_stats(self.get_id())
    description = {
        'source': self.source,
        'source_id': self.SOURCE_ID,
        'source_label': self.SOURCE_LABEL or self.SOURCE_ID,
        'name': self.CRAWLER_NAME,
        'schedule': self.SCHEDULE,
        'id': self.get_id(),
    }
    stats.update(description)
    return stats
def check_due(self, crawler_id):
    """Decide whether the crawler identified by ``crawler_id`` should run.

    Returns ``True`` when no previous run is recorded. Returns ``False``
    when the last run is more recent than ``CrawlerState.TIMEOUT`` ago.
    Otherwise returns whether more than ``self.delta`` has elapsed since
    the last run.
    """
    # should this be utcnow?
    _ignored, previous_run = CrawlerState.crawler_last_run(crawler_id)
    if previous_run is None:
        return True

    current = datetime.now()
    recently_active = previous_run > (current - CrawlerState.TIMEOUT)
    if recently_active:
        log.info("Crawler was active very recently. Skip due.")
        return False

    return current > previous_run + self.delta
def to_dict(self):
    """Return the crawler's stats plus its name, schedule and id.

    The collection and its id are added only when ``COLLECTION_ID`` is
    truthy.
    """
    stats = CrawlerState.crawler_stats(self.get_id())
    basics = {
        "name": self.CRAWLER_NAME,
        "schedule": self.SCHEDULE,
        "id": self.get_id(),
    }
    stats.update(basics)

    if not self.COLLECTION_ID:
        return stats

    collection_info = {
        "collection": self.collection,
        "collection_id": self.COLLECTION_ID,
    }
    stats.update(collection_info)
    return stats
def execute(self, **kwargs):
    """Store a stub crawler state, commit it, then run the parent ``execute``.

    All keyword arguments are forwarded unchanged to the parent class.
    """
    CrawlerState.store_stub(
        self.collection.id,
        self.get_id(),
        self.crawler_run,
    )
    db.session.commit()

    parent = super(DocumentCrawler, self)
    parent.execute(**kwargs)
def execute(self, **kwargs):
    """Store a stub crawler state, commit it, then run the parent ``execute``.

    All keyword arguments are forwarded unchanged to the parent class.
    """
    CrawlerState.store_stub(
        self.source.id,
        self.get_id(),
        self.crawler_run,
    )
    db.session.commit()

    parent = super(DocumentCrawler, self)
    parent.execute(**kwargs)
def test_incremental(self):
    """Run the crawler fully, then incrementally, and count the states."""
    crawler = TDocumentCrawler()
    crawler.execute()
    crawler.execute(incremental=True)

    recorded = CrawlerState.all().all()
    total = len(recorded)
    assert total == 3, total