def ingest_directory(collection_id, meta, local_path, base_path=None,
                     move=False):
    """Ingest a path, walking into it recursively when it is a directory.

    A path that is not a directory is handed to ``ingest_file`` using a
    child of ``meta`` whose ``source_path`` is set to ``base_path``. For a
    directory, every ingestor class exposing a ``bundle`` attribute is
    first asked which entries it claims; the remaining entries (minus
    those in ``SKIP_ENTRIES``) are ingested by calling this function again.

    Args:
        collection_id: passed to ingestor constructors and ``ingest_file``.
        meta: metadata object; must offer ``make_child()`` and
            ``crawler_run``.
        local_path: file or directory on the local file system.
        base_path: path recorded as the source of the entry; falls back
            to ``local_path`` when empty or not given.
        move: forwarded unchanged to ``ingest_file``.

    Returns:
        Whatever ``ingest_file`` returns when ``local_path`` is not a
        directory, otherwise ``None`` (also ``None`` for a missing path).
    """
    # This is somewhat hacky, see issue #55 for the rationale.
    if not os.path.exists(local_path):
        log.error("Invalid path: %r", local_path)
        return

    if not base_path:
        base_path = local_path

    if not os.path.isdir(local_path):
        file_meta = meta.make_child()
        file_meta.source_path = base_path
        return ingest_file(collection_id, file_meta, local_path, move=move)

    # Let bundle-aware ingestors claim entries so they are not ingested
    # a second time by the recursion below.
    taken = []
    for ingestor_cls in get_ingestors():
        if hasattr(ingestor_cls, 'bundle'):
            instance = ingestor_cls(collection_id)
            taken.extend(instance.bundle(meta, local_path))

    # Descend into whatever was neither skipped nor claimed.
    for name in os.listdir(local_path):
        child_path = os.path.join(local_path, string_value(name))
        child_base = os.path.join(base_path, string_value(name))
        ignored = name in SKIP_ENTRIES or name in taken
        if ignored:
            log.debug("Ignore: %r", child_base)
            continue

        log.info("Handle [%s]: %s", meta.crawler_run, child_base)
        # No file/directory distinction is needed here: the recursive
        # call sorts that out in its opening checks.
        ingest_directory(collection_id, meta, child_path,
                         base_path=child_base, move=move)
def auction_file(cls, meta, local_path):
    """Return the ingestor class with the highest ``match`` score.

    Every value of the mapping returned by ``get_ingestors()`` is asked to
    score the file via ``match(meta, local_path)``. Only scores strictly
    greater than the best seen so far (starting at 0) win, so on a tie the
    earlier candidate is kept.

    Returns:
        The winning ingestor class, or ``None`` when no candidate scored
        above zero.
    """
    winner = None
    top_score = 0
    for candidate in get_ingestors().values():
        candidate_score = candidate.match(meta, local_path)
        if candidate_score > top_score:
            top_score, winner = candidate_score, candidate
    return winner
def auction_file(cls, meta, local_path):
    """Return the ingestor class with the highest ``match`` score.

    Each item yielded by ``get_ingestors()`` scores the file through
    ``match(meta, local_path)``. A candidate only takes the lead with a
    score strictly greater than the current best (initially 0), so ties
    go to the earlier candidate.

    Returns:
        The winning ingestor class.

    Raises:
        IngestorException: if no candidate scored above zero.
    """
    winner = None
    top_score = 0
    for candidate in get_ingestors():
        candidate_score = candidate.match(meta, local_path)
        if candidate_score > top_score:
            top_score, winner = candidate_score, candidate

    if winner is not None:
        return winner

    raise IngestorException("No ingestor found: %r (%s, %s)" %
                            (meta.file_name, meta.extension, meta.mime_type))
def auction_file(cls, meta, local_path):
    """Choose the best-matching ingestor class for a file.

    Iterates over ``get_ingestors()`` and keeps the candidate whose
    ``match(meta, local_path)`` result is strictly greater than every
    score before it; the bar starts at 0, and the first of several equal
    top scores is the one retained.

    Returns:
        The ingestor class with the highest score.

    Raises:
        IngestorException: if nothing scored above zero.
    """
    chosen, chosen_score = None, 0
    for ingestor in get_ingestors():
        rating = ingestor.match(meta, local_path)
        if rating > chosen_score:
            chosen = ingestor
            chosen_score = rating

    if chosen is None:
        details = (meta.file_name, meta.extension, meta.mime_type)
        message = "No ingestor found: %r (%s, %s)" % details
        raise IngestorException(message)
    return chosen
def dispatch(cls, source_id, meta):
    """Load a file from the archive and run the best-matching ingestor.

    The file is fetched with ``archive.load_file(meta)``; every value of
    the mapping returned by ``get_ingestors()`` then scores it through
    ``match(meta, local_path)``. The candidate with the highest score
    strictly above 0 (first one on ties) is instantiated with
    ``source_id`` and its ``ingest`` method is called. If nothing scores
    above zero, this is logged at debug level and nothing is ingested.

    Any exception raised during matching or ingestion is logged and the
    database session is rolled back; ``archive.cleanup_file(meta)`` runs
    in every case once the file has been loaded.

    Returns:
        None.
    """
    local_path = archive.load_file(meta)
    winner, top_score = None, 0
    try:
        for candidate in get_ingestors().values():
            candidate_score = candidate.match(meta, local_path)
            if candidate_score > top_score:
                top_score, winner = candidate_score, candidate

        if winner is not None:
            log.debug("Dispatching %r to %r", meta.file_name,
                      winner.__name__)
            winner(source_id).ingest(meta, local_path)
        else:
            log.debug("No ingestor found for: %r", meta.file_name)
    except Exception as ex:
        # Boundary handler: record the failure and undo pending DB work.
        log.exception(ex)
        db.session.rollback()
    finally:
        archive.cleanup_file(meta)