示例#1
0
def ingest_directory(collection_id, meta, local_path, base_path=None,
                     move=False):
    """Ingest a path, recursing into it when it is a directory.

    A path that does not exist is logged as an error and skipped. A path
    that is not a directory is handed to ``ingest_file`` together with a
    child of ``meta`` whose ``source_path`` is ``base_path`` (which
    defaults to ``local_path``). For a directory, every ingestor class
    exposing a ``bundle`` attribute is instantiated and asked to bundle
    the directory; whatever it returns is treated as a list of claimed
    entries. Entries that are neither claimed nor in ``SKIP_ENTRIES`` are
    then ingested recursively.

    Returns the result of ``ingest_file`` for a non-directory path and
    ``None`` in every other case.
    """
    # This is somewhat hacky, see issue #55 for the rationale.
    if not os.path.exists(local_path):
        log.error("Invalid path: %r", local_path)
        return

    source_root = base_path or local_path
    if not os.path.isdir(local_path):
        file_meta = meta.make_child()
        file_meta.source_path = source_root
        return ingest_file(collection_id, file_meta, local_path, move=move)

    # Let bundle-aware ingestors claim entries first. The generator is
    # lazy, so each class is checked right before it is used.
    bundle_classes = (ingestor for ingestor in get_ingestors()
                      if hasattr(ingestor, 'bundle'))
    claimed = []
    for ingestor in bundle_classes:
        claimed.extend(ingestor(collection_id).bundle(meta, local_path))

    # Descend into everything that was neither skipped nor claimed.
    for name in os.listdir(local_path):
        child_path = os.path.join(local_path, string_value(name))
        child_base = os.path.join(source_root, string_value(name))
        if name not in SKIP_ENTRIES and name not in claimed:
            log.info("Handle [%s]: %s", meta.crawler_run, child_base)
            # Files and directories take the same route: the checks at
            # the top of this function tell them apart.
            ingest_directory(collection_id, meta, child_path,
                             base_path=child_base, move=move)
        else:
            log.debug("Ignore: %r", child_base)
示例#2
0
def ingest_directory(collection_id, meta, local_path, base_path=None,
                     move=False):
    """Ingest ``local_path``, walking downward if it is a directory.

    * Missing path: an error is logged and nothing else happens.
    * Non-directory path: a child of ``meta`` gets ``source_path`` set to
      ``base_path`` (or ``local_path`` when no base is given) and the
      path is passed to ``ingest_file``, whose result is returned.
    * Directory: ingestor classes that have a ``bundle`` attribute are
      instantiated with ``collection_id`` and their ``bundle`` results
      are collected as claimed entries; each listed entry that is not
      claimed and not in ``SKIP_ENTRIES`` is ingested via a recursive
      call. ``None`` is returned.
    """
    # This is somewhat hacky, see issue #55 for the rationale.
    if not os.path.exists(local_path):
        log.error("Invalid path: %r", local_path)
        return

    if base_path is None or not base_path:
        base_path = local_path

    if not os.path.isdir(local_path):
        leaf_meta = meta.make_child()
        leaf_meta.source_path = base_path
        return ingest_file(collection_id, leaf_meta, local_path, move=move)

    # Bundle handling: collect entries that a bundler takes care of.
    claimed = []
    for ingestor_cls in get_ingestors():
        if hasattr(ingestor_cls, 'bundle'):
            bundler = ingestor_cls(collection_id)
            claimed.extend(bundler.bundle(meta, local_path))

    # Recurse into the remaining directory entries.
    for item in os.listdir(local_path):
        item_path = os.path.join(local_path, string_value(item))
        item_base = os.path.join(base_path, string_value(item))
        skip = item in SKIP_ENTRIES or item in claimed
        if skip:
            log.debug("Ignore: %r", item_base)
            continue
        log.info("Handle [%s]: %s", meta.crawler_run, item_base)
        # No file/directory distinction here; the recursive call sorts
        # that out in its own preamble.
        ingest_directory(collection_id, meta, item_path,
                         base_path=item_base, move=move)
示例#3
0
文件: ingestor.py 项目: stefanw/aleph
 def auction_file(cls, meta, local_path):
     """Return the ingestor class that scores highest for this file.

     Every class in ``get_ingestors().values()`` is asked for a score via
     ``match(meta, local_path)``. Only scores strictly greater than the
     running best (starting at 0) can win, so ties keep the earlier class
     and ``None`` is returned when no class scores above 0.
     """
     winner, top_score = None, 0
     for candidate in get_ingestors().values():
         candidate_score = candidate.match(meta, local_path)
         if candidate_score > top_score:
             winner, top_score = candidate, candidate_score
     return winner
示例#4
0
 def auction_file(cls, meta, local_path):
     """Choose the best-matching ingestor class for a file.

     Each registered ingestor (the values of ``get_ingestors()``) rates
     the file through ``match(meta, local_path)``. The class with the
     strictly highest score above 0 is returned; the first one wins a
     tie. If nothing scores above 0 the result is ``None``.
     """
     highest = 0
     chosen = None
     for ingestor in get_ingestors().values():
         rating = ingestor.match(meta, local_path)
         if rating > highest:
             highest = rating
             chosen = ingestor
     return chosen
示例#5
0
 def auction_file(cls, meta, local_path):
     """Return the ingestor class that scores highest for this file.

     Each class yielded by ``get_ingestors()`` is scored with
     ``match(meta, local_path)``. A class only takes the lead with a
     score strictly above the current best (initially 0), so the first
     of several equal scores wins.

     Raises:
         IngestorException: if no class scores above 0.
     """
     winner, top_score = None, 0
     for candidate in get_ingestors():
         candidate_score = candidate.match(meta, local_path)
         if candidate_score > top_score:
             winner, top_score = candidate, candidate_score

     if winner is not None:
         return winner

     details = (meta.file_name, meta.extension, meta.mime_type)
     raise IngestorException("No ingestor found: %r (%s, %s)" % details)
示例#6
0
 def auction_file(cls, meta, local_path):
     """Choose the best-matching ingestor class for a file.

     All ingestor classes from ``get_ingestors()`` rate the file through
     ``match(meta, local_path)``; the strictly highest rating above 0
     wins and the earliest class is kept on a tie.

     Raises:
         IngestorException: when no class rates the file above 0. The
             message carries the file name, extension and MIME type
             taken from ``meta``.
     """
     highest = 0
     chosen = None
     for ingestor in get_ingestors():
         rating = ingestor.match(meta, local_path)
         if rating > highest:
             highest = rating
             chosen = ingestor

     if chosen is None:
         message = "No ingestor found: %r (%s, %s)" % (
             meta.file_name, meta.extension, meta.mime_type)
         raise IngestorException(message)
     return chosen
示例#7
0
 def dispatch(cls, source_id, meta):
     """Load a file from the archive and run the best-scoring ingestor.

     The file is loaded with ``archive.load_file(meta)``. Every class in
     ``get_ingestors().values()`` then scores it via
     ``match(meta, local_path)``; the strictly highest score above 0
     wins, with the earliest class kept on a tie. The winner is
     instantiated with ``source_id`` and its ``ingest`` method is called.
     If no class scores above 0, this is logged at debug level and
     nothing is ingested.

     Any exception raised while scoring or ingesting is logged and the
     database session is rolled back; it is not re-raised.
     ``archive.cleanup_file(meta)`` always runs once the file has been
     loaded. Always returns ``None``.
     """
     local_path = archive.load_file(meta)
     winner, top_score = None, 0
     try:
         for candidate in get_ingestors().values():
             candidate_score = candidate.match(meta, local_path)
             if candidate_score > top_score:
                 winner, top_score = candidate, candidate_score

         if winner is not None:
             log.debug("Dispatching %r to %r", meta.file_name,
                       winner.__name__)
             winner(source_id).ingest(meta, local_path)
         else:
             log.debug("No ingestor found for: %r", meta.file_name)
     except Exception as exc:
         # Boundary handler: record the failure and discard any partial
         # database work instead of propagating to the caller.
         log.exception(exc)
         db.session.rollback()
     finally:
         archive.cleanup_file(meta)