示例#1
0
def extract(context, data):
    """Extract a compressed file and emit one record per extracted file.

    The archive is obtained through ``context.http.rehash(data)``. Its
    content type selects the ZIP, TAR or 7zip extractor; any other content
    type is logged as a warning and nothing is emitted.

    Every path returned by the extractor is stored with
    ``context.store_file`` and emitted as a copy of ``data`` carrying:

        content_hash: the value returned by ``context.store_file``
        file_name: the file's path relative to the extraction directory

    Note that ``data`` itself is updated in place on each iteration, so after
    the call it holds the values of the last extracted file.
    """
    with context.http.rehash(data) as result:
        file_path = result.file_path
        content_type = result.content_type
        extract_dir = random_filename(context.work_path)
        if content_type in ZIP_MIME_TYPES:
            extracted_files = extract_zip(file_path, extract_dir, context)
        elif content_type in TAR_MIME_TYPES:
            extracted_files = extract_tar(file_path, extract_dir, context)
        elif content_type in SEVENZIP_MIME_TYPES:
            extracted_files = extract_7zip(file_path, extract_dir, context)
        else:
            context.log.warning(
                "Unsupported archive content type: %s", content_type
            )
            return
        for path in extracted_files:
            data['content_hash'] = context.store_file(path)
            data['file_name'] = os.path.relpath(path, extract_dir)
            # Emit a copy so the next iteration's in-place update of ``data``
            # cannot alter a record that has already been emitted.
            context.emit(data=data.copy())
示例#2
0
def extract(context, data):
    """
    Unpack an archive and emit one record per extracted file.

    The archive comes from ``context.http.rehash(data)``; its content type
    picks the ZIP, TAR or 7zip extractor. Unsupported content types are
    logged as a warning and nothing is emitted.

    optional params in context:

        wildcards: shell-style wildcards; when given, only extracted files
            accepted by ``_test_fname`` for these patterns are stored
    """
    with context.http.rehash(data) as result:
        archive_path = result.file_path
        mime_type = result.content_type
        target_dir = random_filename(context.work_path)

        # Checked in order; the first group containing the content type wins.
        extractors = (
            (ZIP_MIME_TYPES, extract_zip),
            (TAR_MIME_TYPES, extract_tar),
            (SEVENZIP_MIME_TYPES, extract_7zip),
        )
        for mime_types, unpack in extractors:
            if mime_type in mime_types:
                members = unpack(archive_path, target_dir, context)
                break
        else:
            context.log.warning("Unsupported archive content type: %s",
                                mime_type)
            return

        # An empty or missing "wildcards" param disables filtering.
        patterns = ensure_list(context.params.get("wildcards")) or None
        for member in members:
            if patterns is not None and not _test_fname(patterns, member):
                continue
            name = os.path.relpath(member, target_dir)
            data["content_hash"] = context.store_file(member)
            data["file_name"] = name
            # ``data`` is reused across iterations, so emit a snapshot.
            context.emit(data=data.copy())
示例#3
0
 def store_data(self, data, encoding='utf-8'):
     """Write ``data`` to a temporary file and pass it to ``store_file``.

     A ``str`` is encoded with ``encoding`` before writing; any other
     non-``None`` value is written as given; ``None`` leaves the file
     empty. The temporary file is unlinked afterwards (errors while
     unlinking are ignored) and the result of ``self.store_file`` is
     returned.
     """
     tmp_path = random_filename(self.work_path)
     try:
         with open(tmp_path, 'wb') as out:
             payload = data.encode(encoding) if isinstance(data, str) else data
             if payload is not None:
                 out.write(payload)
         stored = self.store_file(tmp_path)
     finally:
         # Best-effort cleanup: the file may already be gone.
         try:
             os.unlink(tmp_path)
         except OSError:
             pass
     return stored
示例#4
0
 def fetch(self):
     """Lazily trigger download of the data when requested.

     Resolution order:

     1. return the path already cached in ``self._file_path``;
     2. if ``self._content_hash`` is known, load the file from ``storage``;
     3. if ``self.response`` is set, stream it into a new file under the
        context's work path while computing its SHA-1, archive the file
        in ``storage`` and, when caching is enabled and the response is
        ok, record the serialized result as a tag.

     Returns the local file path, or ``None`` when there is neither a
     content hash nor a response. If streaming the response fails, the
     partially written file is removed and the exception is re-raised, so
     ``self._file_path`` never points at a truncated download.
     """
     if self._file_path is not None:
         return self._file_path
     temp_path = self.context.work_path
     if self._content_hash is not None:
         self._file_path = storage.load_file(self._content_hash,
                                             temp_path=temp_path)
         return self._file_path
     if self.response is not None:
         # Local import keeps this block self-contained.
         import os

         # Write to a local path first and publish it on ``self`` only
         # once the download is complete; otherwise a failed download
         # would make later fetch() calls return a truncated file.
         file_path = random_filename(temp_path)
         content_hash = sha1()
         try:
             with open(file_path, 'wb') as fh:
                 for chunk in self.response.iter_content(chunk_size=8192):
                     content_hash.update(chunk)
                     fh.write(chunk)
         except BaseException:
             # Best-effort removal of the partial file, then propagate.
             try:
                 os.unlink(file_path)
             except OSError:
                 pass
             raise
         self._file_path = file_path
         # Presumably tells cleanup code elsewhere that this temporary
         # file is owned by this object - confirm against the class.
         self._remove_file = True
         chash = content_hash.hexdigest()
         self._content_hash = storage.archive_file(self._file_path,
                                                   content_hash=chash)
         if self.http.cache and self.ok:
             self.context.set_tag(self.request_id, self.serialize())
         # NOTE(review): retrieved_at is set after serialize() was cached
         # above; if serialize() includes it, the cached copy carries the
         # previous value - confirm whether this ordering is intended.
         self.retrieved_at = datetime.utcnow().isoformat()
     return self._file_path