def _close(self):
    try:
        self.local_file.seek(0)
        with fsopen(self.kb_path, mode='wb') as kb_file:
            copy_file(self.local_file, kb_file)
    finally:
        self.local_file.close()
def _download_to_docs_or_figs(
    self,
    document=None,
    figure=None,
    src_records=(),
    only_new=False,
):
    if not document and not figure:
        raise TypeError(
            'No document nor figure passed, at least one is needed.')

    is_document = bool(document)
    doc_or_fig_obj = self._resolve_doc_or_fig_url(
        doc_or_fig_obj=document or figure,
        src_records=src_records,
        only_new=only_new,
    )
    if doc_or_fig_obj['url'].startswith('/api/files/'):
        return self.add_document_or_figure(
            metadata=doc_or_fig_obj,
            key=doc_or_fig_obj['key'],
            is_document=is_document,
        )

    key = doc_or_fig_obj['key']
    if key not in self.files:
        key = self._get_unique_files_key(base_file_name=key)

    stream = fsopen(doc_or_fig_obj['url'], mode='rb')
    return self.add_document_or_figure(
        metadata=doc_or_fig_obj,
        key=key,
        stream=stream,
        is_document=is_document,
    )
def file_opener(path, mode='r'):
    """File opener.

    :param path (str): the full path of the file
    :param mode (str): mode in which to open the file
    """
    return fsopen(path, mode=mode)
def retrieve_uri(uri, outdir=None):
    """Retrieves the given uri and stores it in a temporary file."""
    with tempfile.NamedTemporaryFile(prefix='inspire', dir=outdir) as local_file, \
            fsopen(uri, mode='rb') as remote_file:
        copy_file(remote_file, local_file)
        local_file.flush()
        yield local_file.name
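# A minimal usage sketch for the generator form of retrieve_uri above, assuming
# it is meant to be used through contextlib.contextmanager (the decorator is
# not shown in the snippet). The URI and helper name below are placeholders.
from contextlib import contextmanager

retrieve_uri_ctx = contextmanager(retrieve_uri)

def read_remote_file(uri):
    # The temporary copy only exists inside the with block; it is deleted
    # automatically when the block exits.
    with retrieve_uri_ctx(uri) as local_path:
        with open(local_path, 'rb') as local_file:
            return local_file.read()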
def retrieve_uri(uri, outdir=None):
    """Retrieves the given uri and stores it in a temporary file."""
    local_file = tempfile.NamedTemporaryFile(
        prefix='inspire',
        dir=outdir,
        delete=False,
    )
    try:
        with fsopen(uri, mode='rb') as remote_file:
            copy_file(remote_file, local_file)
    finally:
        local_file.close()

    return local_file.name
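# A minimal usage sketch for the delete=False variant of retrieve_uri above:
# here the caller owns the temporary file and is responsible for removing it.
# The URI and helper name are placeholders.
import os

def read_and_discard(uri):
    local_path = retrieve_uri(uri)
    try:
        with open(local_path, 'rb') as local_file:
            return local_file.read()
    finally:
        # Clean up the temporary copy, since retrieve_uri no longer does.
        os.remove(local_path)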
def _download_file_from_url(self, url):
    """Downloads a file and calculates its hash.

    If everything is ok, the file is added to the files of the current record.
    If a file with the same hash is already found in the db, that one is
    reused instead of creating a duplicate (via the ``ObjectVersion.copy()``
    method).

    Args:
        url (str): Local or remote url/filepath

    Returns:
        str: key (sha-1) of the downloaded file

    Raises:
        ValueError: can be raised in `self.hash_data` method if no data is
            provided

    Example:
        >>> self._download_file_from_url('http://example.com/url_to_file.pdf')
        '207611e7bf8a83f0739bb2e16a1a7cf0d585fb5f'
    """
    stream = fsopen(url, mode="rb")
    # TODO: change to stream.read() when fs will be updated to >= 2.0
    # As HTTPOpener is not working with size = -1
    # (and read() method sets this size as default)
    # This is a workaround until we update to fs > 2.0
    data = stream._f.wrapped_file.read()
    key = self.hash_data(data=data)
    if key not in self.files.keys:
        file = self._find_local_file(key=key)
        new_key = None
        if file:
            LOGGER.debug("Same file found locally, trying to copy", uuid=self.id)
            try:
                new_key = self._copy_local_file(file, key)
            except (ValueError, AttributeError):
                pass
        if not new_key:
            LOGGER.debug("Adding file to record", key=key, uuid=self.id)
            self.files[key] = BytesIO(data)
    else:
        LOGGER.debug("File already attached to record", key=key, uuid=self.id)
    return key
def extract_world_archive(event, context, flog):
    flog.info('Starting world archive extraction...')

    bucket_name = event['bucket']['name']
    object_key = event['object']['key']
    flog.debug('Event object: %s::%s', bucket_name, object_key)

    # TODO: error handling
    api_key = os.path.splitext(os.path.split(object_key)[1])[0]
    world = World.select().where(World.api_key == api_key).get()
    user = world.user
    flog.info('Extracting for user::world: %s:%s', user.guid, world.guid)

    object_fd = fsopen('s3://{bucket}/{key}'.format(
        bucket=bucket_name,
        key=object_key,
    ), 'rb')
    archive_fs = ZipFS(object_fd, 'r')
    dest_fs = fsopendir('s3://{bucket}/'.format(bucket=bucket_name))
    dest_prefix = 'worlds/{user_guid}/{world_guid}/'.format(
        user_guid=user.guid,
        world_guid=world.guid,
    )

    for fn in archive_fs.walkfiles(wildcard='level.dat'):
        level_dat_fn = fn
        break
    flog.debug('Found level.dat at: %s', level_dat_fn)
    archive_fs = archive_fs.opendir(os.path.dirname(level_dat_fn))

    flog.info('Extracting level.dat')
    # TODO: make sure these paths are actually safe
    dest_fs.setcontents(
        safe_path_join(dest_prefix, 'level.dat'),
        archive_fs.getcontents('level.dat'))

    for region_fn in archive_fs.walkfiles(wildcard='*.mca'):
        flog.info('Extracting file: %s', region_fn)
        dest_fs.setcontents(
            safe_path_join(dest_prefix, region_fn),
            archive_fs.getcontents(region_fn))

    flog.info('Finished world archive extraction')
def render_region_heightmap(event, context, flog):
    src_bucket = event['bucket']['name']
    src_obj_key = event['object']['key']

    dest_vfs = fsopendir('s3://{bucket}/'.format(bucket='quarry-output'))
    dest_image_fn = os.path.join('heightmaps', *src_obj_key.split('/')[1:]) + '.png'

    src_region = RegionFile(fileobj=fsopen('s3://{bucket}/{key}'.format(
        bucket=src_bucket, key=src_obj_key), 'rb'))

    img = Image.new('L', (512, 512))
    for chunk in src_region.get_metadata():
        chunk_data = src_region.get_nbt(chunk.x, chunk.z)
        heightmap_data = numpy.array(
            chunk_data['Level']['HeightMap'],
            dtype=numpy.uint8).reshape((16, 16))
        img.paste(Image.fromarray(heightmap_data), box=(chunk.x * 16, chunk.z * 16))

    with dest_vfs.open(dest_image_fn, 'wb') as image_handle:
        img.save(image_handle, format='PNG')
def _download_to_docs_or_figs(
    self,
    document=None,
    figure=None,
    src_records=(),
    only_new=False,
):
    if not document and not figure:
        raise TypeError(
            'No document nor figure passed, at least one is needed.'
        )

    is_document = bool(document)
    doc_or_fig_obj = self._resolve_doc_or_fig_url(
        doc_or_fig_obj=document or figure,
        src_records=src_records,
        only_new=only_new,
    )
    if doc_or_fig_obj['url'].startswith('/api/files/'):
        return self.add_document_or_figure(
            metadata=doc_or_fig_obj,
            key=doc_or_fig_obj['key'],
            is_document=is_document,
        )

    key = doc_or_fig_obj['key']
    if key not in self.files:
        key = self._get_unique_files_key(base_file_name=key)

    url = doc_or_fig_obj['url']
    scheme = urlparse(url).scheme
    if scheme == 'file':
        url = unquote(url)

    stream = fsopen(url, mode='rb')
    return self.add_document_or_figure(
        metadata=doc_or_fig_obj,
        key=key,
        stream=stream,
        is_document=is_document,
    )
def upload(service):
    if 'files' in request.form:
        session['return_url'] = request.form['return_url']
        files = request.form['files']
        session['files_to_upload'] = files[2:-2].split("', '")

    filesystem = _build_file_system(service)
    files = session.pop('files_to_upload')

    from invenio.legacy.bibdocfile.api import bibdocfile_url_to_bibdocfile
    try:
        for one in files:
            docfile = bibdocfile_url_to_bibdocfile(one)
            f = fsopen(docfile.get_full_path(), 'r')
            n = filesystem.open(docfile.get_full_name(), "w")
            n.write(f.read())
            n.close()
        flash("All files uploaded successfully", 'info')
    except Exception:
        flash("Something went wrong, please try again", 'error')
    return redirect(session.pop('return_url'))
def copy_file_to_workflow(workflow, name, url):
    url = unquote(url)
    stream = fsopen(url, mode='rb')
    workflow.files[name] = stream
    return workflow.files[name]
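# A minimal usage sketch for copy_file_to_workflow above. The workflow object,
# file name and URL are placeholders; any URL that fsopen understands
# (file://, http://, ...) should work, since the function only unquotes it and
# opens it as a stream under the given key.
def attach_preprint(workflow):
    return copy_file_to_workflow(
        workflow,
        'preprint.pdf',
        'file:///tmp/preprint.pdf',
    )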
def get_source(spec, cache_fs, account_accessor=None, clean=False, logger=None,
               cwd=None, callback=None):
    """
    Download a file from a URL and return it wrapped in a row-generating accessor object.

    :param cwd: Current working directory, for relative file: urls.
    :param spec: A SourceSpec that describes the source to fetch.
    :param cache_fs: A pyfilesystem filesystem to use for caching downloaded files.
    :param account_accessor: A callable to return the username and password to use
        for accessing FTP and S3 URLs.
    :param clean: Delete files in cache and re-download.
    :param logger: A logger, for logging.
    :param callback: A callback, called while reading files in download.
        signature is f(read_len, total_len)
    :return: a SourceFile object.
    """
    from fs.zipfs import ZipOpenError
    import os

    # FIXME. urltype should be moved to reftype.
    url_type = spec.get_urltype()

    def do_download():
        return download(spec.url, cache_fs, account_accessor, clean=clean,
                        logger=logger, callback=callback)

    if url_type == 'file':
        from fs.opener import fsopen

        syspath = spec.url.replace('file://', '')
        cache_path = syspath.replace('/', '_').strip('_')
        fs_path = os.path.join(cwd, syspath)

        contents = fsopen(fs_path).read()
        cache_fs.setcontents(cache_path, contents)

    elif url_type not in ('gs', 'socrata'):
        # FIXME. Need to clean up the logic for gs types.
        try:
            cache_path, download_time = do_download()
            spec.download_time = download_time
        except HTTPError as e:
            raise DownloadError("Failed to download {}; {}".format(spec.url, e))
    else:
        cache_path, download_time = None, None

    if url_type == 'zip':
        try:
            fstor = extract_file_from_zip(cache_fs, cache_path, spec.url, spec.file)
        except ZipOpenError:
            # Try it again
            cache_fs.remove(cache_path)
            cache_path, spec.download_time = do_download()
            fstor = extract_file_from_zip(cache_fs, cache_path, spec.url, spec.file)

        file_type = spec.get_filetype(fstor.path)

    elif url_type == 'gs':
        fstor = get_gs(spec.url, spec.segment, account_accessor)
        file_type = 'gs'

    elif url_type == 'socrata':
        spec.encoding = 'utf8'
        spec.header_lines = [0]
        spec.start_line = 1
        url = SocrataSource.download_url(spec)
        fstor = DelayedDownload(url, cache_fs)
        file_type = 'socrata'

    else:
        fstor = DelayedOpen(cache_fs, cache_path, 'rb')
        file_type = spec.get_filetype(fstor.path)

    spec.filetype = file_type

    TYPE_TO_SOURCE_MAP = {
        'gs': GoogleSource,
        'csv': CsvSource,
        'tsv': TsvSource,
        'fixed': FixedSource,
        'txt': FixedSource,
        'xls': ExcelSource,
        'xlsx': ExcelSource,
        'partition': PartitionSource,
        'shape': ShapefileSource,
        'socrata': SocrataSource,
    }

    cls = TYPE_TO_SOURCE_MAP.get(file_type)

    if cls is None:
        raise SourceError(
            "Failed to determine file type for source '{}'; unknown type '{}' "
            .format(spec.name, file_type))

    return cls(spec, fstor)
def _flush(self):
    with fsopen(self.kb_path, mode='wb') as fd:
        fd.write(''.join(self.data_buffer))

    self.data_buffer = []