def download_data(self, format="", previous_export=None, filter=None): """ If there is data, return an HTTPResponse with the appropriate data. If there is not data returns None. """ from couchexport.shortcuts import export_response from couchexport.export import get_writer, get_schema_new, format_tables, create_intermediate_tables if not format: format = self.default_format or Format.XLS_2007 from couchexport.export import ExportConfiguration database = get_db() config = ExportConfiguration(database, self.index, previous_export, util.intersect_filters(self.filter, filter)) # get and checkpoint the latest schema updated_schema = get_schema_new(config) export_schema_checkpoint = ExportSchema( seq=config.current_seq, schema=updated_schema, index=config.schema_index ) export_schema_checkpoint.save() # transform docs onto output and save writer = get_writer(format) # open the doc and the headers formatted_headers = self.get_table_headers() tmp = StringIO() writer.open(formatted_headers, tmp) for doc in config.get_docs(): writer.write(self.trim(format_tables(create_intermediate_tables(doc, updated_schema), separator="."))) writer.close() return export_response(tmp, format, self.name)
def get_export_files(self, format='', previous_export_id=None, filter=None,
                     use_cache=True, max_column_size=2000, separator='|',
                     process=None, **kwargs):
    # the APIs of how these methods are broken down suck, but at least
    # it's DRY
    from couchexport.export import get_writer, get_export_components, get_headers, get_formatted_rows
    from django.core.cache import cache
    import hashlib

    export_tag = self.index

    CACHE_TIME = 1 * 60 * 60  # cache for 1 hour, in seconds

    def _build_cache_key(tag, prev_export_id, format, max_column_size):
        def _human_readable_key(tag, prev_export_id, format, max_column_size):
            return "couchexport_:%s:%s:%s:%s" % (tag, prev_export_id, format, max_column_size)
        return hashlib.md5(_human_readable_key(
            tag, prev_export_id, format, max_column_size)).hexdigest()

    # check cache, only supported for filterless queries, currently
    cache_key = _build_cache_key(export_tag, previous_export_id, format, max_column_size)
    if use_cache and filter is None:
        cached_data = cache.get(cache_key)
        if cached_data:
            (tmp, checkpoint) = cached_data
            return ExportFiles(tmp, checkpoint)

    fd, path = tempfile.mkstemp()
    with os.fdopen(fd, 'wb') as tmp:
        schema_index = export_tag
        config, updated_schema, export_schema_checkpoint = get_export_components(
            schema_index, previous_export_id, filter)
        if config:
            writer = get_writer(format)

            # get cleaned up headers
            formatted_headers = self.remap_tables(
                get_headers(updated_schema, separator=separator))
            writer.open(formatted_headers, tmp, max_column_size=max_column_size)

            total_docs = len(config.potentially_relevant_ids)
            if process:
                DownloadBase.set_progress(process, 0, total_docs)
            for i, doc in config.enum_docs():
                if self.transform:
                    doc = self.transform(doc)
                writer.write(self.remap_tables(get_formatted_rows(
                    doc, updated_schema, include_headers=False,
                    separator=separator)))
                if process:
                    DownloadBase.set_progress(process, i + 1, total_docs)
            writer.close()
        checkpoint = export_schema_checkpoint

    if checkpoint:
        if use_cache:
            cache.set(cache_key, (path, checkpoint), CACHE_TIME)
        return ExportFiles(path, checkpoint)

    return None

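# A minimal, standalone sketch of the cache-key pattern used above: build a
# human-readable key, then hash it so the final key has a fixed length and no
# characters a backend like memcached would reject. `make_cache_key` is a
# hypothetical name for illustration, not part of couchexport.
import hashlib


def make_cache_key(tag, prev_export_id, format, max_column_size):
    readable = "couchexport_:%s:%s:%s:%s" % (tag, prev_export_id, format, max_column_size)
    # md5 is used here for key derivation only, not for security
    return hashlib.md5(readable.encode('utf-8')).hexdigest()
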
def generate_bulk_file(self):
    configs = list()
    schemas = list()
    checkpoints = list()

    for export_object in self.export_objects:
        config, schema, checkpoint = export_object.get_export_components(filter=self.export_filter)
        configs.append(config)
        schemas.append(schema)
        checkpoints.append(checkpoint)

    writer = get_writer(self.format)

    # generate the headers for the bulk excel file
    headers = self.generate_table_headers(schemas, checkpoints)

    fd, path = tempfile.mkstemp()
    with os.fdopen(fd, 'wb') as tmp:
        writer.open(headers, tmp)

        # now that the headers are set, let's build the rows
        for i, config in enumerate(configs):
            for doc in config.get_docs():
                if self.export_objects[i].transform:
                    doc = self.export_objects[i].transform(doc)
                table = format_tables(create_intermediate_tables(doc, schemas[i]),
                                      include_headers=isinstance(self, CustomBulkExport),
                                      separator=self.separator)
                if isinstance(self, CustomBulkExport):
                    table = self.export_objects[i].trim(table, doc)
                table = self.export_objects[i].parse_tables(table)
                writer.write(table)
        writer.close()
    return path

def _get_writer(export_instances):
    """
    Return a new _Writer
    """
    format = Format.XLS_2007
    if len(export_instances) == 1:
        format = export_instances[0].export_format

    legacy_writer = get_writer(format)
    writer = _Writer(legacy_writer)
    return writer

def generate_bulk_file(self, update_progress=None):
    update_progress = update_progress or (lambda x: x)
    configs = list()
    schemas = list()
    checkpoints = list()

    for export_object in self.export_objects:
        config, schema, checkpoint = export_object.get_export_components(filter=self.export_filter)
        configs.append(config)
        schemas.append(schema)
        checkpoints.append(checkpoint)

    writer = get_writer(self.format)

    # generate the headers for the bulk excel file
    headers = self.generate_table_headers(schemas, checkpoints)

    fd, path = tempfile.mkstemp()
    with os.fdopen(fd, 'wb') as tmp:
        writer.open(headers, tmp)

        # now that the headers are set, let's build the rows
        for i, config in enumerate(configs):
            try:
                for doc in config.get_docs():
                    if self.export_objects[i].transform:
                        doc = self.export_objects[i].transform(doc)
                    table = get_formatted_rows(
                        doc, schemas[i], separator=self.separator,
                        include_headers=isinstance(self, CustomBulkExport))
                    if isinstance(self, CustomBulkExport):
                        table = self.export_objects[i].trim(table, doc)
                    if table and table[0]:
                        # if an export only contains data from repeats and a form has no repeats
                        # then the table list will be empty
                        table = self.export_objects[i].parse_tables(table)
                        writer.write(table)
            except SchemaMismatchException:
                # fire off a delayed force update to prevent this from happening again
                rebuild_schemas.delay(self.export_objects[i].index)
                writer.write(
                    [(self.export_objects[i].table_name, [
                        FormattedRow([
                            ugettext(
                                'There was an error generating this export. '
                                'If the problem persists please report an issue.'
                            )], separator=self.separator)
                    ])]
                )
            update_progress(i + 1)
        writer.close()
    return path

def get_export_writer(export_instances, temp_path, allow_pagination=True):
    """
    Return a new _Writer
    """
    format = Format.XLS_2007
    if len(export_instances) == 1:
        format = export_instances[0].export_format

    legacy_writer = get_writer(format)
    if allow_pagination and PAGINATED_EXPORTS.enabled(export_instances[0].domain):
        writer = _PaginatedExportWriter(legacy_writer, temp_path)
    else:
        writer = _ExportWriter(legacy_writer, temp_path)

    return writer

def get_export_writer(export_instances, allow_pagination=True):
    """
    Return a new _Writer
    """
    format = Format.XLS_2007
    if len(export_instances) == 1:
        format = export_instances[0].export_format

    legacy_writer = get_writer(format)
    if allow_pagination and PAGINATED_EXPORTS.enabled(
            export_instances[0].domain):
        writer = _PaginatedExportWriter(legacy_writer)
    else:
        writer = _ExportWriter(legacy_writer)

    return writer

def get_export_files(self, format=None, previous_export=None, filter=None,
                     process=None, max_column_size=None, apply_transforms=True,
                     limit=0, **kwargs):
    from couchexport.export import get_writer, get_formatted_rows
    if not format:
        format = self.default_format or Format.XLS_2007

    config, updated_schema, export_schema_checkpoint = self.get_export_components(previous_export, filter)

    # transform docs onto output and save
    writer = get_writer(format)

    # open the doc and the headers
    formatted_headers = list(self.get_table_headers())
    fd, path = tempfile.mkstemp()
    with os.fdopen(fd, 'wb') as tmp:
        writer.open(
            formatted_headers,
            tmp,
            max_column_size=max_column_size,
            table_titles=dict([
                (table.index, table.display)
                for table in self.tables if table.display
            ])
        )

        total_docs = len(config.potentially_relevant_ids)
        if process:
            DownloadBase.set_progress(process, 0, total_docs)
        for i, doc in config.enum_docs():
            if limit and i > limit:
                break
            if self.transform and apply_transforms:
                doc = self.transform(doc)
            formatted_tables = self.trim(
                get_formatted_rows(doc, updated_schema, separator="."),
                doc,
                apply_transforms=apply_transforms
            )
            writer.write(formatted_tables)
            if process:
                DownloadBase.set_progress(process, i + 1, total_docs)
        writer.close()

    if format == Format.PYTHON_DICT:
        return writer.get_preview()

    return ExportFiles(path, export_schema_checkpoint, format)

def get_export_files(self, format=None, previous_export=None, filter=None,
                     process=None, max_column_size=None, apply_transforms=True,
                     limit=0, **kwargs):
    from couchexport.export import get_writer, format_tables, create_intermediate_tables
    if not format:
        format = self.default_format or Format.XLS_2007

    config, updated_schema, export_schema_checkpoint = self.get_export_components(previous_export, filter)

    # transform docs onto output and save
    writer = get_writer(format)

    # open the doc and the headers
    formatted_headers = list(self.get_table_headers())
    fd, path = tempfile.mkstemp()
    with os.fdopen(fd, 'wb') as tmp:
        writer.open(
            formatted_headers,
            tmp,
            max_column_size=max_column_size,
            table_titles=dict([
                (table.index, table.display)
                for table in self.tables if table.display
            ])
        )

        total_docs = len(config.potentially_relevant_ids)
        if process:
            DownloadBase.set_progress(process, 0, total_docs)
        for i, doc in config.enum_docs():
            if limit and i > limit:
                break
            if self.transform and apply_transforms:
                doc = self.transform(doc)
            formatted_tables = self.trim(
                format_tables(
                    create_intermediate_tables(doc, updated_schema),
                    separator="."
                ),
                doc,
                apply_transforms=apply_transforms
            )
            writer.write(formatted_tables)
            if process:
                DownloadBase.set_progress(process, i + 1, total_docs)
        writer.close()

    return ExportFiles(path, export_schema_checkpoint, format)

def get_export_writer(export_instances, temp_path, allow_pagination=True):
    """
    Return a new _Writer
    """
    format = Format.XLS_2007
    format_data_in_excel = False

    if len(export_instances) == 1:
        format = export_instances[0].export_format
        format_data_in_excel = export_instances[0].format_data_in_excel

    legacy_writer = get_writer(format, use_formatted_cells=format_data_in_excel)
    if allow_pagination and PAGINATED_EXPORTS.enabled(export_instances[0].domain):
        writer = _PaginatedExportWriter(legacy_writer, temp_path)
    else:
        writer = _ExportWriter(legacy_writer, temp_path)

    return writer

def __init__(self, export_instance, total_docs, num_processes):
    self.export_instance = export_instance
    self.results = []
    self.progress_queue = multiprocessing.Queue()
    self.progress = multiprocessing.Process(target=_output_progress,
                                            args=(self.progress_queue, total_docs))
    self.export_function = run_export_with_logging

    def _set_queue(queue):
        """Set the progress queue as an attribute on the function

        You can't pass this as an arg"""
        self.export_function.queue = queue

    self.pool = multiprocessing.Pool(
        processes=num_processes,
        initializer=_set_queue,
        initargs=[self.progress_queue]
    )

    self.is_zip = isinstance(get_writer(export_instance.export_format), ZippedExportWriter)
    self.premature_exit = False

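# A minimal, self-contained sketch of the pattern in the __init__ above: a
# multiprocessing.Queue cannot be passed to pool workers as an ordinary task
# argument, so it is handed over via the pool's `initializer` hook and stashed
# as an attribute on the worker function. The names `work` and `_init_worker`
# are hypothetical, for illustration only.
import multiprocessing


def _init_worker(queue):
    # runs once in each worker process, before any tasks
    work.queue = queue


def work(n):
    work.queue.put(n * n)  # report through the shared queue
    return n * n


if __name__ == '__main__':
    progress_queue = multiprocessing.Queue()
    pool = multiprocessing.Pool(processes=2, initializer=_init_worker,
                                initargs=[progress_queue])
    pool.map(work, range(4))
    pool.close()
    pool.join()
    for _ in range(4):
        print(progress_queue.get())
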
def generate_bulk_file(self):
    configs = list()
    schemas = list()
    checkpoints = list()

    for export_object in self.export_objects:
        config, schema, checkpoint = export_object.get_export_components(
            filter=self.export_filter)
        configs.append(config)
        schemas.append(schema)
        checkpoints.append(checkpoint)

    writer = get_writer(self.format)

    # generate the headers for the bulk excel file
    headers = self.generate_table_headers(schemas, checkpoints)

    fd, path = tempfile.mkstemp()
    with os.fdopen(fd, 'wb') as tmp:
        writer.open(headers, tmp)

        # now that the headers are set, let's build the rows
        for i, config in enumerate(configs):
            for doc in config.get_docs():
                if self.export_objects[i].transform:
                    doc = self.export_objects[i].transform(doc)
                table = format_tables(
                    create_intermediate_tables(doc, schemas[i]),
                    include_headers=isinstance(self, CustomBulkExport),
                    separator=self.separator)
                if isinstance(self, CustomBulkExport):
                    table = self.export_objects[i].trim(table, doc)
                table = self.export_objects[i].parse_tables(table)
                writer.write(table)
        writer.close()
    return path

def __init__(self, file, format):
    self._headers = {}
    self.writer = get_writer(format.slug)
    self.file = file
    self.writer.open((), file)

def test_multiple_write_export_instance_calls(self):
    """
    Confirm that calling _write_export_instance() multiple times
    (as part of a bulk export) works as expected.
    """
    export_instances = [
        ExportInstance(
            # export_format=Format.JSON,
            tables=[
                TableConfiguration(
                    label="My table",
                    path=[],
                    columns=[
                        ExportColumn(
                            label="Q3",
                            item=ScalarItem(
                                path=['form', 'q3'],
                            ),
                            selected=True,
                        ),
                    ]
                ),
            ]
        ),
        ExportInstance(
            # export_format=Format.JSON,
            tables=[
                TableConfiguration(
                    label="My other table",
                    path=['form', 'q2'],
                    columns=[
                        ExportColumn(
                            label="Q4",
                            item=ScalarItem(
                                path=['form', 'q2', 'q4'],
                            ),
                            selected=True,
                        ),
                    ]
                )
            ]
        )
    ]

    writer = _Writer(get_writer(Format.JSON))
    with writer.open(_get_tables(export_instances)):
        _write_export_instance(writer, export_instances[0], self.docs)
        _write_export_instance(writer, export_instances[1], self.docs)

    with ExportFile(writer.path, writer.format) as export:
        self.assertEqual(
            json.loads(export),
            {
                u'My table': {
                    u'headers': [u'Q3'],
                    u'rows': [[u'baz'], [u'bop']],
                },
                u'My other table': {
                    u'headers': [u'Q4'],
                    u'rows': [[u'bar'], [u'boop']],
                }
            }
        )

def test_multiple_write_export_instance_calls(self):
    """
    Confirm that calling _write_export_instance() multiple times
    (as part of a bulk export) works as expected.
    """
    export_instances = [
        FormExportInstance(
            # export_format=Format.JSON,
            tables=[
                TableConfiguration(
                    label="My table",
                    selected=True,
                    path=[],
                    columns=[
                        ExportColumn(
                            label="Q3",
                            item=ScalarItem(
                                path=[PathNode(name='form'), PathNode(name='q3')],
                            ),
                            selected=True,
                        ),
                    ]
                ),
            ]
        ),
        FormExportInstance(
            # export_format=Format.JSON,
            tables=[
                TableConfiguration(
                    label="My other table",
                    selected=True,
                    path=[PathNode(name="form", is_repeat=False),
                          PathNode(name="q2", is_repeat=False)],
                    columns=[
                        ExportColumn(
                            label="Q4",
                            item=ScalarItem(
                                path=[PathNode(name='form'),
                                      PathNode(name='q2'),
                                      PathNode(name='q4')],
                            ),
                            selected=True,
                        ),
                    ]
                )
            ]
        )
    ]

    writer = _Writer(get_writer(Format.JSON))
    with writer.open(export_instances):
        _write_export_instance(writer, export_instances[0], self.docs)
        _write_export_instance(writer, export_instances[1], self.docs)

    with ExportFile(writer.path, writer.format) as export:
        self.assertEqual(
            json.loads(export.read()),
            {
                u'Export1-My table': {
                    u'headers': [u'Q3'],
                    u'rows': [[u'baz'], [u'bop']],
                },
                u'Export2-My other table': {
                    u'headers': [u'Q4'],
                    u'rows': [[u'bar'], [u'boop']],
                }
            }
        )

def test_multiple_write_export_instance_calls(self, export_save):
    """
    Confirm that calling write_export_instance() multiple times
    (as part of a bulk export) works as expected.
    """
    export_instances = [
        FormExportInstance(tables=[
            TableConfiguration(
                label="My table",
                selected=True,
                path=[],
                columns=[
                    ExportColumn(
                        label="Q3",
                        item=ScalarItem(
                            path=[PathNode(name='form'), PathNode(name='q3')],
                        ),
                        selected=True,
                    ),
                ]
            ),
        ]),
        FormExportInstance(tables=[
            TableConfiguration(
                label="My other table",
                selected=True,
                path=[
                    PathNode(name="form", is_repeat=False),
                    PathNode(name="q2", is_repeat=False)
                ],
                columns=[
                    ExportColumn(
                        label="Q4",
                        item=ScalarItem(
                            path=[
                                PathNode(name='form'),
                                PathNode(name='q2'),
                                PathNode(name='q4')
                            ],
                        ),
                        selected=True,
                    ),
                ]
            )
        ]),
        FormExportInstance(tables=[
            TableConfiguration(
                label="My other table",
                selected=True,
                path=[
                    PathNode(name="form", is_repeat=False),
                    PathNode(name="q2", is_repeat=False)
                ],
                columns=[
                    ExportColumn(
                        label="Q4",
                        item=ScalarItem(
                            path=[
                                PathNode(name='form'),
                                PathNode(name='q2'),
                                PathNode(name='q4')
                            ],
                        ),
                        selected=True,
                    ),
                ]
            )
        ]),
    ]

    with TransientTempfile() as temp_path:
        writer = _ExportWriter(get_writer(Format.JSON), temp_path)
        with writer.open(export_instances):
            write_export_instance(writer, export_instances[0], self.docs)
            write_export_instance(writer, export_instances[1], self.docs)
            write_export_instance(writer, export_instances[2], self.docs)

        with ExportFile(writer.path, writer.format) as export:
            self.assertEqual(
                json.loads(export.read()),
                {
                    'My table': {
                        'headers': ['Q3'],
                        'rows': [['baz'], ['bop']],
                    },
                    'Export2-My other table': {
                        'headers': ['Q4'],
                        'rows': [['bar'], ['boop']],
                    },
                    'Export3-My other table': {
                        'headers': ['Q4'],
                        'rows': [['bar'], ['boop']],
                    },
                }
            )

        self.assertTrue(export_save.called)

def test_multiple_write_export_instance_calls(self, export_save):
    """
    Confirm that calling write_export_instance() multiple times
    (as part of a bulk export) works as expected.
    """
    export_instances = [
        FormExportInstance(
            tables=[
                TableConfiguration(
                    label="My table",
                    selected=True,
                    path=[],
                    columns=[
                        ExportColumn(
                            label="Q3",
                            item=ScalarItem(
                                path=[PathNode(name='form'), PathNode(name='q3')],
                            ),
                            selected=True,
                        ),
                    ]
                ),
            ]
        ),
        FormExportInstance(
            tables=[
                TableConfiguration(
                    label="My other table",
                    selected=True,
                    path=[PathNode(name="form", is_repeat=False),
                          PathNode(name="q2", is_repeat=False)],
                    columns=[
                        ExportColumn(
                            label="Q4",
                            item=ScalarItem(
                                path=[PathNode(name='form'),
                                      PathNode(name='q2'),
                                      PathNode(name='q4')],
                            ),
                            selected=True,
                        ),
                    ]
                )
            ]
        ),
        FormExportInstance(
            tables=[
                TableConfiguration(
                    label="My other table",
                    selected=True,
                    path=[PathNode(name="form", is_repeat=False),
                          PathNode(name="q2", is_repeat=False)],
                    columns=[
                        ExportColumn(
                            label="Q4",
                            item=ScalarItem(
                                path=[PathNode(name='form'),
                                      PathNode(name='q2'),
                                      PathNode(name='q4')],
                            ),
                            selected=True,
                        ),
                    ]
                )
            ]
        )
    ]

    with TransientTempfile() as temp_path:
        writer = _ExportWriter(get_writer(Format.JSON), temp_path)
        with writer.open(export_instances):
            write_export_instance(writer, export_instances[0], self.docs)
            write_export_instance(writer, export_instances[1], self.docs)
            write_export_instance(writer, export_instances[2], self.docs)

        with ExportFile(writer.path, writer.format) as export:
            self.assertEqual(
                json.loads(export.read()),
                {
                    'My table': {
                        'headers': ['Q3'],
                        'rows': [['baz'], ['bop']],
                    },
                    'Export2-My other table': {
                        'headers': ['Q4'],
                        'rows': [['bar'], ['boop']],
                    },
                    'Export3-My other table': {
                        'headers': ['Q4'],
                        'rows': [['bar'], ['boop']],
                    },
                }
            )

        self.assertTrue(export_save.called)

def get_export_files(self, format='', previous_export_id=None, filter=None,
                     use_cache=True, max_column_size=2000, separator='|',
                     process=None, **kwargs):
    # the APIs of how these methods are broken down suck, but at least
    # it's DRY
    from couchexport.export import get_writer, get_export_components, get_headers, get_formatted_rows
    from django.core.cache import cache
    import hashlib

    export_tag = self.index

    CACHE_TIME = 1 * 60 * 60  # cache for 1 hour, in seconds

    def _build_cache_key(tag, prev_export_id, format, max_column_size):
        def _human_readable_key(tag, prev_export_id, format, max_column_size):
            return "couchexport_:%s:%s:%s:%s" % (tag, prev_export_id, format, max_column_size)
        return hashlib.md5(
            _human_readable_key(
                tag, prev_export_id, format,
                max_column_size).encode('utf-8')).hexdigest()

    # check cache, only supported for filterless queries, currently
    cache_key = _build_cache_key(export_tag, previous_export_id, format, max_column_size)
    if use_cache and filter is None:
        cached_data = cache.get(cache_key)
        if cached_data:
            (tmp, checkpoint) = cached_data
            return ExportFiles(tmp, checkpoint)

    fd, path = tempfile.mkstemp()
    with os.fdopen(fd, 'wb') as tmp:
        schema_index = export_tag
        config, updated_schema, export_schema_checkpoint = get_export_components(
            schema_index, previous_export_id, filter)
        if config:
            writer = get_writer(format)

            # get cleaned up headers
            formatted_headers = self.remap_tables(
                get_headers(updated_schema, separator=separator))
            writer.open(formatted_headers, tmp, max_column_size=max_column_size)

            total_docs = len(config.potentially_relevant_ids)
            if process:
                DownloadBase.set_progress(process, 0, total_docs)
            for i, doc in config.enum_docs():
                if self.transform:
                    doc = self.transform(doc)
                writer.write(
                    self.remap_tables(
                        get_formatted_rows(doc, updated_schema,
                                           include_headers=False,
                                           separator=separator)))
                if process:
                    DownloadBase.set_progress(process, i + 1, total_docs)
            writer.close()
        checkpoint = export_schema_checkpoint

    if checkpoint:
        if use_cache:
            cache.set(cache_key, (path, checkpoint), CACHE_TIME)
        return ExportFiles(path, checkpoint)

    return None

def export_target(self):
    writer = get_writer(self.export_format)
    return writer.target_app