def build_form_multimedia_zip(
        domain,
        export_id,
        datespan,
        user_types,
        download_id,
        owner_id,
):
    from corehq.apps.export.models import FormExportInstance
    export = FormExportInstance.get(export_id)
    form_ids = get_form_ids_having_multimedia(
        domain, export.app_id, export.xmlns, datespan, user_types
    )
    forms_info = _get_form_attachment_info(domain, form_ids, export)

    num_forms = len(forms_info)
    DownloadBase.set_progress(build_form_multimedia_zip, 0, num_forms)

    all_case_ids = set.union(*(info['case_ids'] for info in forms_info)) if forms_info else set()
    case_id_to_name = _get_case_names(domain, all_case_ids)

    with TransientTempfile() as temp_path:
        # _write_attachments_to_file writes to the path itself, so no write
        # handle is needed here
        _write_attachments_to_file(temp_path, num_forms, forms_info, case_id_to_name)
        with open(temp_path, 'rb') as f:
            zip_name = 'multimedia-{}'.format(unidecode(export.name))
            _save_and_expose_zip(f, zip_name, domain, download_id, owner_id)

    DownloadBase.set_progress(build_form_multimedia_zip, num_forms, num_forms)

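# Every snippet in this file leans on the same TransientTempfile idiom: write
# the payload to a named path, reopen that path for reading, and let the
# context manager delete the file on exit. A minimal stdlib-only sketch of
# such a context manager (illustrative only; not the actual corehq
# implementation):
import os
import tempfile
from contextlib import contextmanager


@contextmanager
def transient_tempfile():
    fd, path = tempfile.mkstemp()
    os.close(fd)  # callers reopen by path, so the raw descriptor is not needed
    try:
        yield path
    finally:
        os.remove(path)
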
def _reconcile_es_data(data_type, metric, blob_parent_id, start=None, end=None, republish=True):
    today = date.today()
    if not start:
        two_days_ago = today - timedelta(days=2)
        start = two_days_ago.isoformat()
    with TransientTempfile() as file_path:
        with open(file_path, 'w') as output_file:
            call_command('stale_data_in_es', data_type, start=start, end=end, stdout=output_file)
        with open(file_path, 'r') as f:
            reader = csv.reader(f, delimiter='\t')
            next(reader)  # skip the header row
            counts_by_domain = defaultdict(int)
            for line in reader:
                domain = line[3]
                counts_by_domain[domain] += 1
        if counts_by_domain:
            for domain, count in counts_by_domain.items():
                metrics_counter(metric, count, tags={'domain': domain})
        else:
            metrics_counter(metric, 0)
        if republish:
            call_command('republish_doc_changes', file_path, skip_domains=True)
        with open(file_path, 'rb') as f:
            blob_db = get_blob_db()
            key = f'{blob_parent_id}_{today.isoformat()}'
            six_years = 60 * 24 * 365 * 6  # blob timeout is given in minutes
            blob_db.put(
                f,
                type_code=CODES.tempfile,
                domain='<unknown>',
                parent_id=blob_parent_id,
                key=key,
                timeout=six_years,
            )

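# The loop above assumes the stale_data_in_es TSV puts the domain in the
# fourth column. An equivalent tally using collections.Counter — a sketch
# under that same column-layout assumption; count_rows_by_domain is a
# hypothetical helper, not corehq code:
import csv
from collections import Counter


def count_rows_by_domain(tsv_path, domain_column=3):
    with open(tsv_path, 'r') as f:
        reader = csv.reader(f, delimiter='\t')
        next(reader)  # skip the header row
        return Counter(row[domain_column] for row in reader)
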
def assert_instance_gives_results(docs, export_instance, expected_result):
    with TransientTempfile() as temp_path:
        writer = get_export_writer([export_instance], temp_path)
        with writer.open([export_instance]):
            write_export_instance(writer, export_instance, docs)

        with ExportFile(writer.path, writer.format) as export:
            assert json.loads(export.read()) == expected_result

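# Hypothetical usage, assuming a JSON-format export instance with a single
# "My table" table; the expected shape matches the writer output asserted in
# test_multiple_write_export_instance_calls below:
#
#     assert_instance_gives_results(
#         docs,
#         export_instance,
#         {'My table': {'headers': ['Q3'], 'rows': [['baz'], ['bop']]}},
#     )
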
def build_export_json(self, query_master=False):
    with TransientTempfile() as temp_path:
        with open(temp_path, 'w+b') as f:
            self._write_data(f, query_master)
            f.seek(0)  # rewind so the blob store reads from the start
            blob_ref, _ = IcdsFile.objects.get_or_create(blob_id=self._blob_id(), data_type='disha_dumps')
            blob_ref.store_file_in_blobdb(f, expired=DISHA_DUMP_EXPIRY)
            blob_ref.save()

def rebuild_export(export_instance, filters=None):
    """
    Rebuild the given daily saved ExportInstance
    """
    filters = filters or export_instance.get_filters()
    with TransientTempfile() as temp_path:
        export_file = get_export_file([export_instance], filters or [], temp_path)
        with export_file as payload:
            save_export_payload(export_instance, payload)

def populate_export_download_task(export_instances, filters, download_id, filename=None, expiry=10 * 60):
    """
    :param expiry: Time period for the export to be available for download in minutes
    """
    domain = export_instances[0].domain
    with TransientTempfile() as temp_path, datadog_track_errors('populate_export_download_task'):
        export_file = get_export_file(
            export_instances,
            filters,
            temp_path,
            # We don't have a great way to calculate progress if it's a bulk download,
            # so only track the progress for single instance exports.
            progress_tracker=populate_export_download_task if len(export_instances) == 1 else None
        )

        file_format = Format.from_format(export_file.format)
        filename = filename or export_instances[0].name

        with export_file as file_:
            db = get_blob_db()
            db.put(
                file_,
                domain=domain,
                parent_id=domain,
                type_code=CODES.data_export,
                key=download_id,
                timeout=expiry,  # blob timeout is in minutes
            )

            expose_blob_download(
                download_id,
                expiry=expiry * 60,  # expose_blob_download expiry is in seconds
                mimetype=file_format.mimetype,
                content_disposition=safe_filename_header(filename, file_format.extension),
                download_id=download_id,
            )

    email_requests = EmailExportWhenDoneRequest.objects.filter(
        domain=domain,
        download_id=download_id
    )
    for email_request in email_requests:
        try:
            couch_user = CouchUser.get_by_user_id(email_request.user_id, domain=domain)
        except CouchUser.AccountTypeError:
            pass
        else:
            if couch_user is not None:
                process_email_request(domain, download_id, couch_user.get_email())
    email_requests.delete()

def rebuild_export(export_instance, progress_tracker):
    """
    Rebuild the given daily saved ExportInstance
    """
    filters = export_instance.get_filters() or []
    es_filters = [f.to_es_filter() for f in filters]
    with TransientTempfile() as temp_path:
        export_file = get_export_file([export_instance], es_filters, temp_path, progress_tracker)
        with export_file as payload:
            save_export_payload(export_instance, payload)

def iter_export_docs():
    with TransientTempfile() as temp_path:
        with open(temp_path, 'w', encoding='utf-8') as f:
            for doc_id in scroll_result:
                f.write(doc_id + '\n')

        # Stream doc ids from disk and fetch documents from ES in chunks
        with open(temp_path, 'r', encoding='utf-8') as f:
            doc_ids = (doc_id.strip() for doc_id in f)
            for doc in iter_es_docs(query.index, doc_ids):
                yield doc

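# iter_es_docs above is assumed to fetch documents in chunks as it consumes
# the doc-id generator. A minimal sketch of that kind of chunking, using only
# itertools; iter_in_chunks is a hypothetical helper, not corehq code:
from itertools import islice


def iter_in_chunks(iterable, chunk_size=100):
    iterator = iter(iterable)
    while True:
        chunk = list(islice(iterator, chunk_size))
        if not chunk:
            return
        yield chunk
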
def _generate_incremental_export(incremental_export):
    export_instance = incremental_export.export_instance
    export_instance.export_format = Format.UNZIPPED_CSV  # force to unzipped CSV
    checkpoint = incremental_export.last_valid_checkpoint

    # Remove the date period from the ExportInstance, since this is added
    # automatically by Daily Saved exports
    export_instance.filters.date_period = None
    filters = export_instance.get_filters()
    if checkpoint:
        filters.append(ServerModifiedOnRangeFilter(gt=checkpoint.last_doc_date))

    class LastDocTracker:
        def __init__(self, doc_iterator):
            self.doc_iterator = doc_iterator
            self.last_doc = None
            self.doc_count = 0

        def __iter__(self):
            for doc in self.doc_iterator:
                self.last_doc = doc
                self.doc_count += 1
                yield doc

    with TransientTempfile() as temp_path, metrics_track_errors('generate_incremental_exports'):
        writer = get_export_writer([export_instance], temp_path, allow_pagination=False)
        with writer.open([export_instance]):
            query = _get_export_query(export_instance, filters)
            query = query.sort('server_modified_on')  # reset sort to this instead of opened_on

            docs = LastDocTracker(query.run().hits)
            write_export_instance(writer, export_instance, docs)

        export_file = ExportFile(writer.path, writer.format)

        if docs.doc_count <= 0:
            return

        new_checkpoint = incremental_export.checkpoint(
            docs.doc_count, docs.last_doc.get('server_modified_on')
        )

        with export_file as file_:
            db = get_blob_db()
            db.put(
                file_,
                domain=incremental_export.domain,
                parent_id=new_checkpoint.blob_parent_id,
                type_code=CODES.data_export,
                key=str(new_checkpoint.blob_key),
                timeout=24 * 60,
            )
    return new_checkpoint

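# A self-contained sketch of the LastDocTracker idea from above: wrap any
# iterator so that, once the writer has drained it, the caller knows how many
# docs went through and which one came last. IterTracker and the sample docs
# are illustrative, not corehq code:
class IterTracker:
    def __init__(self, doc_iterator):
        self.doc_iterator = doc_iterator
        self.last_doc = None
        self.doc_count = 0

    def __iter__(self):
        for doc in self.doc_iterator:
            self.last_doc = doc
            self.doc_count += 1
            yield doc


tracked = IterTracker(iter([{'server_modified_on': '2020-01-01'},
                            {'server_modified_on': '2020-01-02'}]))
list(tracked)  # drain, the way write_export_instance drains the query hits
assert tracked.doc_count == 2
assert tracked.last_doc['server_modified_on'] == '2020-01-02'
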
def _generate_form_multimedia_zipfile(domain, export, form_ids, download_id, owner_id, task_name):
    forms_info = _get_form_attachment_info(domain, form_ids, export)

    num_forms = len(forms_info)
    DownloadBase.set_progress(task_name, 0, num_forms)

    all_case_ids = set.union(*(info['case_ids'] for info in forms_info)) if forms_info else set()
    case_id_to_name = _get_case_names(domain, all_case_ids)

    with TransientTempfile() as temp_path:
        # _write_attachments_to_file writes to the path itself, so no write
        # handle is needed here
        _write_attachments_to_file(temp_path, num_forms, forms_info, case_id_to_name)
        with open(temp_path, 'rb') as f:
            zip_name = 'multimedia-{}'.format(unidecode(export.name))
            _save_and_expose_zip(f, zip_name, domain, download_id, owner_id)

    DownloadBase.set_progress(task_name, num_forms, num_forms)

def generate_toggle_csv_download(self, tag, download_id, username):
    toggles = _get_toggles_with_tag(tag)
    total = _get_toggle_item_count(toggles)
    current_progress = [0]

    def increment_progress():
        current_progress[0] += 1
        DownloadBase.set_progress(self, current_progress[0], total)

    timeout_mins = 24 * 60  # 24 hours, in minutes
    with TransientTempfile() as temp_path:
        _write_toggle_data(temp_path, toggles, increment_progress)

        with open(temp_path, 'rb') as file:
            db = get_blob_db()
            meta = db.put(
                file,
                domain="__system__",
                parent_id="__system__",
                type_code=CODES.tempfile,
                key=download_id,
                timeout=timeout_mins,
            )

    now = datetime.utcnow().strftime("%Y-%m-%d-%H-%M-%S")
    filename = f'{settings.SERVER_ENVIRONMENT}_toggle_export_{now}'
    expose_blob_download(
        download_id,
        expiry=timeout_mins * 60,  # expose_blob_download expiry is in seconds
        content_disposition=safe_filename_header(filename, ".csv"),
        download_id=download_id,
    )

    user = CouchUser.get_by_username(username)
    if user:
        url = absolute_reverse("retrieve_download", args=[download_id])
        url += "?get_file"
        valid_until = meta.expires_on.replace(tzinfo=pytz.UTC).strftime(USER_DATETIME_FORMAT)
        send_HTML_email(
            "Feature Flag download ready",
            user.get_email(),
            html_content=inspect.cleandoc(f"""
            Download URL: {url}
            Download Valid until: {valid_until}
            """),
        )

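# The current_progress = [0] list above is the pre-`nonlocal` idiom for
# mutating a counter from an enclosed function. An equivalent sketch using
# `nonlocal`; make_progress_callback is hypothetical, not corehq code:
def make_progress_callback(set_progress, total):
    current = 0

    def increment():
        nonlocal current
        current += 1
        set_progress(current, total)

    return increment
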
def test_multi_table_order(self):
    tables = [
        TableConfiguration(
            label="My table {}".format(i),
            selected=True,
            path=[],
            columns=[
                ExportColumn(
                    label="Q{}".format(i),
                    item=ScalarItem(
                        path=[PathNode(name='form'), PathNode(name='q{}'.format(i))],
                    ),
                    selected=True,
                ),
            ]
        )
        for i in range(10)
    ]
    export_instance = FormExportInstance(
        export_format=Format.HTML,
        tables=tables
    )
    docs = [
        {
            'domain': 'my-domain',
            '_id': '1234',
            "form": {'q{}'.format(i): 'value {}'.format(i) for i in range(10)}
        }
    ]
    with TransientTempfile() as temp_path:
        writer = get_export_writer([export_instance], temp_path)
        with writer.open([export_instance]):
            write_export_instance(writer, export_instance, docs)
        with ExportFile(writer.path, writer.format) as export:
            exported_tables = [table for table in re.findall('<table>', export.read())]

    expected_tables = [t.label for t in tables]
    self.assertEqual(len(expected_tables), len(exported_tables))

def test_file_content(self, disha_get_rows_mock):

    class CountableList(list):
        # stands in for a QuerySet, which exposes a count() method
        def count(self, *args, **kwargs):
            return len(self)

    data = [['a'], ['b'], ["d\xc3\xa9f"]]
    disha_get_rows_mock.return_value = CountableList(data)

    month = date(2018, 8, 1)
    state = 'Andhra pradesh'
    with TransientTempfile() as temp_path:
        dump = DishaDump(state, month)
        with open(temp_path, 'w+b') as f:
            dump._write_data_in_chunks(f)
        with open(temp_path, 'r', encoding='utf-8') as f:
            expected_json = {
                'month': str(month),
                'state_name': state,
                'column_names': dump._get_columns(),
                'rows': data,
            }
            self.assertEqual(json.loads(f.read()), expected_json)

def get_export_json(export_instance):
    with TransientTempfile() as temp_path:
        export_file = get_export_file([export_instance], [], temp_path)

        with export_file as export:
            return json.loads(export.read())

def test_simple_bulk_export(self, export_save):
    with TransientTempfile() as temp_path:
        export_file = get_export_file(
            [
                CaseExportInstance(
                    export_format=Format.JSON,
                    domain=DOMAIN,
                    case_type=DEFAULT_CASE_TYPE,
                    tables=[TableConfiguration(
                        selected=True,
                        label="My table",
                        path=MAIN_TABLE,
                        columns=[
                            ExportColumn(
                                label="Foo column",
                                item=ExportItem(path=[PathNode(name="foo")]),
                                selected=True,
                            ),
                        ]
                    )]
                ),
                CaseExportInstance(
                    export_format=Format.JSON,
                    domain=DOMAIN,
                    case_type=DEFAULT_CASE_TYPE,
                    tables=[TableConfiguration(
                        label="My table",
                        selected=True,
                        path=MAIN_TABLE,
                        columns=[
                            ExportColumn(
                                label="Bar column",
                                item=ExportItem(path=[PathNode(name="bar")]),
                                selected=True,
                            ),
                        ]
                    )]
                ),
            ],
            [],  # No filters
            temp_path,
        )
        expected = {
            'Export1-My table': {
                "A1": "Foo column",
                "A2": "apple",
                "A3": "apple",
                "A4": "apple",
            },
            "Export2-My table": {
                "A1": "Bar column",
                "A2": "banana",
                "A3": "banana",
                "A4": "banana",
            },
        }
        with export_file as export:
            wb = load_workbook(export)
            # get_sheet_names() is deprecated in newer openpyxl; wb.sheetnames is equivalent
            self.assertEqual(wb.get_sheet_names(), ["Export1-My table", "Export2-My table"])

            for sheet in expected.keys():
                for cell in expected[sheet].keys():
                    self.assertEqual(
                        wb[sheet][cell].value,
                        expected[sheet][cell],
                        'Sheet "{}", cell "{}" expected: "{}", got "{}"'.format(
                            sheet, cell, expected[sheet][cell], wb[sheet][cell].value
                        )
                    )
    self.assertTrue(export_save.called)

def populate_export_download_task(domain, export_ids, exports_type, username, es_filters, download_id,
                                  owner_id, filename=None, expiry=10 * 60):
    """
    :param expiry: Time period for the export to be available for download in minutes
    """
    email_requests = EmailExportWhenDoneRequest.objects.filter(
        domain=domain,
        download_id=download_id
    )
    if settings.STALE_EXPORT_THRESHOLD is not None and not email_requests.count():
        delay = get_task_time_to_start(populate_export_download_task.request.id)
        if delay.total_seconds() > settings.STALE_EXPORT_THRESHOLD:
            metrics_counter('commcare.exports.rejected_unfresh_export')
            raise RejectedStaleExport()

    export_instances = [
        get_export(exports_type, domain, export_id, username)
        for export_id in export_ids
    ]
    with TransientTempfile() as temp_path, metrics_track_errors('populate_export_download_task'):
        export_file = get_export_file(
            export_instances,
            es_filters,
            temp_path,
            # We don't have a great way to calculate progress if it's a bulk download,
            # so only track the progress for single instance exports.
            progress_tracker=populate_export_download_task if len(export_instances) == 1 else None
        )

        file_format = Format.from_format(export_file.format)
        filename = filename or export_instances[0].name

        with export_file as file_:
            db = get_blob_db()
            db.put(
                file_,
                domain=domain,
                parent_id=domain,
                type_code=CODES.data_export,
                key=download_id,
                timeout=expiry,  # blob timeout is in minutes
            )

            expose_blob_download(
                download_id,
                expiry=expiry * 60,  # expose_blob_download expiry is in seconds
                mimetype=file_format.mimetype,
                content_disposition=safe_filename_header(filename, file_format.extension),
                download_id=download_id,
                owner_ids=[owner_id],
            )

    for email_request in email_requests:
        try:
            couch_user = CouchUser.get_by_user_id(email_request.user_id, domain=domain)
        except CouchUser.AccountTypeError:
            pass
        else:
            if couch_user is not None:
                process_email_request(domain, download_id, couch_user.get_email())
    email_requests.delete()

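# The stale-export guard above rejects work that sat in the task queue too
# long. A generic sketch of the same guard, assuming the enqueue time was
# recorded at submission; reject_if_stale and enqueued_at are hypothetical,
# while metrics_counter and RejectedStaleExport come from the module above:
from datetime import datetime


def reject_if_stale(enqueued_at, threshold_seconds):
    delay = datetime.utcnow() - enqueued_at
    if delay.total_seconds() > threshold_seconds:
        metrics_counter('commcare.exports.rejected_unfresh_export')
        raise RejectedStaleExport()
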
def test_multiple_write_export_instance_calls(self, export_save):
    """
    Confirm that calling _write_export_instance() multiple times
    (as part of a bulk export) works as expected.
    """
    export_instances = [
        FormExportInstance(tables=[
            TableConfiguration(
                label="My table",
                selected=True,
                path=[],
                columns=[
                    ExportColumn(
                        label="Q3",
                        item=ScalarItem(
                            path=[PathNode(name='form'), PathNode(name='q3')],
                        ),
                        selected=True,
                    ),
                ]
            ),
        ]),
        FormExportInstance(tables=[
            TableConfiguration(
                label="My other table",
                selected=True,
                path=[PathNode(name="form", is_repeat=False), PathNode(name="q2", is_repeat=False)],
                columns=[
                    ExportColumn(
                        label="Q4",
                        item=ScalarItem(
                            path=[PathNode(name='form'), PathNode(name='q2'), PathNode(name='q4')],
                        ),
                        selected=True,
                    ),
                ]
            ),
        ]),
        FormExportInstance(tables=[
            TableConfiguration(
                label="My other table",
                selected=True,
                path=[PathNode(name="form", is_repeat=False), PathNode(name="q2", is_repeat=False)],
                columns=[
                    ExportColumn(
                        label="Q4",
                        item=ScalarItem(
                            path=[PathNode(name='form'), PathNode(name='q2'), PathNode(name='q4')],
                        ),
                        selected=True,
                    ),
                ]
            ),
        ]),
    ]

    with TransientTempfile() as temp_path:
        writer = _ExportWriter(get_writer(Format.JSON), temp_path)
        with writer.open(export_instances):
            write_export_instance(writer, export_instances[0], self.docs)
            write_export_instance(writer, export_instances[1], self.docs)
            write_export_instance(writer, export_instances[2], self.docs)

        with ExportFile(writer.path, writer.format) as export:
            self.assertEqual(
                json.loads(export.read()),
                {
                    'My table': {
                        'headers': ['Q3'],
                        'rows': [['baz'], ['bop']],
                    },
                    'Export2-My other table': {
                        'headers': ['Q4'],
                        'rows': [['bar'], ['boop']],
                    },
                    'Export3-My other table': {
                        'headers': ['Q4'],
                        'rows': [['bar'], ['boop']],
                    },
                }
            )
    self.assertTrue(export_save.called)