def submit_csv(username, xform, csv_file, overwrite=False):
    """Imports CSV data to an existing form

    Takes a csv formatted file or string containing rows of
    submission/instance and converts those to xml submissions and finally
    submits them by calling
    :py:func:`onadata.libs.utils.logger_tools.safe_create_instance`

    :param str username: the submission user
    :param onadata.apps.logger.models.XForm xform: The submission's XForm.
    :param (str or file) csv_file: A CSV formatted file with submission rows.
    :param bool overwrite: if True, soft-delete all existing instances of
        the form before importing.
    :return: If successful, a dict with import summary else dict with
        error str.
    :rtype: Dict
    """
    # Up-front structural validation of the CSV against the form; bail out
    # early with the validator's error message on failure.
    csv_file_validation_summary = validate_csv_file(csv_file, xform)

    if csv_file_validation_summary.get('valid'):
        additional_col = csv_file_validation_summary.get('additional_col')
    else:
        return async_status(
            FAILED,
            csv_file_validation_summary.get('error_msg')
        )

    # Total data rows (minus the header) — used only for progress reporting.
    num_rows = sum(1 for row in csv_file) - 1

    # Change stream position to start of file
    csv_file.seek(0)

    csv_reader = ucsv.DictReader(csv_file, encoding='utf-8-sig')
    xform_json = json.loads(xform.json)
    select_multiples = [
        qstn.name for qstn in
        xform.get_survey_elements_of_type(MULTIPLE_SELECT_TYPE)]
    ona_uuid = {'formhub': {'uuid': xform.uuid}}
    additions = duplicates = inserts = 0
    rollback_uuids = []  # uuids of created instances, deleted on failure
    errors = {}  # row number -> validation error for that row

    # Retrieve the columns we should validate values for
    # Currently validating date, datetime, integer and decimal columns
    # Each entry maps a type name to (column names, parser/caster callable).
    col_to_validate = {
        'date': (get_columns_by_type(XLS_DATE_FIELDS, xform_json), parse),
        'datetime': (
            get_columns_by_type(XLS_DATETIME_FIELDS, xform_json), parse),
        'integer': (get_columns_by_type(['integer'], xform_json), int),
        'decimal': (get_columns_by_type(['decimal'], xform_json), float)
    }

    if overwrite:
        # Soft-delete every active instance of the form and notify listeners.
        instance_ids = [i['id'] for i in xform.instances.values('id')]
        xform.instances.filter(deleted_at__isnull=True)\
            .update(deleted_at=timezone.now(),
                    deleted_by=User.objects.get(username=username))
        # send message
        send_message(
            instance_id=instance_ids, target_id=xform.id,
            target_type=XFORM,
            user=User.objects.get(username=username),
            message_verb=SUBMISSION_DELETED)

    try:
        for row_no, row in enumerate(csv_reader):
            # Remove additional columns
            for index in additional_col:
                del row[index]

            # Remove 'n/a' and '' values from csv
            row = {k: v for (k, v) in row.items() if v not in [NA_REP, '']}

            row, error = validate_row(row, col_to_validate)

            if error:
                errors[row_no] = error

            # Only continue the process if no errors were encountered while
            # validating the data; once any row fails validation, later rows
            # are still validated but no longer submitted.
            if not errors:
                location_data = {}

                for key in list(row):
                    # Collect row location data into separate location_data
                    # dict
                    if key.endswith(('.latitude', '.longitude', '.altitude',
                                     '.precision')):
                        location_key, location_prop = key.rsplit(u'.', 1)
                        location_data.setdefault(location_key, {}).update({
                            location_prop: row.get(key, '0')
                        })

                # collect all location K-V pairs into single geopoint field(s)
                # in location_data dict; missing components render as ''
                # via the defaultdict.
                for location_key in list(location_data):
                    location_data.update({
                        location_key: (u'%(latitude)s %(longitude)s '
                                       '%(altitude)s %(precision)s') %
                        defaultdict(
                            lambda: '', location_data.get(location_key))
                    })

                nested_dict = csv_dict_to_nested_dict(
                    row, select_multiples=select_multiples)
                row = flatten_split_select_multiples(
                    nested_dict, select_multiples=select_multiples)
                location_data = csv_dict_to_nested_dict(location_data)
                # Merge location_data into the Row data
                row = dict_merge(row, location_data)

                submission_time = datetime.utcnow().isoformat()
                # Prefer the explicit meta/instanceID column, then the _uuid
                # export column (re-prefixed with 'uuid:').
                row_uuid = row.get('meta/instanceID') or 'uuid:{}'.format(
                    row.get(UUID)) if row.get(UUID) else None
                submitted_by = row.get('_submitted_by')
                submission_date = row.get('_submission_time', submission_time)

                for key in list(row):
                    # remove metadata (keys starting with '_')
                    if key.startswith('_'):
                        del row[key]

                # Inject our forms uuid into the submission
                row.update(ona_uuid)

                old_meta = row.get('meta', {})
                new_meta, update = get_submission_meta_dict(xform, row_uuid)
                # `update` is truthy when this row edits an existing instance.
                inserts += update
                old_meta.update(new_meta)
                row.update({'meta': old_meta})

                row_uuid = row.get('meta').get('instanceID')
                rollback_uuids.append(row_uuid.replace('uuid:', ''))

                try:
                    xml_file = BytesIO(
                        dict2xmlsubmission(
                            row, xform, row_uuid, submission_date))

                    try:
                        error, instance = safe_create_instance(
                            username, xml_file, [], xform.uuid, None)
                    except ValueError as e:
                        error = e

                    if error:
                        # 202 (duplicate) responses are tolerated; anything
                        # else aborts the import and rolls back.
                        if not (isinstance(error, OpenRosaResponse)
                                and error.status_code == 202):
                            Instance.objects.filter(
                                uuid__in=rollback_uuids,
                                xform=xform).delete()
                            return async_status(FAILED, text(error))
                        else:
                            duplicates += 1
                    else:
                        additions += 1

                        # Periodically publish task progress for async runs.
                        if additions % PROGRESS_BATCH_UPDATE == 0:
                            try:
                                current_task.update_state(
                                    state='PROGRESS',
                                    meta={
                                        'progress': additions,
                                        'total': num_rows,
                                        'info': additional_col
                                    })
                            except Exception:
                                logging.exception(
                                    _(u'Could not update state of '
                                      'import CSV batch process.'))
                            finally:
                                xform.submission_count(True)

                        # Attribute the instance to the original submitter
                        # when the CSV carried a '_submitted_by' column.
                        users = User.objects.filter(
                            username=submitted_by) if submitted_by else []
                        if users:
                            instance.user = users[0]
                            instance.save()
                except Exception as e:
                    return failed_import(rollback_uuids, xform, e, text(e))
    except UnicodeDecodeError as e:
        return failed_import(rollback_uuids, xform, e,
                             'CSV file must be utf-8 encoded')

    if errors:
        # Rollback all created instances if an error occurred during
        # validation
        Instance.objects.filter(
            uuid__in=rollback_uuids, xform=xform).delete()
        xform.submission_count(True)

        return async_status(
            FAILED,
            u'Invalid CSV data imported in row(s): {}'.format(
                errors) if errors else ''
        )
    else:
        # Import succeeded: emit analytics events for creates and updates.
        added_submissions = additions - inserts
        event_by = User.objects.get(username=username)
        event_name = None
        tracking_properties = {
            'xform_id': xform.pk,
            'project_id': xform.project.pk,
            'submitted_by': event_by,
            'label': f'csv-import-for-form-{xform.pk}',
            'from': 'CSV Import',
        }
        if added_submissions > 0:
            tracking_properties['value'] = added_submissions
            event_name = INSTANCE_CREATE_EVENT
            analytics.track(
                event_by, event_name, properties=tracking_properties)

        if inserts > 0:
            tracking_properties['value'] = inserts
            event_name = INSTANCE_UPDATE_EVENT
            analytics.track(
                event_by, event_name, properties=tracking_properties)

        return {
            'additions': added_submissions,
            'duplicates': duplicates,
            'updates': inserts,
            'info': "Additional column(s) excluded from the upload: '{0}'."
                    .format(', '.join(list(additional_col)))}
def submit_csv(username, xform, csv_file, overwrite=False):
    """Imports CSV data to an existing form

    Takes a csv formatted file or string containing rows of
    submission/instance and converts those to xml submissions and finally
    submits them by calling
    :py:func:`onadata.libs.utils.logger_tools.safe_create_instance`

    :param str username: the submission user
    :param onadata.apps.logger.models.XForm xform: The submission's XForm.
    :param (str or file) csv_file: A CSV formatted file with submission rows.
    :param bool overwrite: if True, soft-delete all existing instances of
        the form before importing.
    :return: If successful, a dict with import summary else dict with
        error str.
    :rtype: Dict
    """
    if isinstance(csv_file, str):
        # BUGFIX: BytesIO requires bytes on Python 3; encode unicode input.
        csv_file = BytesIO(csv_file.encode('utf-8'))
    elif csv_file is None or not hasattr(csv_file, 'read'):
        return async_status(FAILED, (
            u'Invalid param type for `csv_file`. '
            'Expected utf-8 encoded file or unicode'
            ' string got {} instead.'.format(type(csv_file).__name__)))

    # Total data rows (minus the header) — used only for progress reporting.
    num_rows = sum(1 for row in csv_file) - 1
    csv_file.seek(0)

    csv_reader = ucsv.DictReader(csv_file, encoding='utf-8-sig')
    csv_header = csv_reader.fieldnames

    # check for spaces in headers
    if any(' ' in header for header in csv_header):
        return async_status(FAILED,
                            u'CSV file fieldnames should not contain spaces')

    # Get the data dictionary
    xform_header = xform.get_headers()

    missing_col = set(xform_header).difference(csv_header)
    addition_col = set(csv_header).difference(xform_header)

    # change to list
    missing_col = list(missing_col)
    addition_col = list(addition_col)
    # remove all metadata columns
    missing = [
        col for col in missing_col
        if not col.startswith("_") and col not in IGNORED_COLUMNS
    ]

    # remove all metadata inside groups
    missing = [col for col in missing if '/_' not in col]

    # ignore if is multiple select question
    for col in csv_header:
        # this col is a multiple select question
        survey_element = xform.get_survey_element(col)
        if survey_element and \
                survey_element.get('type') == MULTIPLE_SELECT_TYPE:
            # remove from the missing and additional list
            missing = [x for x in missing if not x.startswith(col)]
            addition_col.remove(col)

    # remove headers for repeats that might be missing from csv
    missing = sorted([m for m in missing if m.find('[') == -1])

    # Include additional repeats
    addition_col = [a for a in addition_col if a.find('[') == -1]

    if missing:
        return async_status(
            FAILED,
            u"Sorry uploaded file does not match the form. "
            u"The file is missing the column(s): "
            u"{0}.".format(', '.join(missing)))

    if overwrite:
        # Soft-delete every active instance of the form before importing.
        xform.instances.filter(deleted_at__isnull=True)\
            .update(deleted_at=timezone.now(),
                    deleted_by=User.objects.get(username=username))

    rollback_uuids = []  # uuids of created instances, deleted on failure
    submission_time = datetime.utcnow().isoformat()
    ona_uuid = {'formhub': {'uuid': xform.uuid}}
    error = None
    additions = duplicates = inserts = 0
    try:
        for row in csv_reader:
            # remove the additional columns
            for index in addition_col:
                del row[index]

            # fetch submission uuid before purging row metadata
            row_uuid = row.get('meta/instanceID') or row.get('_uuid')
            submitted_by = row.get('_submitted_by')
            submission_date = row.get('_submission_time', submission_time)

            location_data = {}
            for key in list(row):  # seems faster than a comprehension
                # remove metadata (keys starting with '_')
                if key.startswith('_'):
                    del row[key]

                # Collect row location data into separate location_data dict
                if key.endswith(
                        ('.latitude', '.longitude',
                         '.altitude', '.precision')):
                    location_key, location_prop = key.rsplit(u'.', 1)
                    location_data.setdefault(location_key, {}).update(
                        {location_prop: row.get(key, '0')})

                # remove 'n/a' values
                if not key.startswith('_') and row[key] == 'n/a':
                    del row[key]

            # collect all location K-V pairs into single geopoint field(s)
            # in location_data dict; missing components render as '' via
            # the defaultdict.
            for location_key in list(location_data):
                location_data.update({
                    location_key:
                    (u'%(latitude)s %(longitude)s '
                     '%(altitude)s %(precision)s') %
                    defaultdict(lambda: '', location_data.get(location_key))
                })

            row = csv_dict_to_nested_dict(row)
            location_data = csv_dict_to_nested_dict(location_data)

            row = dict_merge(row, location_data)
            # inject our form's uuid into the submission
            row.update(ona_uuid)

            old_meta = row.get('meta', {})
            new_meta, update = get_submission_meta_dict(xform, row_uuid)
            # `update` is truthy when this row edits an existing instance.
            inserts += update
            old_meta.update(new_meta)
            row.update({'meta': old_meta})

            row_uuid = row.get('meta').get('instanceID')
            rollback_uuids.append(row_uuid.replace('uuid:', ''))

            xml_file = BytesIO(
                dict2xmlsubmission(row, xform, row_uuid, submission_date))

            try:
                error, instance = safe_create_instance(username, xml_file, [],
                                                       xform.uuid, None)
            except ValueError as e:
                error = e

            if error:
                # 202 (duplicate) responses are tolerated; anything else
                # aborts the import and rolls back created instances.
                if not (isinstance(error, OpenRosaResponse)
                        and error.status_code == 202):
                    Instance.objects.filter(uuid__in=rollback_uuids,
                                            xform=xform).delete()
                    return async_status(FAILED, text(error))
                else:
                    duplicates += 1
            else:
                additions += 1
                # Periodically publish task progress for async runs.
                # (Removed leftover debug `print(current_task)`.)
                if additions % PROGRESS_BATCH_UPDATE == 0:
                    try:
                        current_task.update_state(state='PROGRESS',
                                                  meta={
                                                      'progress': additions,
                                                      'total': num_rows,
                                                      'info': addition_col
                                                  })
                    except Exception:
                        logging.exception(
                            _(u'Could not update state of '
                              'import CSV batch process.'))
                    finally:
                        xform.submission_count(True)

                # Attribute the instance to the original submitter when the
                # CSV carried a '_submitted_by' column.
                users = User.objects.filter(
                    username=submitted_by) if submitted_by else []
                if users:
                    instance.user = users[0]
                    instance.save()
    except UnicodeDecodeError as e:
        return failed_import(rollback_uuids, xform, e,
                             u'CSV file must be utf-8 encoded')
    except Exception as e:
        return failed_import(rollback_uuids, xform, e, text(e))
    finally:
        # Refresh the cached submission count whether or not import succeeded.
        xform.submission_count(True)

    return {
        "additions": additions - inserts,
        "duplicates": duplicates,
        u"updates": inserts,
        u"info": u"Additional column(s) excluded from the upload: '{0}'."
        .format(', '.join(list(addition_col)))
    }  # yapf: disable
def test_csv_repeat_field_to_dict(self):
    """Repeat-indexed CSV keys ('name[i]/child') nest into lists of dicts,
    and the nested result renders to the expected XML via dict2xml.

    Each case below follows the same pattern: `a` is the flat CSV dict,
    `b` the expected nested dict, `c` the actual conversion.
    NOTE(review): the expected-XML literals assume 2-space indentation per
    nesting level in dict2xml output — confirm against dict2xml.
    """
    # Single repeat, one field.
    a = {'repeat[1]/gender': 'female'}
    b = {'repeat': [{'gender': 'female'}]}
    c = csv_dict_to_nested_dict(a)
    self.assertDictEqual(c, b)
    self.assertEqual(
        dict2xml(c),
        """
<repeat>
  <gender>female</gender>
</repeat>
""".strip()
    )

    # Repeat nested inside a group.
    a = {'group/repeat[1]/gender': 'female'}
    b = {'group': {'repeat': [{'gender': 'female'}]}}
    c = csv_dict_to_nested_dict(a)
    self.assertDictEqual(c, b)
    self.assertEqual(
        dict2xml(c),
        """
<group>
  <repeat>
    <gender>female</gender>
  </repeat>
</group>
""".strip()
    )

    # Group inside a repeat inside a group.
    a = {'group/repeat[1]/groupb/gender': 'female'}
    b = {'group': {'repeat': [{'groupb': {'gender': 'female'}}]}}
    c = csv_dict_to_nested_dict(a)
    self.assertDictEqual(c, b)
    self.assertEqual(
        dict2xml(c),
        """
<group>
  <repeat>
    <groupb>
      <gender>female</gender>
    </groupb>
  </repeat>
</group>
""".strip()
    )

    # Repeat nested inside another repeat.
    a = {'repeata[1]/repeatb[1]/gender': 'female'}
    b = {'repeata': [{'repeatb': [{'gender': 'female'}]}]}
    c = csv_dict_to_nested_dict(a)
    self.assertDictEqual(c, b)
    self.assertEqual(
        dict2xml(c),
        """
<repeata>
  <repeatb>
    <gender>female</gender>
  </repeatb>
</repeata>
""".strip()
    )

    # Two fields in the same repeat instance.
    a = {
        'repeat[1]/gender': 'female',
        'repeat[1]/age': 10
    }
    b = {
        'repeat': [{
            'gender': 'female',
            'age': 10
        }]
    }
    c = csv_dict_to_nested_dict(a)
    self.assertDictEqual(c, b)
    self.assertEqual(
        dict2xml(c),
        """
<repeat>
  <age>10</age>
  <gender>female</gender>
</repeat>
""".strip()
    )

    # Two fields in a repeat inside a group.
    a = {
        'group/repeat[1]/gender': 'female',
        'group/repeat[1]/age': 10
    }
    b = {
        'group': {
            'repeat': [{
                'gender': 'female',
                'age': 10
            }]
        }
    }
    c = csv_dict_to_nested_dict(a)
    self.assertDictEqual(c, b)
    self.assertEqual(
        dict2xml(c),
        """
<group>
  <repeat>
    <age>10</age>
    <gender>female</gender>
  </repeat>
</group>
""".strip()
    )

    # Two fields in a group nested within a repeat within a group.
    a = {
        'group/repeat[1]/groupb/gender': 'female',
        'group/repeat[1]/groupb/age': 10
    }
    b = {
        'group': {
            'repeat': [{
                'groupb': {
                    'gender': 'female',
                    'age': 10
                }
            }]
        }
    }
    c = csv_dict_to_nested_dict(a)
    self.assertDictEqual(c, b)
    self.assertEqual(
        dict2xml(c),
        """
<group>
  <repeat>
    <groupb>
      <age>10</age>
      <gender>female</gender>
    </groupb>
  </repeat>
</group>
""".strip()
    )

    # Three fields in a doubly-nested repeat.
    a = {
        'repeata[1]/repeatb[1]/gender': 'female',
        'repeata[1]/repeatb[1]/name': 'Swan',
        'repeata[1]/repeatb[1]/age': 10
    }
    b = {
        'repeata': [{
            'repeatb': [{
                'gender': 'female',
                'name': 'Swan',
                'age': 10
            }]
        }]
    }
    c = csv_dict_to_nested_dict(a)
    self.assertDictEqual(c, b)
    self.assertEqual(
        dict2xml(c),
        """
<repeata>
  <repeatb>
    <age>10</age>
    <gender>female</gender>
    <name>Swan</name>
  </repeatb>
</repeata>
""".strip()
    )

    # Two instances of the same repeat.
    a = {
        'repeat[1]/gender': 'female',
        'repeat[2]/gender': 'male'
    }
    b = {
        'repeat': [{
            'gender': 'female',
        }, {
            'gender': 'male',
        }]
    }
    c = csv_dict_to_nested_dict(a)
    self.assertDictEqual(c, b)
    self.assertEqual(
        dict2xml(c),
        """
<repeat>
  <gender>female</gender>
</repeat>
<repeat>
  <gender>male</gender>
</repeat>
""".strip()
    )

    # Two repeat instances with different field sets.
    a = {
        'repeat[1]/gender': 'female',
        'repeat[1]/age': 10,
        'repeat[2]/gender': 'male'
    }
    b = {
        'repeat': [{
            'gender': 'female',
            'age': 10
        }, {
            'gender': 'male',
        }]
    }
    c = csv_dict_to_nested_dict(a)
    self.assertDictEqual(c, b)
    self.assertEqual(
        dict2xml(c),
        """
<repeat>
  <age>10</age>
  <gender>female</gender>
</repeat>
<repeat>
  <gender>male</gender>
</repeat>
""".strip()
    )

    # Same repeat name at two different paths (inside and outside a group).
    a = {
        'group/repeat[1]/gender': 'female',
        'group/repeat[1]/age': 10,
        'repeat[1]/gender': 'male'
    }
    b = {
        'group': {
            'repeat': [{
                'gender': 'female',
                'age': 10
            }]
        },
        'repeat': [{
            'gender': 'male',
        }]
    }
    c = csv_dict_to_nested_dict(a)
    self.assertDictEqual(c, b)
    self.assertEqual(
        dict2xml(c),
        """
<group>
  <repeat>
    <age>10</age>
    <gender>female</gender>
  </repeat>
</group>
<repeat>
  <gender>male</gender>
</repeat>
""".strip()
    )

    # Two instances of a repeat inside a group.
    a = {
        'group/repeat[1]/gender': 'female',
        'group/repeat[1]/age': 10,
        'group/repeat[2]/gender': 'male'
    }
    b = {
        'group': {
            'repeat': [{
                'gender': 'female',
                'age': 10
            }, {
                'gender': 'male',
            }]
        }
    }
    c = csv_dict_to_nested_dict(a)
    self.assertDictEqual(c, b)
    self.assertEqual(
        dict2xml(c),
        """
<group>
  <repeat>
    <age>10</age>
    <gender>female</gender>
  </repeat>
  <repeat>
    <gender>male</gender>
  </repeat>
</group>
""".strip()
    )

    # Multiple instances at both repeat levels, with a group leaf.
    a = {
        'repeata[1]/repeat[1]/groupb/gender': 'female',
        'repeata[1]/repeat[1]/groupb/age': 10,
        'repeata[1]/repeat[2]/groupb/gender': 'male',
        'repeata[2]/repeat[1]/groupb/gender': 'male'
    }
    b = {
        'repeata': [{
            'repeat': [{
                'groupb': {
                    'gender': 'female',
                    'age': 10
                }
            }, {
                'groupb': {
                    'gender': 'male',
                }
            }]
        }, {
            'repeat': [{
                'groupb': {
                    'gender': 'male',
                }
            }]
        }]
    }
    c = csv_dict_to_nested_dict(a)
    self.assertDictEqual(c, b)
    self.assertEqual(
        dict2xml(c),
        """
<repeata>
  <repeat>
    <groupb>
      <age>10</age>
      <gender>female</gender>
    </groupb>
  </repeat>
  <repeat>
    <groupb>
      <gender>male</gender>
    </groupb>
  </repeat>
</repeata>
<repeata>
  <repeat>
    <groupb>
      <gender>male</gender>
    </groupb>
  </repeat>
</repeata>
""".strip()
    )

    # Two repeat instances inside a group, each wrapping a group leaf.
    a = {
        'group/repeat[1]/groupb/gender': 'female',
        'group/repeat[1]/groupb/age': 10,
        'group/repeat[2]/groupb/gender': 'male'
    }
    b = {
        'group': {
            'repeat': [{
                'groupb': {
                    'gender': 'female',
                    'age': 10
                }
            }, {
                'groupb': {
                    'gender': 'male',
                }
            }]
        }
    }
    c = csv_dict_to_nested_dict(a)
    self.assertDictEqual(c, b)
    self.assertEqual(
        dict2xml(c),
        """
<group>
  <repeat>
    <groupb>
      <age>10</age>
      <gender>female</gender>
    </groupb>
  </repeat>
  <repeat>
    <groupb>
      <gender>male</gender>
    </groupb>
  </repeat>
</group>
""".strip()
    )