Example #1
def submit_csv(username, xform, csv_file, overwrite=False):
    """Imports CSV data to an existing form

    Takes a CSV-formatted file or string containing submission/instance rows,
    converts them to XML submissions and finally submits them by calling
    :py:func:`onadata.libs.utils.logger_tools.safe_create_instance`

    :param str username: the submission user
    :param onadata.apps.logger.models.XForm xform: The submission's XForm.
    :param (str or file) csv_file: A CSV formatted file with submission rows.
    :return: If successful, a dict with an import summary; otherwise a dict
        with an error string.
    :rtype: Dict
    """
    csv_file_validation_summary = validate_csv_file(csv_file, xform)

    if csv_file_validation_summary.get('valid'):
        additional_col = csv_file_validation_summary.get('additional_col')
    else:
        return async_status(
            FAILED,
            csv_file_validation_summary.get('error_msg')
        )

    num_rows = sum(1 for row in csv_file) - 1

    # Change stream position to start of file
    csv_file.seek(0)

    csv_reader = ucsv.DictReader(csv_file, encoding='utf-8-sig')
    xform_json = json.loads(xform.json)
    select_multiples = [
        qstn.name for qstn in
        xform.get_survey_elements_of_type(MULTIPLE_SELECT_TYPE)]
    ona_uuid = {'formhub': {'uuid': xform.uuid}}
    additions = duplicates = inserts = 0
    rollback_uuids = []
    errors = {}

    # Retrieve the columns we should validate values for
    # Currently validating date, datetime, integer and decimal columns
    col_to_validate = {
        'date': (get_columns_by_type(XLS_DATE_FIELDS, xform_json), parse),
        'datetime': (
            get_columns_by_type(XLS_DATETIME_FIELDS, xform_json), parse),
        'integer': (get_columns_by_type(['integer'], xform_json), int),
        'decimal': (get_columns_by_type(['decimal'], xform_json), float)
    }

    if overwrite:
        instance_ids = [i['id'] for i in xform.instances.values('id')]
        xform.instances.filter(deleted_at__isnull=True)\
            .update(deleted_at=timezone.now(),
                    deleted_by=User.objects.get(username=username))
        # send message
        send_message(
            instance_id=instance_ids, target_id=xform.id,
            target_type=XFORM, user=User.objects.get(username=username),
            message_verb=SUBMISSION_DELETED)

    try:
        for row_no, row in enumerate(csv_reader):
            # Remove additional columns
            for index in additional_col:
                del row[index]

            # Remove 'n/a' and '' values from csv
            row = {k: v for (k, v) in row.items() if v not in [NA_REP, '']}

            row, error = validate_row(row, col_to_validate)

            if error:
                errors[row_no] = error

            # Only continue the process if no errors were encountered while
            # validating the data
            if not errors:
                location_data = {}

                for key in list(row):
                    # Collect row location data into separate location_data
                    # dict
                    if key.endswith(('.latitude', '.longitude', '.altitude',
                                    '.precision')):
                        location_key, location_prop = key.rsplit(u'.', 1)
                        location_data.setdefault(location_key, {}).update({
                            location_prop:
                            row.get(key, '0')
                        })

                # collect all location K-V pairs into single geopoint field(s)
                # in location_data dict
                for location_key in list(location_data):
                    location_data.update({
                        location_key:
                        (u'%(latitude)s %(longitude)s '
                            '%(altitude)s %(precision)s') % defaultdict(
                            lambda: '', location_data.get(location_key))
                    })

                nested_dict = csv_dict_to_nested_dict(
                    row, select_multiples=select_multiples)
                row = flatten_split_select_multiples(
                    nested_dict, select_multiples=select_multiples)
                location_data = csv_dict_to_nested_dict(location_data)
                # Merge location_data into the Row data
                row = dict_merge(row, location_data)

                submission_time = datetime.utcnow().isoformat()
                # Parenthesised so a present 'meta/instanceID' is kept even
                # when the '_uuid' column is absent
                row_uuid = row.get('meta/instanceID') or (
                    'uuid:{}'.format(row.get(UUID))
                    if row.get(UUID) else None)
                submitted_by = row.get('_submitted_by')
                submission_date = row.get('_submission_time', submission_time)

                for key in list(row):
                    # remove metadata (keys starting with '_')
                    if key.startswith('_'):
                        del row[key]

                # Inject our form's uuid into the submission
                row.update(ona_uuid)

                old_meta = row.get('meta', {})
                new_meta, update = get_submission_meta_dict(xform, row_uuid)
                inserts += update
                old_meta.update(new_meta)
                row.update({'meta': old_meta})

                row_uuid = row.get('meta').get('instanceID')
                rollback_uuids.append(row_uuid.replace('uuid:', ''))

                try:
                    xml_file = BytesIO(
                        dict2xmlsubmission(
                            row, xform, row_uuid, submission_date))

                    try:
                        error, instance = safe_create_instance(
                            username, xml_file, [], xform.uuid, None)
                    except ValueError as e:
                        error = e

                    if error:
                        if not (isinstance(error, OpenRosaResponse)
                                and error.status_code == 202):
                            Instance.objects.filter(
                                uuid__in=rollback_uuids, xform=xform).delete()
                            return async_status(FAILED, text(error))
                        else:
                            duplicates += 1
                    else:
                        additions += 1

                        if additions % PROGRESS_BATCH_UPDATE == 0:
                            try:
                                current_task.update_state(
                                    state='PROGRESS',
                                    meta={
                                        'progress': additions,
                                        'total': num_rows,
                                        'info': additional_col
                                    })
                            except Exception:
                                logging.exception(
                                    _(u'Could not update state of '
                                        'import CSV batch process.'))
                            finally:
                                xform.submission_count(True)

                        users = User.objects.filter(
                            username=submitted_by) if submitted_by else []
                        if users:
                            instance.user = users[0]
                            instance.save()
                except Exception as e:
                    return failed_import(rollback_uuids, xform, e, text(e))
    except UnicodeDecodeError as e:
        return failed_import(rollback_uuids, xform, e,
                             'CSV file must be utf-8 encoded')

    if errors:
        # Rollback all created instances if an error occurred during
        # validation
        Instance.objects.filter(
            uuid__in=rollback_uuids, xform=xform).delete()
        xform.submission_count(True)
        return async_status(
            FAILED,
            u'Invalid CSV data imported in row(s): {}'.format(errors)
        )
    else:
        added_submissions = additions - inserts
        event_by = User.objects.get(username=username)
        event_name = None
        tracking_properties = {
            'xform_id': xform.pk,
            'project_id': xform.project.pk,
            'submitted_by': event_by,
            'label': f'csv-import-for-form-{xform.pk}',
            'from': 'CSV Import',
        }
        if added_submissions > 0:
            tracking_properties['value'] = added_submissions
            event_name = INSTANCE_CREATE_EVENT
            analytics.track(
                event_by, event_name, properties=tracking_properties)

        if inserts > 0:
            tracking_properties['value'] = inserts
            event_name = INSTANCE_UPDATE_EVENT
            analytics.track(
                event_by, event_name, properties=tracking_properties)

        return {
            'additions': added_submissions,
            'duplicates': duplicates,
            'updates': inserts,
            'info': "Additional column(s) excluded from the upload: '{0}'."
            .format(', '.join(list(additional_col)))}
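
A minimal usage sketch for the function above; the form lookup, the username
'demo' and the file name are hypothetical, and the CSV is assumed to be UTF-8
encoded with headers that match the form. On success submit_csv returns the
summary dict built at the end of the function; on failure it returns the dict
produced by async_status().

# Minimal usage sketch (hypothetical names; `xform` must be an existing
# onadata.apps.logger.models.XForm whose headers match the CSV file).
from onadata.apps.logger.models import XForm

xform = XForm.objects.get(id_string='my_form', user__username='demo')
with open('submissions.csv', 'rb') as csv_file:
    result = submit_csv('demo', xform, csv_file)

if 'additions' in result:
    print('added:', result['additions'],
          'updates:', result['updates'],
          'duplicates:', result['duplicates'])
else:
    # failure shape comes from async_status(FAILED, ...)
    print('import failed:', result)
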
Example #2
def submit_csv(username, xform, csv_file, overwrite=False):
    """Imports CSV data to an existing form

    Takes a CSV-formatted file or string containing submission/instance rows,
    converts them to XML submissions and finally submits them by calling
    :py:func:`onadata.libs.utils.logger_tools.safe_create_instance`

    :param str username: the submission user
    :param onadata.apps.logger.models.XForm xform: The submission's XForm.
    :param (str or file) csv_file: A CSV formatted file with submission rows.
    :return: If successful, a dict with an import summary; otherwise a dict
        with an error string.
    :rtype: Dict
    """
    if isinstance(csv_file, str):
        # BytesIO requires bytes, so encode a unicode string input as utf-8
        csv_file = BytesIO(csv_file.encode('utf-8'))
    elif csv_file is None or not hasattr(csv_file, 'read'):
        return async_status(
            FAILED,
            (u'Invalid param type for `csv_file`. '
             'Expected a utf-8 encoded file or unicode'
             ' string, got {} instead.'.format(type(csv_file).__name__)))

    num_rows = sum(1 for row in csv_file) - 1
    csv_file.seek(0)

    csv_reader = ucsv.DictReader(csv_file, encoding='utf-8-sig')
    csv_header = csv_reader.fieldnames

    # check for spaces in headers
    if any(' ' in header for header in csv_header):
        return async_status(FAILED,
                            u'CSV file fieldnames should not contain spaces')

    # Get the data dictionary
    xform_header = xform.get_headers()

    missing_col = set(xform_header).difference(csv_header)
    addition_col = set(csv_header).difference(xform_header)

    # change to list
    missing_col = list(missing_col)
    addition_col = list(addition_col)
    # remove all metadata columns
    missing = [
        col for col in missing_col
        if not col.startswith("_") and col not in IGNORED_COLUMNS
    ]

    # remove all metadata inside groups
    missing = [col for col in missing if '/_' not in col]

    # ignore if is multiple select question
    for col in csv_header:
        # this col is a multiple select question
        survey_element = xform.get_survey_element(col)
        if survey_element and \
                survey_element.get('type') == MULTIPLE_SELECT_TYPE:
            # remove from the missing and additional list
            missing = [x for x in missing if not x.startswith(col)]

            addition_col.remove(col)

    # remove headers for repeats that might be missing from csv
    missing = sorted([m for m in missing if m.find('[') == -1])

    # Include additional repeats
    addition_col = [a for a in addition_col if a.find('[') == -1]

    if missing:
        return async_status(
            FAILED, u"Sorry uploaded file does not match the form. "
            u"The file is missing the column(s): "
            u"{0}.".format(', '.join(missing)))

    if overwrite:
        xform.instances.filter(deleted_at__isnull=True)\
            .update(deleted_at=timezone.now(),
                    deleted_by=User.objects.get(username=username))

    rollback_uuids = []
    submission_time = datetime.utcnow().isoformat()
    ona_uuid = {'formhub': {'uuid': xform.uuid}}
    error = None
    additions = duplicates = inserts = 0
    try:
        for row in csv_reader:
            # remove the additional columns
            for index in addition_col:
                del row[index]

            # fetch submission uuid before purging row metadata
            row_uuid = row.get('meta/instanceID') or row.get('_uuid')
            submitted_by = row.get('_submitted_by')
            submission_date = row.get('_submission_time', submission_time)

            location_data = {}
            for key in list(row):  # seems faster than a comprehension
                # remove metadata (keys starting with '_')
                if key.startswith('_'):
                    del row[key]

                # Collect row location data into separate location_data dict
                if key.endswith(
                    ('.latitude', '.longitude', '.altitude', '.precision')):
                    location_key, location_prop = key.rsplit(u'.', 1)
                    location_data.setdefault(location_key, {}).update(
                        {location_prop: row.get(key, '0')})
                # remove 'n/a' values
                if not key.startswith('_') and row[key] == 'n/a':
                    del row[key]

            # collect all location K-V pairs into single geopoint field(s)
            # in location_data dict
            for location_key in list(location_data):
                location_data.update({
                    location_key: (u'%(latitude)s %(longitude)s '
                                   '%(altitude)s %(precision)s') %
                    defaultdict(lambda: '', location_data.get(location_key))
                })

            row = csv_dict_to_nested_dict(row)
            location_data = csv_dict_to_nested_dict(location_data)

            row = dict_merge(row, location_data)

            # inject our form's uuid into the submission
            row.update(ona_uuid)

            old_meta = row.get('meta', {})
            new_meta, update = get_submission_meta_dict(xform, row_uuid)
            inserts += update
            old_meta.update(new_meta)
            row.update({'meta': old_meta})

            row_uuid = row.get('meta').get('instanceID')
            rollback_uuids.append(row_uuid.replace('uuid:', ''))

            xml_file = BytesIO(
                dict2xmlsubmission(row, xform, row_uuid, submission_date))

            try:
                error, instance = safe_create_instance(username, xml_file, [],
                                                       xform.uuid, None)
            except ValueError as e:
                error = e

            if error:
                if not (isinstance(error, OpenRosaResponse)
                        and error.status_code == 202):
                    Instance.objects.filter(uuid__in=rollback_uuids,
                                            xform=xform).delete()
                    return async_status(FAILED, text(error))
                else:
                    duplicates += 1
            else:
                additions += 1
                if additions % PROGRESS_BATCH_UPDATE == 0:
                    try:
                        current_task.update_state(state='PROGRESS',
                                                  meta={
                                                      'progress': additions,
                                                      'total': num_rows,
                                                      'info': addition_col
                                                  })
                        print(current_task)
                    except Exception:
                        logging.exception(
                            _(u'Could not update state of '
                              'import CSV batch process.'))
                    finally:
                        xform.submission_count(True)

                users = User.objects.filter(
                    username=submitted_by) if submitted_by else []
                if users:
                    instance.user = users[0]
                    instance.save()

    except UnicodeDecodeError as e:
        return failed_import(rollback_uuids, xform, e,
                             u'CSV file must be utf-8 encoded')
    except Exception as e:
        return failed_import(rollback_uuids, xform, e, text(e))
    finally:
        xform.submission_count(True)

    return {
        "additions": additions - inserts,
        "duplicates": duplicates,
        u"updates": inserts,
        u"info": u"Additional column(s) excluded from the upload: '{0}'."
                 .format(', '.join(list(addition_col)))
    }  # yapf: disable
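
The header validation near the top of this variant is plain set arithmetic
between the form's expected headers and the CSV field names; a standalone
sketch of that comparison, using made-up headers:

# Standalone sketch of the header comparison used above (made-up headers).
xform_header = ['name', 'age', 'gps', 'meta/instanceID']
csv_header = ['name', 'age', 'photo']

missing_col = list(set(xform_header).difference(csv_header))
addition_col = list(set(csv_header).difference(xform_header))
# missing_col  -> ['gps', 'meta/instanceID'] (set order may vary)
# addition_col -> ['photo']
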
Example #3
    def test_csv_repeat_field_to_dict(self):
        a = {'repeat[1]/gender': 'female'}
        b = {'repeat': [{'gender': 'female'}]}
        c = csv_dict_to_nested_dict(a)

        self.assertDictEqual(c, b)
        self.assertEqual(
            dict2xml(c),
            """
<repeat>
  <gender>female</gender>
</repeat>
            """.strip()
        )

        a = {'group/repeat[1]/gender': 'female'}
        b = {'group': {'repeat': [{'gender': 'female'}]}}
        c = csv_dict_to_nested_dict(a)

        self.assertDictEqual(c, b)
        self.assertEqual(
            dict2xml(c),
            """
<group>
  <repeat>
    <gender>female</gender>
  </repeat>
</group>
            """.strip()
        )

        a = {'group/repeat[1]/groupb/gender': 'female'}
        b = {'group': {'repeat': [{'groupb': {'gender': 'female'}}]}}
        c = csv_dict_to_nested_dict(a)

        self.assertDictEqual(c, b)
        self.assertEqual(
            dict2xml(c),
            """
<group>
  <repeat>
    <groupb>
      <gender>female</gender>
    </groupb>
  </repeat>
</group>
            """.strip()
        )

        a = {'repeata[1]/repeatb[1]/gender': 'female'}
        b = {'repeata': [{'repeatb': [{'gender': 'female'}]}]}
        c = csv_dict_to_nested_dict(a)

        self.assertDictEqual(c, b)
        self.assertEqual(
            dict2xml(c),
            """
<repeata>
  <repeatb>
    <gender>female</gender>
  </repeatb>
</repeata>
            """.strip()
        )

        a = {
            'repeat[1]/gender': 'female',
            'repeat[1]/age': 10
        }
        b = {
            'repeat': [{
                'gender': 'female',
                'age': 10
            }]
        }
        c = csv_dict_to_nested_dict(a)

        self.assertDictEqual(c, b)
        self.assertEqual(
            dict2xml(c),
            """
<repeat>
  <age>10</age>
  <gender>female</gender>
</repeat>
            """.strip()
        )

        a = {
            'group/repeat[1]/gender': 'female',
            'group/repeat[1]/age': 10
        }
        b = {
            'group': {
                'repeat': [{
                    'gender': 'female',
                    'age': 10
                }]
            }
        }
        c = csv_dict_to_nested_dict(a)

        self.assertDictEqual(c, b)
        self.assertEqual(
            dict2xml(c),
            """
<group>
  <repeat>
    <age>10</age>
    <gender>female</gender>
  </repeat>
</group>
            """.strip()
        )

        a = {
            'group/repeat[1]/groupb/gender': 'female',
            'group/repeat[1]/groupb/age': 10
        }
        b = {
            'group': {
                'repeat': [{
                    'groupb': {
                        'gender': 'female',
                        'age': 10
                    }
                }]
            }
        }
        c = csv_dict_to_nested_dict(a)

        self.assertDictEqual(c, b)
        self.assertEqual(
            dict2xml(c),
            """
<group>
  <repeat>
    <groupb>
      <age>10</age>
      <gender>female</gender>
    </groupb>
  </repeat>
</group>
            """.strip()
        )

        a = {
            'repeata[1]/repeatb[1]/gender': 'female',
            'repeata[1]/repeatb[1]/name': 'Swan',
            'repeata[1]/repeatb[1]/age': 10
        }
        b = {
            'repeata': [{
                'repeatb': [{
                    'gender': 'female',
                    'name': 'Swan',
                    'age': 10
                }]
            }]
        }
        c = csv_dict_to_nested_dict(a)

        self.assertDictEqual(c, b)
        self.assertEqual(
            dict2xml(c),
            """
<repeata>
  <repeatb>
    <age>10</age>
    <gender>female</gender>
    <name>Swan</name>
  </repeatb>
</repeata>
            """.strip()
        )

        a = {
            'repeat[1]/gender': 'female',
            'repeat[2]/gender': 'male'
        }
        b = {
            'repeat': [{
                'gender': 'female',
            }, {
                'gender': 'male',
            }]
        }
        c = csv_dict_to_nested_dict(a)

        self.assertDictEqual(c, b)
        self.assertEqual(
            dict2xml(c),
            """
<repeat>
  <gender>female</gender>
</repeat>
<repeat>
  <gender>male</gender>
</repeat>
            """.strip()
        )

        a = {
            'repeat[1]/gender': 'female',
            'repeat[1]/age': 10,
            'repeat[2]/gender': 'male'
        }
        b = {
            'repeat': [{
                'gender': 'female',
                'age': 10
            }, {
                'gender': 'male',
            }]
        }
        c = csv_dict_to_nested_dict(a)

        self.assertDictEqual(c, b)
        self.assertEqual(
            dict2xml(c),
            """
<repeat>
  <age>10</age>
  <gender>female</gender>
</repeat>
<repeat>
  <gender>male</gender>
</repeat>
            """.strip()
        )

        a = {
            'group/repeat[1]/gender': 'female',
            'group/repeat[1]/age': 10,
            'repeat[1]/gender': 'male'
        }
        b = {
            'group': {
                'repeat': [{
                    'gender': 'female',
                    'age': 10
                }]
            },
            'repeat': [{
                'gender': 'male',
            }]
        }
        c = csv_dict_to_nested_dict(a)

        self.assertDictEqual(c, b)
        self.assertEqual(
            dict2xml(c),
            """
<group>
  <repeat>
    <age>10</age>
    <gender>female</gender>
  </repeat>
</group>
<repeat>
  <gender>male</gender>
</repeat>
            """.strip()
        )

        a = {
            'group/repeat[1]/gender': 'female',
            'group/repeat[1]/age': 10,
            'group/repeat[2]/gender': 'male'
        }
        b = {
            'group': {
                'repeat': [{
                    'gender': 'female',
                    'age': 10
                }, {
                    'gender': 'male',
                }]
            }
        }
        c = csv_dict_to_nested_dict(a)

        self.assertDictEqual(c, b)
        self.assertEqual(
            dict2xml(c),
            """
<group>
  <repeat>
    <age>10</age>
    <gender>female</gender>
  </repeat>
  <repeat>
    <gender>male</gender>
  </repeat>
</group>
            """.strip()
        )

        a = {
            'repeata[1]/repeat[1]/groupb/gender': 'female',
            'repeata[1]/repeat[1]/groupb/age': 10,
            'repeata[1]/repeat[2]/groupb/gender': 'male',
            'repeata[2]/repeat[1]/groupb/gender': 'male'
        }
        b = {
            'repeata': [{
                'repeat': [{
                    'groupb': {
                        'gender': 'female',
                        'age': 10
                    }
                }, {
                    'groupb': {
                        'gender': 'male',
                    }
                }]
            }, {
                'repeat': [{
                    'groupb': {
                        'gender': 'male',
                    }
                }]
            }]
        }
        c = csv_dict_to_nested_dict(a)

        self.assertDictEqual(c, b)
        self.assertEqual(
            dict2xml(c),
            """
<repeata>
  <repeat>
    <groupb>
      <age>10</age>
      <gender>female</gender>
    </groupb>
  </repeat>
  <repeat>
    <groupb>
      <gender>male</gender>
    </groupb>
  </repeat>
</repeata>
<repeata>
  <repeat>
    <groupb>
      <gender>male</gender>
    </groupb>
  </repeat>
</repeata>
            """.strip()
        )

        a = {
            'group/repeat[1]/groupb/gender': 'female',
            'group/repeat[1]/groupb/age': 10,
            'group/repeat[2]/groupb/gender': 'male'
        }
        b = {
            'group': {
                'repeat': [{
                    'groupb': {
                        'gender': 'female',
                        'age': 10
                    }
                }, {
                    'groupb': {
                        'gender': 'male',
                    }
                }]
            }
        }
        c = csv_dict_to_nested_dict(a)

        self.assertDictEqual(c, b)
        self.assertEqual(
            dict2xml(c),
            """
<group>
  <repeat>
    <groupb>
      <age>10</age>
      <gender>female</gender>
    </groupb>
  </repeat>
  <repeat>
    <groupb>
      <gender>male</gender>
    </groupb>
  </repeat>
</group>
            """.strip()
        )
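
A quick sketch of the repeat-index convention these assertions exercise,
assuming csv_dict_to_nested_dict and dict2xml are imported as in this test
module; the 'household'/'member' keys are made up:

# Repeat-index keys collapse into lists of dicts keyed by the repeat name
# (imports assumed to match this test module; key names are made up).
flat = {
    'household/member[1]/name': 'Ada',
    'household/member[2]/name': 'Alan',
}
nested = csv_dict_to_nested_dict(flat)
# nested == {'household': {'member': [{'name': 'Ada'}, {'name': 'Alan'}]}}
print(dict2xml(nested))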