Example #1
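A unit test for key cleaning: it builds a LabelValueStore document from raw column names run through cleanKey, saves it, and asserts that the reserved keys create_date, edit_date and _id come back as created_date, editted_date and user_assigned_id, while an ordinary header such as 'Header 1' is left untouched.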
    def test_clean_keys(self):
        LabelValueStore.objects(silo_id=self.silo.id).delete()
        lvs = LabelValueStore()
        orig_data = {
            'Header 1': 'r1c1',
            'create_date': 'r1c3',
            'edit_date': 'r1c2',
            '_id': 'r1c4'
        }

        for k, v in orig_data.iteritems():
            key = cleanKey(k)
            val = smart_str(v, strings_only=True)
            key = smart_str(key)
            val = val.strip()
            setattr(lvs, key, val)
        lvs.silo_id = self.silo.id
        lvs.save()

        returned_data = json.loads(LabelValueStore.objects(
            silo_id=self.silo.id).to_json())[0]
        returned_data.pop('_id')

        expected_data = {
            'Header 1': 'r1c1',
            'created_date': 'r1c3',
            'editted_date': 'r1c2',
            'user_assigned_id': 'r1c4',
            'read_id': -1,
            'silo_id': self.silo.id
        }
        self.assertEqual(returned_data, expected_data)
        LabelValueStore.objects(silo_id=self.silo.id).delete()
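
Read on its own, the test also documents the mapping cleanKey is expected to perform. The sketch below is inferred purely from the assertions above, not from the implementation of cleanKey itself:

# Expected behaviour of cleanKey, inferred from test_clean_keys above;
# not the real implementation. Assumes cleanKey is importable from
# wherever the project defines it.
assert cleanKey('Header 1') == 'Header 1'           # ordinary headers pass through unchanged
assert cleanKey('create_date') == 'created_date'    # reserved model fields are renamed
assert cleanKey('edit_date') == 'editted_date'
assert cleanKey('_id') == 'user_assigned_id'        # would otherwise clash with Mongo's _id
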
Example #2
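storeCommCareData prepares CommCare rows for MongoDB: it casts values to the column types configured on the Silo, cleans the column names with cleanKey, and then either inserts the documents into label_value_store, upserts them by case_id, or hands report data to saveDataToSilo, depending on the download_type and update flags in conf.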
def storeCommCareData(conf, data):
    data_refined = []
    try:
        fieldToType = getColToTypeDict(Silo.objects.get(pk=conf['silo_id']))
    except Silo.DoesNotExist:
        fieldToType = {}
    for row in data:
        # iterate over a copy of the keys because columns are renamed in place below
        for column in list(row):
            if fieldToType.get(column, 'string') == 'int':
                try:
                    row[column] = int(row[column])
                except ValueError:
                    # cast failed; keep the original value
                    # TODO: add a message that this value was skipped
                    continue
            if fieldToType.get(column, 'string') == 'double':
                try:
                    row[column] = float(row[column])
                except ValueError:
                    # cast failed; keep the original value
                    # TODO: add a message that this value was skipped
                    continue
            row[cleanKey(column)] = row.pop(column)

        data_refined.append(row)

    client = MongoClient(settings.MONGODB_URI)
    db = client.get_database(settings.TOLATABLES_MONGODB_NAME)

    if conf['download_type'] == 'commcare_form':
        for row in data_refined:
            row['edit_date'] = timezone.now()
            row['silo_id'] = conf['silo_id']
            row['read_id'] = conf['read_id']
        db.label_value_store.insert_many(data_refined)
    else:
        if conf['update']:
            if conf['download_type'] == 'case':
                for row in data_refined:
                    row['edit_date'] = timezone.now()
                    db.label_value_store.update(
                        {
                            'silo_id': conf['silo_id'],
                            'case_id': row['case_id']
                        }, {"$set": row},
                        upsert=True)

            elif conf['download_type'] == 'commcare_report':
                silo = Silo.objects.get(pk=conf['silo_id'])
                read = Read.objects.get(pk=conf['read_id'])
                db.label_value_store.delete_many({'silo_id': conf['silo_id']})
                saveDataToSilo(silo, data_refined, read)

        else:
            for row in data_refined:
                row["create_date"] = timezone.now()
                row["silo_id"] = conf['silo_id']
                row["read_id"] = conf['read_id']
            db.label_value_store.insert(data_refined)
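
The function receives conf as a plain dict (examples #5 and #6 build it from CommCareImportConfig and pass it around via to_dict()) and data as a list of row dicts. A hypothetical call might look like the sketch below; the key names come from the function body, but the values and rows are invented for illustration:

# Hypothetical call sketch for storeCommCareData; ids and row contents are made up.
conf = {
    'silo_id': 42,              # pk of the destination Silo
    'read_id': 7,               # pk of the Read behind this import
    'download_type': 'case',    # 'commcare_form', 'case' or 'commcare_report'
    'update': True,             # upsert existing case documents by case_id
}
rows = [
    {'case_id': 'abc-123', 'Header 1': 'r1c1'},
    {'case_id': 'def-456', 'Header 1': 'r2c1'},
]
storeCommCareData(conf, rows)
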
Example #3
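An earlier version of import_from_gsheet_helper: it talks to the Google Sheets API directly, gets or creates the target Silo and a Read record, then walks the sheet rows, updating existing LabelValueStore documents when the silo's unique fields match and creating new ones otherwise, collecting user-facing messages along the way.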
def import_from_gsheet_helper(user,
                              silo_id,
                              silo_name,
                              spreadsheet_id,
                              sheet_id=None,
                              partialcomplete=False):
    msgs = []

    if spreadsheet_id is None:
        msgs.append({
            "level": messages.ERROR,
            "msg": "A Google Spreadsheet is not selected to import data from.",
            "redirect": reverse('index')
        })
        return msgs

    credential_obj = _get_credential_object(user)
    if not isinstance(credential_obj, OAuth2Credentials):
        msgs.append(credential_obj)
        return msgs

    defaults = {
        "name": silo_name,
        "description": "Google Sheet Import",
        "public": False,
        "owner": user
    }
    silo, created = Silo.objects.get_or_create(
        pk=None if silo_id == '0' else silo_id, defaults=defaults)
    msgs.append({"silo_id": silo.id})

    service = _get_authorized_service(credential_obj)

    # fetch the google spreadsheet metadata
    try:
        spreadsheet = service.spreadsheets().get(
            spreadsheetId=spreadsheet_id).execute()
    except HttpAccessTokenRefreshError:
        return [_get_credential_object(user, True)]
    except Exception as e:
        error = json.loads(e.content).get("error")
        msg = "%s: %s" % (error.get("status"), error.get("message"))
        msgs.append({"level": messages.ERROR, "msg": msg})
        return msgs

    spreadsheet_name = spreadsheet.get("properties", {}).get("title", "")

    gsheet_read = _get_or_create_read("GSheet Import", spreadsheet_name,
                                      "Google Spreadsheet Import",
                                      spreadsheet_id, user, silo)
    sheet_name = "Sheet1"
    if sheet_id:
        gsheet_read.gsheet_id = sheet_id
        gsheet_read.save()

    if gsheet_read.gsheet_id:
        sheets = spreadsheet.get("sheets", None)
        for sheet in sheets:
            properties = sheet.get("properties", None)
            if properties:
                if str(properties.get("sheetId")) == str(
                        gsheet_read.gsheet_id):
                    sheet_name = properties.get("title")

    headers = []
    data = None

    combine_cols = False
    # Fetch data from gsheet
    try:
        result = service.spreadsheets().values().get(
            spreadsheetId=spreadsheet_id, range=sheet_name).execute()
        data = result.get("values", [])
    except Exception as e:
        logger.error(e)
        msgs.append({
            "level": messages.ERROR,
            "msg": "Something went wrong 22: %s" % e,
            "redirect": None
        })
        return msgs

    unique_fields = silo.unique_fields.all()
    skipped_rows = set()
    lvss = []
    for r, row in enumerate(data):
        if r == 0:
            headers = []
            for header in row:
                header = cleanKey(header)
                headers.append(header)

            addColsToSilo(silo, headers)
            continue
        filter_criteria = {}

        # build filter_criteria if unique field(s) have been set up for this silo
        for unique_field in unique_fields:
            try:
                filter_criteria.update(
                    {unique_field.name: row[headers.index(unique_field.name)]})
            except KeyError:
                pass
            except ValueError:
                pass
        if filter_criteria:
            filter_criteria.update({'silo_id': silo.id})
            # if a row is found, then fetch and update it
            # if no row is found then create a new one
            # if multiple rows are found then skip b/c not sure which one to update
            try:
                lvs = LabelValueStore.objects.get(**filter_criteria)
                lvs.edit_date = timezone.now()
            except LabelValueStore.DoesNotExist as e:
                lvs = LabelValueStore()
            except LabelValueStore.MultipleObjectsReturned as e:
                for k, v in filter_criteria.iteritems():
                    skipped_rows.add("%s=%s" % (k, v))
                continue
        else:
            lvs = LabelValueStore()

        for c, col in enumerate(row):
            try:
                key = headers[c]
            except IndexError as e:
                # this happens when a column header is missing from the gsheet
                continue
            if key == "" or key is None or key == "silo_id": continue
            elif key == "id" or key == "_id": key = "user_assigned_id"
            elif key == "edit_date": key = "editted_date"
            elif key == "create_date": key = "created_date"
            val = smart_str(row[c], strings_only=True)
            key = smart_str(key)
            val = val.strip()
            setattr(lvs, key.replace(".", "_").replace("$", "USD"), val)
        lvs.silo_id = silo.id
        lvs.read_id = gsheet_read.id
        lvs.create_date = timezone.now()
        lvs = calculateFormulaCell(lvs, silo)
        if partialcomplete:
            lvss.append(lvs)
        else:
            lvs.save()

    if skipped_rows:
        msgs.append({
            "level": messages.WARNING,
            "msg": "Skipped updating/adding records where %s because there "
                   "are already multiple records." %
                   ",".join(str(s) for s in skipped_rows)
        })

    msgs.append({"level": messages.SUCCESS, "msg": "Operation successful"})
    if partialcomplete:
        return (lvss, msgs)
    return msgs
Example #4
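A refactored version of the same helper: spreadsheet metadata and values are fetched through _get_gsheet_metadata and _fetch_data_gsheet, rows are converted to dicts with _convert_gsheet_data and normalised with clean_data_obj, and the column names are added to the silo in a single addColsToSilo call after the loop.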
def import_from_gsheet_helper(user,
                              silo_id,
                              silo_name,
                              spreadsheet_id,
                              sheet_id=None,
                              partialcomplete=False):
    msgs = []
    if spreadsheet_id is None:
        msgs.append({
            'level': messages.ERROR,
            'msg': 'A Google Spreadsheet is not selected to import data from.',
            'redirect': reverse('index')
        })
        return msgs

    credential_obj = _get_credential_object(user)
    if not isinstance(credential_obj, OAuth2Credentials):
        msgs.append(credential_obj)
        return msgs

    defaults = {
        'name': silo_name,
        'description': 'Google Sheet Import',
        'public': False,
        'owner': user
    }
    silo, created = Silo.objects.get_or_create(
        pk=None if silo_id == '0' else silo_id, defaults=defaults)
    msgs.append({'silo_id': silo.id})

    metadata, error = _get_gsheet_metadata(credential_obj, spreadsheet_id,
                                           user)
    if error:
        if 'credential' in error:
            return error.get('credential')
        else:
            msgs.append(error)
            return msgs

    spreadsheet_name = metadata.get('properties', {}).get('title', '')
    gsheet_read = _get_or_create_read('GSheet Import', spreadsheet_name,
                                      'Google Spreadsheet Import',
                                      spreadsheet_id, user, silo)
    if sheet_id:
        gsheet_read.gsheet_id = sheet_id
        gsheet_read.save()

    sheet_name = 'Sheet1'
    if gsheet_read.gsheet_id:
        sheets = metadata.get('sheets', None)
        for sheet in sheets:
            properties = sheet.get('properties', None)
            if properties:
                if str(properties.get('sheetId')) == str(
                        gsheet_read.gsheet_id):
                    sheet_name = properties.get('title')

    values, error = _fetch_data_gsheet(credential_obj, spreadsheet_id,
                                       sheet_name)
    if error:
        msgs.append(error)
        return msgs

    unique_fields = silo.unique_fields.all()
    skipped_rows = set()
    headers = []
    lvss = []

    # get the column names
    header = values.pop(0)
    for h in header:
        h = cleanKey(h)
        headers.append(h)

    data = _convert_gsheet_data(headers, values)

    for r, row in enumerate(data):
        filter_criteria = {}
        # build filter_criteria if unique field(s) have been set up for this silo
        for uf in unique_fields:
            try:
                if str(row[uf.name]).isdigit():
                    filter_criteria.update({str(uf.name): int(row[uf.name])})
                else:
                    filter_criteria.update({str(uf.name): str(row[uf.name])})
            except AttributeError as e:
                logger.warning(e)
        if filter_criteria:
            filter_criteria.update({'silo_id': silo.id})
            try:
                lvs = LabelValueStore.objects.get(**filter_criteria)
                lvs.edit_date = timezone.now()
            except LabelValueStore.DoesNotExist:
                lvs = LabelValueStore()
            except LabelValueStore.MultipleObjectsReturned:
                for k, v in filter_criteria.iteritems():
                    skipped_rows.add('{}={}'.format(k, v))
                continue
        else:
            lvs = LabelValueStore()

        # add the key and values to the document
        row = clean_data_obj(row)
        for key, val in row.iteritems():
            val = smart_str(val, strings_only=True)
            setattr(lvs, key, val)

        lvs.silo_id = silo.id
        lvs.read_id = gsheet_read.id
        lvs.create_date = timezone.now()
        lvs = calculateFormulaCell(lvs, silo)
        if partialcomplete:
            lvss.append(lvs)
        else:
            lvs.save()
    addColsToSilo(silo, headers)

    if skipped_rows:
        skipped_str = ','.join(str(s) for s in skipped_rows)
        msg = 'Skipped updating/adding records where {} because there are ' \
              'already multiple records.'.format(skipped_str)
        msgs.append({'level': messages.WARNING, 'msg': msg})

    msgs.append({'level': messages.SUCCESS, 'msg': 'Operation successful'})
    if partialcomplete:
        return lvss, msgs
    return msgs
Example #5
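The handle method of a management command that re-imports CommCare data on a schedule: given a frequency argument of 'daily' or 'weekly', it finds every silo with a matching CommCare read, builds the request URL for the read's download type (case, configurable report or form), refreshes the MongoDB documents and adds any new column names to the silo.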
    def handle(self, *args, **options):

        frequency = options['frequency']
        if frequency != "daily" and frequency != "weekly":
            return self.stdout.write(
                "Frequency argument can either be 'daily' or 'weekly'")

        silos = Silo.objects.filter(
            reads__autopull_frequency__isnull=False,
            reads__autopull_frequency=frequency,
            reads__type__read_type="CommCare").distinct()
        commcare_type_id = ReadType.objects.get(read_type='CommCare').id
        for silo in silos:
            for read in silo.reads.all():
                if read.type.read_type != 'CommCare':
                    print 'skipping read', read
                    continue
                conf = CommCareImportConfig(silo_id=silo.id,
                                            read_id=read.id,
                                            tables_user_id=silo.owner.pk,
                                            base_url=read.read_url,
                                            update=True)

                try:
                    conf.set_auth_header()
                except Exception as e:
                    self.stdout.write(
                        'No commcare api key for silo %s, read "%s"' %
                        (silo.pk, read.pk))
                    continue

                # get/build metadata based on download_type
                if 'case' in conf.base_url:
                    last_data_retrieved = str(getNewestDataDate(silo.id))[:10]
                    conf.base_url += "&date_modified_start=" + last_data_retrieved
                    conf.download_type = 'case'
                    conf.record_count = get_commcare_record_count(conf)
                elif 'configurablereportdata' in conf.base_url:
                    conf.download_type = 'commcare_report'
                    url_parts = conf.base_url.split('/')
                    conf.project = url_parts[4]
                    conf.report_id = url_parts[8]
                    conf.record_count = get_commcare_record_count(conf)
                elif '/form/' in conf.base_url:
                    cache_obj = CommCareCache.objects.get(
                        form_id=read.resource_id)
                    conf.download_type = 'commcare_form'
                    conf.project = cache_obj.project
                    conf.form_name = cache_obj.form_name
                    conf.form_id = cache_obj.form_id
                    conf.base_url = (read.read_url + '&received_on_start=' +
                                     cache_obj.last_updated.isoformat()[:-6])
                    conf.record_count = get_commcare_record_count(conf)

                response = requests.get(conf.base_url,
                                        headers=conf.auth_header)
                if response.status_code == 401:
                    # the stored CommCare token is invalid; look it up and remove it
                    commcare_token = ThirdPartyTokens.objects.get(
                        token=conf.token)
                    commcare_token.delete()
                    self.stdout.write(
                        'Incorrect commcare api key for silo %s, read %s' %
                        (silo.pk, read.pk))
                    continue
                elif response.status_code != 200:
                    self.stdout.write(
                        'Failure retrieving commcare data for silo %s, read %s'
                        % (silo.pk, read.pk))
                    continue

                if '/form/' in conf.base_url:
                    client = MongoClient(settings.MONGODB_URI)
                    db = client.get_database(settings.TOLATABLES_MONGODB_NAME)
                    db.label_value_store.delete_many({'read_id': read.id})
                    cache_silo = Silo.objects.get(pk=cache_obj.silo.id)
                    copy_from_cache(cache_silo, silo, read)

                # Now call the update data function in commcare tasks
                conf.base_url = read.read_url
                conf.use_token = True

                data_raw = fetchCommCareData(conf.to_dict(), conf.base_url, 0,
                                             50)
                data_collects = data_raw.apply_async()
                new_colnames = set()
                for colset in [set(v.get()) for v in data_collects]:
                    new_colnames.update(colset)
                cleaned_colnames = [cleanKey(name) for name in new_colnames]
                addColsToSilo(silo, cleaned_colnames)

                self.stdout.write(
                    'Successfully fetched silo %s, read %s from CommCare' %
                    (silo.pk, read.pk))
Example #6
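The handle method of a management command that refreshes the CommCare form cache: for each project it looks up the last cached timestamp, pages through form submissions received since then, adds any new columns to the cache silos and finally advances last_updated on the CommCareCache records.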
    def handle(self, *args, **options):

        disabled_projects = []
        # Get a list of projects to import data from
        if options['project']:
            projects = [options['project']]
        else:
            read_urls = Read.objects.filter(type__read_type='CommCare') \
                .values_list('read_url', flat=True)
            projects = set()
            for read_url in read_urls:
                projects.add(read_url.split('/')[4])

        base_url = 'https://www.commcarehq.org/a/%s/api/v0.5/form/' \
            '?limit=1&received_on_start=%s'

        for project in projects:
            if project in disabled_projects:
                continue
            print 'Downloading ' + project

            try:
                cache_obj = CommCareCache.objects.filter(project=project) \
                    .first()
                last_updated = cache_obj.last_updated.isoformat()[:-6]
            except (IndexError, AttributeError):
                last_updated = '1980-01-01'

            conf = CommCareImportConfig(
                project=project,
                tables_user_id=User.objects.get(email='*****@*****.**',
                                                username='******').id,
                base_url=base_url % (project, last_updated),
                download_type='commcare_form',
                update=True,
                use_token=True,
                for_cache=True)

            try:
                conf.set_auth_header()
            except ThirdPartyTokens.DoesNotExist:
                self.stdout.write('CommCare Token is not in the DB')
                continue

            response = requests.get(conf.base_url, headers=conf.auth_header)
            if response.status_code == 401:
                commcare_token = ThirdPartyTokens.objects.get(token=conf.token)
                commcare_token.delete()
                self.stdout.write('Incorrect CommCare api key for project %s' %
                                  project)
                continue
            elif response.status_code != 200:
                self.stdout.write(
                    'Failure retrieving CommCare data for project %s' %
                    project)

            conf.record_count = get_commcare_record_count(conf)
            if conf.record_count == 0:
                self.stdout.write(
                    'Fetching of record counts failed. Skipping project %s' %
                    project)
                continue

            # Trigger the data retrieval and retrieve the column names
            url = conf.base_url.replace('limit=1', 'limit=100')
            data_raw = fetchCommCareData(conf.to_dict(), url, 0, 100)
            data_collects = data_raw.apply_async()
            data_retrieval = [v.get() for v in data_collects]

            # Add columns to the Silo and max date to Cache
            silo_ref = {}
            save_date = datetime.datetime(1980, 1, 1).replace(tzinfo=pytz.UTC)
            for column_dict, max_date in data_retrieval:
                for silo_id in column_dict:
                    cleaned_colnames = [
                        cleanKey(name) for name in column_dict[silo_id]
                    ]
                    if silo_id in silo_ref:
                        silo = silo_ref[silo_id]
                        addColsToSilo(silo, cleaned_colnames)
                    else:
                        try:
                            silo = Silo.objects.get(pk=silo_id)
                            silo_ref[silo_id] = silo
                            addColsToSilo(silo, cleaned_colnames)
                        except ValueError as e:
                            print 'ERROR: Null value for silo_id'
                            print 'Conf object: ', conf
                            print 'Column dict', column_dict
                            print 'Error message: ', e

                save_date = max(
                    parse(max_date).replace(tzinfo=pytz.UTC), save_date)

            for cache_obj in CommCareCache.objects.filter(project=project):
                cache_obj.last_updated = save_date
                cache_obj.save()

            self.stdout.write('Successfully fetched project %s from CommCare' %
                              conf.project)