def test_clean_keys(self):
    LabelValueStore.objects(silo_id=self.silo.id).delete()
    lvs = LabelValueStore()
    orig_data = {
        'Header 1': 'r1c1',
        'create_date': 'r1c3',
        'edit_date': 'r1c2',
        '_id': 'r1c4'
    }

    for k, v in orig_data.iteritems():
        key = cleanKey(k)
        val = smart_str(v, strings_only=True)
        key = smart_str(key)
        val = val.strip()
        setattr(lvs, key, val)
    lvs.silo_id = self.silo.id
    lvs.save()

    returned_data = json.loads(LabelValueStore.objects(
        silo_id=self.silo.id).to_json())[0]
    returned_data.pop('_id')
    expected_data = {
        'Header 1': 'r1c1',
        'created_date': 'r1c3',
        'editted_date': 'r1c2',
        'user_assigned_id': 'r1c4',
        'read_id': -1,
        'silo_id': self.silo.id
    }
    self.assertEqual(returned_data, expected_data)
    LabelValueStore.objects(silo_id=self.silo.id).delete()
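# The test above exercises cleanKey(), which renames reserved column names
# before rows are stored. A minimal sketch of that behaviour, inferred only
# from the expected_data mapping in the test (the real implementation may
# handle more cases):
def cleanKey_sketch(key):
    # reserved keys are remapped so they cannot collide with internal fields
    if key in ("id", "_id"):
        return "user_assigned_id"
    if key == "edit_date":
        return "editted_date"
    if key == "create_date":
        return "created_date"
    return key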
def storeCommCareData(conf, data):
    data_refined = []
    try:
        fieldToType = getColToTypeDict(Silo.objects.get(pk=conf['silo_id']))
    except Silo.DoesNotExist:
        fieldToType = {}

    for row in data:
        # iterate over a snapshot of the keys because the loop body renames
        # keys in place via row.pop()
        for column in row.keys():
            if fieldToType.get(column, 'string') == 'int':
                try:
                    row[column] = int(row[column])
                except ValueError:
                    # skip this one
                    # add message that this is skipped
                    continue
            if fieldToType.get(column, 'string') == 'double':
                try:
                    row[column] = float(row[column])
                except ValueError:
                    # skip this one
                    # add message that this is skipped
                    continue
            row[cleanKey(column)] = row.pop(column)
        data_refined.append(row)

    client = MongoClient(settings.MONGODB_URI)
    db = client.get_database(settings.TOLATABLES_MONGODB_NAME)

    if conf['download_type'] == 'commcare_form':
        for row in data_refined:
            row['edit_date'] = timezone.now()
            row['silo_id'] = conf['silo_id']
            row['read_id'] = conf['read_id']
        db.label_value_store.insert_many(data_refined)
    else:
        if conf['update']:
            if conf['download_type'] == 'case':
                for row in data_refined:
                    row['edit_date'] = timezone.now()
                    db.label_value_store.update(
                        {
                            'silo_id': conf['silo_id'],
                            'case_id': row['case_id']
                        },
                        {"$set": row},
                        upsert=True)
            elif conf['download_type'] == 'commcare_report':
                silo = Silo.objects.get(pk=conf['silo_id'])
                read = Read.objects.get(pk=conf['read_id'])
                db.label_value_store.delete_many({'silo_id': conf['silo_id']})
                saveDataToSilo(silo, data_refined, read)
        else:
            for row in data_refined:
                row["create_date"] = timezone.now()
                row["silo_id"] = conf['silo_id']
                row["read_id"] = conf['read_id']
            db.label_value_store.insert(data_refined)
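# A hypothetical call to storeCommCareData(). The conf dict mirrors the keys
# the function actually reads ('silo_id', 'read_id', 'download_type',
# 'update'); the ids and rows below are illustrative only, not real data.
example_conf = {
    'silo_id': 12,
    'read_id': 34,
    'download_type': 'case',
    'update': True,
}
example_data = [
    {'case_id': 'abc-123', 'age': '27', 'name': 'example'},
]
# With download_type 'case' and update=True this upserts each row on
# (silo_id, case_id).
storeCommCareData(example_conf, example_data)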
def import_from_gsheet_helper(user, silo_id, silo_name, spreadsheet_id,
                              sheet_id=None, partialcomplete=False):
    msgs = []
    if spreadsheet_id is None:
        msgs.append({
            "level": messages.ERROR,
            "msg": "A Google Spreadsheet is not selected to import data "
                   "from.",
            "redirect": reverse('index')
        })

    credential_obj = _get_credential_object(user)
    if not isinstance(credential_obj, OAuth2Credentials):
        msgs.append(credential_obj)
        return msgs

    defaults = {
        "name": silo_name,
        "description": "Google Sheet Import",
        "public": False,
        "owner": user
    }
    silo, created = Silo.objects.get_or_create(
        pk=None if silo_id == '0' else silo_id, defaults=defaults)
    msgs.append({"silo_id": silo.id})

    service = _get_authorized_service(credential_obj)

    # fetch the google spreadsheet metadata
    try:
        spreadsheet = service.spreadsheets().get(
            spreadsheetId=spreadsheet_id).execute()
    except HttpAccessTokenRefreshError:
        return [_get_credential_object(user, True)]
    except Exception as e:
        error = json.loads(e.content).get("error")
        msg = "%s: %s" % (error.get("status"), error.get("message"))
        msgs.append({"level": messages.ERROR, "msg": msg})
        return msgs

    spreadsheet_name = spreadsheet.get("properties", {}).get("title", "")
    gsheet_read = _get_or_create_read("GSheet Import", spreadsheet_name,
                                      "Google Spreadsheet Import",
                                      spreadsheet_id, user, silo)
    sheet_name = "Sheet1"
    if sheet_id:
        gsheet_read.gsheet_id = sheet_id
        gsheet_read.save()

    if gsheet_read.gsheet_id:
        sheets = spreadsheet.get("sheets", None)
        for sheet in sheets:
            properties = sheet.get("properties", None)
            if properties:
                if str(properties.get("sheetId")) == str(
                        gsheet_read.gsheet_id):
                    sheet_name = properties.get("title")

    headers = []
    data = None
    combine_cols = False

    # Fetch data from gsheet
    try:
        result = service.spreadsheets().values().get(
            spreadsheetId=spreadsheet_id, range=sheet_name).execute()
        data = result.get("values", [])
    except Exception as e:
        logger.error(e)
        msgs.append({
            "level": messages.ERROR,
            "msg": "Something went wrong 22: %s" % e,
            "redirect": None
        })
        return msgs

    unique_fields = silo.unique_fields.all()
    skipped_rows = set()
    lvss = []

    for r, row in enumerate(data):
        if r == 0:
            headers = []
            for header in row:
                header = cleanKey(header)
                headers.append(header)
            addColsToSilo(silo, headers)
            continue

        filter_criteria = {}
        # build filter_criteria if unique field(s) have been set up
        # for this silo
        for unique_field in unique_fields:
            try:
                filter_criteria.update(
                    {unique_field.name:
                         row[headers.index(unique_field.name)]})
            except KeyError:
                pass
            except ValueError:
                pass

        if filter_criteria:
            filter_criteria.update({'silo_id': silo.id})
            # if a row is found, then fetch and update it
            # if no row is found then create a new one
            # if multiple rows are found then skip b/c not sure which
            # one to update
            try:
                lvs = LabelValueStore.objects.get(**filter_criteria)
                lvs.edit_date = timezone.now()
            except LabelValueStore.DoesNotExist as e:
                lvs = LabelValueStore()
            except LabelValueStore.MultipleObjectsReturned as e:
                for k, v in filter_criteria.iteritems():
                    skipped_rows.add("%s=%s" % (k, v))
                continue
        else:
            lvs = LabelValueStore()

        for c, col in enumerate(row):
            try:
                key = headers[c]
            except IndexError as e:
                # this happens when a column header is missing in the gsheet
                continue
            if key == "" or key is None or key == "silo_id":
                continue
            elif key == "id" or key == "_id":
                key = "user_assigned_id"
            elif key == "edit_date":
                key = "editted_date"
            elif key == "create_date":
                key = "created_date"
            val = smart_str(row[c], strings_only=True)
            key = smart_str(key)
            val = val.strip()
            setattr(lvs, key.replace(".", "_").replace("$", "USD"), val)

        lvs.silo_id = silo.id
        lvs.read_id = gsheet_read.id
        lvs.create_date = timezone.now()
        lvs = calculateFormulaCell(lvs, silo)
        if partialcomplete:
            lvss.append(lvs)
        else:
            lvs.save()

    if skipped_rows:
        msgs.append({
            "level": messages.WARNING,
            "msg": "Skipped updating/adding records where %s because there "
                   "are already multiple records." %
                   ",".join(str(s) for s in skipped_rows)
        })

    msgs.append({"level": messages.SUCCESS, "msg": "Operation successful"})

    if partialcomplete:
        return (lvss, msgs)
    return msgs
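# A hypothetical caller of import_from_gsheet_helper(). The helper returns a
# list of message dicts carrying 'level'/'msg' (and sometimes 'silo_id' or
# 'redirect') keys; a view would typically forward them to Django's messages
# framework. The function name, `request`, and the silo name below are
# illustrative only.
def example_view_usage(request, silo_id, spreadsheet_id):
    msgs = import_from_gsheet_helper(request.user, silo_id, 'My Silo',
                                     spreadsheet_id)
    for m in msgs:
        if 'level' in m and 'msg' in m:
            messages.add_message(request, m['level'], m['msg'])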
def import_from_gsheet_helper(user, silo_id, silo_name, spreadsheet_id,
                              sheet_id=None, partialcomplete=False):
    msgs = []
    if spreadsheet_id is None:
        msgs.append({
            'level': messages.ERROR,
            'msg': 'A Google Spreadsheet is not selected to import data '
                   'from.',
            'redirect': reverse('index')
        })

    credential_obj = _get_credential_object(user)
    if not isinstance(credential_obj, OAuth2Credentials):
        msgs.append(credential_obj)
        return msgs

    defaults = {
        'name': silo_name,
        'description': 'Google Sheet Import',
        'public': False,
        'owner': user
    }
    silo, created = Silo.objects.get_or_create(
        pk=None if silo_id == '0' else silo_id, defaults=defaults)
    msgs.append({'silo_id': silo.id})

    metadata, error = _get_gsheet_metadata(credential_obj, spreadsheet_id,
                                           user)
    if error:
        if 'credential' in error:
            return error.get('credential')
        else:
            msgs.append(error)
            return msgs

    spreadsheet_name = metadata.get('properties', {}).get('title', '')
    gsheet_read = _get_or_create_read('GSheet Import', spreadsheet_name,
                                      'Google Spreadsheet Import',
                                      spreadsheet_id, user, silo)
    if sheet_id:
        gsheet_read.gsheet_id = sheet_id
        gsheet_read.save()

    sheet_name = 'Sheet1'
    if gsheet_read.gsheet_id:
        sheets = metadata.get('sheets', None)
        for sheet in sheets:
            properties = sheet.get('properties', None)
            if properties:
                if str(properties.get('sheetId')) == str(
                        gsheet_read.gsheet_id):
                    sheet_name = properties.get('title')

    values, error = _fetch_data_gsheet(credential_obj, spreadsheet_id,
                                       sheet_name)
    if error:
        msgs.append(error)
        return msgs

    unique_fields = silo.unique_fields.all()
    skipped_rows = set()
    headers = []
    lvss = []

    # get the column names
    header = values.pop(0)
    for h in header:
        h = cleanKey(h)
        headers.append(h)

    data = _convert_gsheet_data(headers, values)

    for r, row in enumerate(data):
        filter_criteria = {}
        # build filter_criteria if unique field(s) have been setup
        # for this silo
        for uf in unique_fields:
            try:
                if str(row[uf.name]).isdigit():
                    filter_criteria.update({str(uf.name): int(row[uf.name])})
                else:
                    filter_criteria.update({str(uf.name): str(row[uf.name])})
            except AttributeError as e:
                logger.warning(e)

        if filter_criteria:
            filter_criteria.update({'silo_id': silo.id})
            try:
                lvs = LabelValueStore.objects.get(**filter_criteria)
                lvs.edit_date = timezone.now()
            except LabelValueStore.DoesNotExist:
                lvs = LabelValueStore()
            except LabelValueStore.MultipleObjectsReturned:
                for k, v in filter_criteria.iteritems():
                    skipped_rows.add('{}={}'.format(k, v))
                continue
        else:
            lvs = LabelValueStore()

        # add the key and values to the document
        row = clean_data_obj(row)
        for key, val in row.iteritems():
            val = smart_str(val, strings_only=True)
            setattr(lvs, key, val)

        lvs.silo_id = silo.id
        lvs.read_id = gsheet_read.id
        lvs.create_date = timezone.now()
        lvs = calculateFormulaCell(lvs, silo)
        if partialcomplete:
            lvss.append(lvs)
        else:
            lvs.save()

    addColsToSilo(silo, headers)

    if skipped_rows:
        skipped_str = ','.join(str(s) for s in skipped_rows)
        msg = 'Skipped updating/adding records where {} because there are ' \
              'already multiple records.'.format(skipped_str)
        msgs.append({'level': messages.WARNING, 'msg': msg})

    msgs.append({'level': messages.SUCCESS, 'msg': 'Operation successful'})

    if partialcomplete:
        return lvss, msgs
    return msgs
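# _convert_gsheet_data() is not shown here. Based on how its output is used
# above (rows addressed by column name and iterated with iteritems()), it
# presumably pairs the cleaned headers with each row of cell values. A rough
# sketch under that assumption; the real helper may differ:
def _convert_gsheet_data_sketch(headers, values):
    data = []
    for row in values:
        # zip() stops at the shorter sequence, so missing trailing cells are
        # simply left out of the resulting document
        data.append(dict(zip(headers, row)))
    return data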
def handle(self, *args, **options):
    frequency = options['frequency']
    if frequency != "daily" and frequency != "weekly":
        return self.stdout.write(
            "Frequency argument can either be 'daily' or 'weekly'")

    silos = Silo.objects.filter(
        reads__autopull_frequency__isnull=False,
        reads__autopull_frequency=frequency,
        reads__type__read_type="CommCare").distinct()
    commcare_type_id = ReadType.objects.get(read_type='CommCare').id

    for silo in silos:
        for read in silo.reads.all():
            if read.type.read_type != 'CommCare':
                print 'skipping read', read
                continue

            conf = CommCareImportConfig(
                silo_id=silo.id, read_id=read.id,
                tables_user_id=silo.owner.pk, base_url=read.read_url,
                update=True)

            try:
                conf.set_auth_header()
            except Exception as e:
                self.stdout.write(
                    'No commcare api key for silo %s, read "%s"' %
                    (silo.pk, read.pk))
                continue

            # get/build metadata based on download_type
            if 'case' in conf.base_url:
                last_data_retrieved = str(getNewestDataDate(silo.id))[:10]
                conf.base_url += "&date_modified_start=" + \
                    last_data_retrieved
                conf.download_type = 'case'
                conf.record_count = get_commcare_record_count(conf)
            elif 'configurablereportdata' in conf.base_url:
                conf.download_type = 'commcare_report'
                url_parts = conf.base_url.split('/')
                conf.project = url_parts[4]
                conf.report_id = url_parts[8]
                conf.record_count = get_commcare_record_count(conf)
            elif '/form/' in conf.base_url:
                cache_obj = CommCareCache.objects.get(
                    form_id=read.resource_id)
                conf.download_type = 'commcare_form'
                conf.project = cache_obj.project
                conf.form_name = cache_obj.form_name
                conf.form_id = cache_obj.form_id
                conf.base_url = read.read_url + '&received_on_start=' + \
                    cache_obj.last_updated.isoformat()[:-6]
                conf.record_count = get_commcare_record_count(conf)

            response = requests.get(conf.base_url, headers=conf.auth_header)
            if response.status_code == 401:
                # remove the stored token so a rejected key is not reused
                commcare_token = ThirdPartyTokens.objects.get(
                    token=conf.token)
                commcare_token.delete()
                self.stdout.write(
                    'Incorrect commcare api key for silo %s, read %s' %
                    (silo.pk, read.pk))
                continue
            elif response.status_code != 200:
                self.stdout.write(
                    'Failure retrieving commcare data for silo %s, read %s' %
                    (silo.pk, read.pk))
                continue

            if '/form/' in conf.base_url:
                client = MongoClient(settings.MONGODB_URI)
                db = client.get_database(settings.TOLATABLES_MONGODB_NAME)
                db.label_value_store.delete_many({'read_id': read.id})
                cache_silo = Silo.objects.get(pk=cache_obj.silo.id)
                copy_from_cache(cache_silo, silo, read)

            # Now call the update data function in commcare tasks
            conf.base_url = read.read_url
            conf.use_token = True
            data_raw = fetchCommCareData(conf.to_dict(), conf.base_url,
                                         0, 50)
            data_collects = data_raw.apply_async()

            new_colnames = set()
            for colset in [set(v.get()) for v in data_collects]:
                new_colnames.update(colset)
            cleaned_colnames = [cleanKey(name) for name in new_colnames]
            addColsToSilo(silo, cleaned_colnames)

            self.stdout.write(
                'Successfully fetched silo %s, read %s from CommCare' %
                (silo.pk, read.pk))
def handle(self, *args, **options):
    disabled_projects = []

    # Get a list of projects to import data from
    if options['project']:
        projects = [options['project']]
    else:
        read_urls = Read.objects.filter(type__read_type='CommCare') \
            .values_list('read_url', flat=True)
        projects = set()
        for read_url in read_urls:
            projects.add(read_url.split('/')[4])

    base_url = 'https://www.commcarehq.org/a/%s/api/v0.5/form/' \
               '?limit=1&received_on_start=%s'

    for project in projects:
        if project in disabled_projects:
            continue
        print 'Downloading ' + project

        try:
            cache_obj = CommCareCache.objects.filter(project=project) \
                .first()
            last_updated = cache_obj.last_updated.isoformat()[:-6]
        except (IndexError, AttributeError):
            last_updated = '1980-01-01'

        conf = CommCareImportConfig(
            project=project,
            tables_user_id=User.objects.get(email='*****@*****.**',
                                            username='******').id,
            base_url=base_url % (project, last_updated),
            download_type='commcare_form',
            update=True,
            use_token=True,
            for_cache=True)

        try:
            conf.set_auth_header()
        except ThirdPartyTokens.DoesNotExist:
            self.stdout.write('CommCare Token is not in the DB')
            continue

        response = requests.get(conf.base_url, headers=conf.auth_header)
        if response.status_code == 401:
            commcare_token = ThirdPartyTokens.objects.get(token=conf.token)
            commcare_token.delete()
            self.stdout.write('Incorrect CommCare api key for project %s'
                              % project)
            continue
        elif response.status_code != 200:
            self.stdout.write(
                'Failure retrieving CommCare data for project %s' % project)

        conf.record_count = get_commcare_record_count(conf)
        if conf.record_count == 0:
            self.stdout.write(
                'Fetching of record counts failed. Skipping project %s'
                % project)
            continue

        # Trigger the data retrieval and retrieve the column names
        url = conf.base_url.replace('limit=1', 'limit=100')
        data_raw = fetchCommCareData(conf.to_dict(), url, 0, 100)
        data_collects = data_raw.apply_async()
        data_retrieval = [v.get() for v in data_collects]

        # Add columns to the Silo and max date to Cache
        silo_ref = {}
        save_date = datetime.datetime(1980, 1, 1).replace(tzinfo=pytz.UTC)
        for column_dict, max_date in data_retrieval:
            for silo_id in column_dict:
                cleaned_colnames = [
                    cleanKey(name) for name in column_dict[silo_id]
                ]
                if silo_id in silo_ref:
                    silo = silo_ref[silo_id]
                    addColsToSilo(silo, cleaned_colnames)
                else:
                    try:
                        silo = Silo.objects.get(pk=silo_id)
                        silo_ref[silo_id] = silo
                        addColsToSilo(silo, cleaned_colnames)
                    except ValueError as e:
                        print 'ERROR: Null value for silo_id'
                        print 'Conf object: ', conf
                        print 'Column dict', column_dict
                        print 'Error message: ', e
            save_date = max(parse(max_date).replace(tzinfo=pytz.UTC),
                            save_date)

        for cache_obj in CommCareCache.objects.filter(project=project):
            cache_obj.last_updated = save_date
            cache_obj.save()

        self.stdout.write('Successfully fetched project %s from CommCare'
                          % conf.project)