def process_uploaded_file(upload_id, absolute_filename):
    ''' parse uploaded files
    validate schema if data exists in the project
    save entries to mongodb
    save file itself to s3
    delete local copy
    pass status and errors to the upload itself
    '''
    # get handles for some objects
    upload = Upload.objects(id=upload_id)[0]
    project = upload.project
    filename = os.path.basename(absolute_filename)

    # begin parsing the file
    spreadsheet = xlrd.open_workbook(absolute_filename)
    worksheet = spreadsheet.sheet_by_index(0)

    # capture headers at row 0
    original_headers = []
    for col in range(worksheet.ncols):
        original_headers.append(worksheet.cell(0, col).value)

    # convert invalid characters
    upload_header_names = []
    for header in original_headers:
        header = header.replace('.', '__')
        header = header.replace('$', '__')
        upload_header_names.append(header)

    if upload_header_names != original_headers:
        message = {
            'status': 'info'
            , 'message': 'Invalid characters such as "$" and "." were '
                'converted to "__".'
        }
        upload.update(push__worker_messages=message)

    # check if the imported file's schema is a subset of the project schema
    # suppose project.ordered_schema is [1,2,3]
    # an upload with schema [1,2] is accepted
    # .. [5,6,7] is rejected
    # .. [1,2,3,4] is accepted with a warning that data from '4'
    # will be discarded
    if project.ordered_schema:
        existing_header_names = [header.name for header in
            project.ordered_schema]
        discarded_header_names = [h for h in upload_header_names
            if h not in existing_header_names]

        if len(upload_header_names) == len(discarded_header_names):
            # all of the headers are going to be discarded
            # reject this upload
            message = {
                'status': 'error'
                , 'message': 'The file "%s" has nothing in common with the '
                    'existing project data and cannot be uploaded' % filename
            }
            upload.update(push__worker_messages=message)

            # clear these values to show the upload failed
            upload.update(set__filename=None)
            upload.update(set__uploaded_by=None)
            return False

        if discarded_header_names:
            # some headers will be ignored but the upload can proceed
            discards = ', '.join('"%s"' % h for h in discarded_header_names)
            message = {
                'status': 'warning'
                , 'message': 'The data in "%s" has some column headings '
                    'that are not currently in the project. Data for the '
                    'following headings will not be added to the project: '
                    '%s.' % (filename, discards)
            }
            upload.update(push__worker_messages=message)
    else:
        # no project.ordered_schema exists yet
        discarded_header_names = []
        existing_header_names = list(upload_header_names)

    # capture entries; row 0 holds the headers, so start at row 1
    for row in range(1, worksheet.nrows):
        values = {}
        for col in range(worksheet.ncols):
            # skip headers that were not in the existing schema
            header = upload_header_names[col]
            if header in discarded_header_names:
                continue
            cell = worksheet.cell(row, col)
            # attempt to convert dates to datetime or float
            value = utilities.smart_cast(cell.value)
            values[header] = value
        if values:
            new_entry = Entry(
                project = project
                , upload = upload
                , values = values
            )
            new_entry.save()

    # create a hash of the entry values for later de-duping
    sorted_headers = sorted(existing_header_names)
    for entry in Entry.objects(upload=upload):
        m = hashlib.md5()
        for header in sorted_headers:
            if header not in upload_header_names:
                continue
            value = entry.values[header]
            if isinstance(value, unicode):
                m.update(value.encode('utf8'))
            else:
                m.update(str(value))
        entry.update(set__value_hash=m.hexdigest())

    if project.ordered_schema:
        # not the first upload, so check the project for duplicate entries
        uniques = Entry.objects(project=project
            , unique=True).only('value_hash')
        unique_hashes = [u['value_hash'] for u in uniques]

        # also see if any newly-uploaded entries should be hidden
        hidden_entries = Entry.objects(project=project
            , visible=False).only('value_hash')
        hidden_hashes = [h['value_hash'] for h in hidden_entries]

        upload_entries = Entry.objects(upload=upload)
        dupe_count, hidden_count = 0, 0
        for entry in upload_entries:
            if entry['value_hash'] not in unique_hashes:
                entry.update(set__unique=True)
            else:
                entry.update(set__unique=False)
                dupe_count += 1
            if entry['value_hash'] in hidden_hashes:
                entry.update(set__visible=False)
                hidden_count += 1

        if dupe_count:
            message = {
                'status': 'warning'
                , 'message': 'Of the %s entries in the uploaded file, '
                    '%s were duplicates.' % (len(upload_entries), dupe_count)
            }
            upload.update(push__worker_messages=message)

        if hidden_count:
            message = {
                'status': 'warning'
                , 'message': 'Of the %s entries in the uploaded file, '
                    '%s were automatically hidden based on prior edits.'
                    % (len(upload_entries), hidden_count)
            }
            upload.update(push__worker_messages=message)
    else:
        # no schema, so this is the first upload to the project
        # all entries /should/ be unique..
        # unless there are dupes within the upload
        upload_hashes = []
        dupe_count = 0
        for entry in Entry.objects(upload=upload):
            if entry['value_hash'] not in upload_hashes:
                entry.update(set__unique=True)
            else:
                dupe_count += 1
                entry.update(set__unique=False)
            upload_hashes.append(entry['value_hash'])

        if dupe_count:
            message = {
                'status': 'warning'
                , 'message': 'There were %s duplicate entries in the '
                    'uploaded file.' % dupe_count
            }
            upload.update(push__worker_messages=message)

        # define a default schema for the project
        ordered_schema = []

        # try to guess the data type for each header
        # take the first entry and base guesses off of this data
        entry = Entry.objects(upload=upload)[0]
        for header in upload_header_names:
            value = utilities.smart_cast(entry.values[header])
            # compare the result of casting to other types
            if isinstance(value, datetime.datetime):
                data_type = 'datetime'
            elif isinstance(value, float):
                data_type = 'number'
            else:
                data_type = 'string'
            new_header = Header(
                data_type = data_type
                , display = True
                , name = header
                , project = project
                , label = header
            )
            new_header.save()
            ordered_schema.append(new_header)

        # save the results
        project.update(set__ordered_schema=ordered_schema)
        message = {
            'status': 'warning'
            , 'message': "Please verify the data types in this project's "
                "schema."
        }
        upload.update(push__worker_messages=message)

    # send the uploaded file to s3
    connection = boto.connect_s3(
        aws_access_key_id = app.config['AWS']['access_key_id']
        , aws_secret_access_key = app.config['AWS']['secret_access_key'])
    bucket_name = '%s-%s' % (app.config['UPLOADS_BUCKET']
        , app.config['AWS']['access_key_id'])
    bucket = connection.create_bucket(bucket_name.lower())
    s3_key = S3_Key(bucket)
    s3_key.key = '%s-%s' % (filename, utilities.generate_random_string(6))
    s3_key.set_contents_from_filename(absolute_filename)

    # save the data
    upload.update(set__extension=filename.rsplit('.', 1)[1])
    upload.update(set__s3_key=s3_key.key)
    upload.update(set__upload_time=datetime.datetime.utcnow())
    upload.update(set__headers=existing_header_names)

    # remove the uploaded file from the local temp location
    os.unlink(absolute_filename)
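
# process_uploaded_file and the schema-guessing code above lean on
# utilities.smart_cast to coerce raw cell values into datetimes, floats, or
# strings. The real implementation lives in utilities.py and may differ;
# the sketch below is only a guess at its shape, and the date formats it
# tries are assumptions.
def _smart_cast_sketch(value):
    '''coerce a raw value to datetime or float where possible,
    otherwise return it unchanged'''
    # pass through values that already have a useful type
    if isinstance(value, (datetime.datetime, float, int)):
        return value
    # try a few common date/time formats (assumed, not exhaustive)
    for date_format in ('%B %d, %Y %I:%M %p', '%Y-%m-%d %H:%M:%S'
            , '%Y-%m-%d'):
        try:
            return datetime.datetime.strptime(value, date_format)
        except (ValueError, TypeError):
            pass
    # fall back to a numeric cast
    try:
        return float(value)
    except (ValueError, TypeError):
        return value
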
def filters(org_label, project_label, filter_label):
    ''' creating filters for various purposes

    /organizations/aquaya/projects/water-quality/filters?create=true
        : create a new filter config, immediately redirect to editing

    /organizations/aquaya/projects/water-quality/filters/big-cities
        : view a filter

    /organizations/aquaya/projects/water-quality/filters/big-cities?edit=true
        : edit a filter; accepts GET or POST
    '''
    user = User.objects(email=session['email'])[0]

    orgs = Organization.objects(label=org_label)
    if not orgs:
        flash('Organization "%s" not found, sorry!' % org_label, 'warning')
        return redirect(url_for('organizations'))
    org = orgs[0]

    # permission-check
    if org not in user.organizations and not user.admin_rights:
        app.logger.error('%s tried to view a project but was denied for '
            'want of admin rights' % session['email'])
        abort(404)

    # find the project
    projects = Project.objects(label=project_label, organization=org)
    if not projects:
        flash('Project "%s" not found, sorry!' % project_label, 'warning')
        return redirect(url_for('organizations', org_label=org.label))
    project = projects[0]

    if request.method == 'POST':
        # we have a filter_label
        filters = Filter.objects(label=filter_label, project=project)
        if not filters:
            abort(404)
        # should try to avoid overriding the builtin..
        filter = filters[0]

        form_type = request.form.get('form_type', '')
        if form_type == 'info':
            if filter.name != request.form.get('name', ''):
                name = request.form.get('name', '')
                filter.name = name

                filters = Filter.objects(project=project).only('label')
                labels = [f.label for f in filters]
                filter.label = utilities.generate_label(name, labels)

            filter.description = request.form.get('description', '')
            filter.comparison = request.form.get('comparison', '')

            # header of the format header_id__4abcd0001
            header_id = request.form.get('header', '')
            header_id = header_id.split('header_id__')[1]
            header = Header.objects(id=header_id)[0]
            filter.header = header

            if header.data_type == 'datetime':
                # check if relative or absolute datetime comparison
                if filter.comparison in \
                        constants.filter_comparisons['datetime_absolute']:
                    compare_to = '%s %s' % (
                        request.form.get('date_compare_to', '')
                        , request.form.get('time_compare_to', ''))
                    try:
                        dt = datetime.datetime.strptime(
                            compare_to, '%B %d, %Y %I:%M %p')
                        filter.compare_to = {'value': dt}
                    except:
                        flash("didn't understand that date formatting; "
                            "make sure it looks like 'June 21, 2012' and "
                            "'02:45 PM'", 'error')
                        return redirect(url_for('filters'
                            , org_label=org.label
                            , project_label=project.label
                            , filter_label=filter.label
                            , edit='true'))
                elif filter.comparison in \
                        constants.filter_comparisons['datetime_relative']:
                    inputs = [request.form.get('relative_years', '')
                        , request.form.get('relative_months', '')
                        , request.form.get('relative_weeks', '')
                        , request.form.get('relative_days', '')
                        , request.form.get('relative_hours', '')
                        , request.form.get('relative_minutes', '')]
                    parsed_inputs = []
                    for i in inputs:
                        if i:
                            parsed_inputs.append(i)
                        else:
                            parsed_inputs.append(0)

                    # verify that we have numbers
                    try:
                        values = [int(i) for i in parsed_inputs]
                    except:
                        flash('input times must be whole numbers', 'error')
                        return redirect(url_for('filters'
                            , org_label=org.label
                            , project_label=project.label
                            , filter_label=filter.label
                            , edit='true'))
                    filter.compare_to = {'value': values}
                else:
                    app.logger.info('unknown comparison "%s" for project %s'
                        % (filter.comparison, project.name))
                    abort(404)
            else:
                # not datetime; cast the value we're comparing against
                compare_to = request.form.get('string_compare_to', '')
                filter.compare_to = {'value':
                    utilities.smart_cast(compare_to)}

        elif form_type == 'admin':
            # delete the filter
            name = filter.name

            # pull this filter out of any statistics
            statistics = Statistic.objects(filters=filter)
            for statistic in statistics:
                statistic.update(pull__filters=filter)

            # pull this filter out of any graphs
            graphs = Graph.objects(filters=filter)
            for graph in graphs:
                graph.update(pull__filters=filter)

            filter.delete()
            app.logger.info('%s deleted filter "%s"'
                % (session['email'], name))
            flash('filter "%s" was deleted successfully' % name, 'success')
            return redirect(url_for('filters', org_label=org.label
                , project_label=project.label))
        else:
            # bad 'form_type'
            abort(404)

        try:
            filter.save()
            flash('changes saved successfully', 'success')
            return redirect(url_for('filters', org_label=org.label
                , project_label=project.label, filter_label=filter.label))
        except:
            app.logger.error('%s experienced an error saving info about %s'
                % (session['email'], request.form['name']))
            flash('Error saving changes -- make sure filter names are '
                'unique.', 'error')
            return redirect(url_for('filters', org_label=org.label
                , project_label=project.label, filter_label=filter_label
                , edit='true'))

    if request.method == 'GET':
        if filter_label:
            filters = Filter.objects(label=filter_label, project=project)
            if not filters:
                app.logger.error('%s tried to access a filter that does '
                    'not exist' % session['email'])
                flash('Filter "%s" not found, sorry!' % filter_label
                    , 'warning')
                return redirect(url_for('projects', org_label=org.label
                    , project_label=project.label))
            filter = filters[0]

            if request.args.get('edit', '') == 'true':
                # valid comparisons
                comparisons = json.dumps(constants.filter_comparisons)

                # list of allowed absolute datetime comparisons
                absolute_comparisons = \
                    constants.filter_comparisons['datetime_absolute']

                relative_values = None
                if filter.comparison in \
                        constants.filter_comparisons['datetime_relative']:
                    relative_values = filter.compare_to['value']

                return render_template('filter_edit.html', filter=filter
                    , comparisons=comparisons
                    , absolute_datetime_comparisons=absolute_comparisons
                    , relative_datetime_compare_values=relative_values)
            else:
                # apply the filter; start with some defaults
                conditions = {
                    'project': project
                    , 'unique': True
                    , 'visible': True
                }
                entries = Entry.objects(**conditions)
                project_count = len(entries)

                if not filter.comparison:
                    return render_template('filter.html', filter=filter
                        , project_count=project_count)

                # find all matches for this filter to display
                matches = utilities.apply_filter(filter, entries)

                # sort the matches
                matches = utilities.sort_entries(project, matches)

                # pagination
                entries_per_page = 10
                pages = utilities.calculate_pages(len(matches)
                    , entries_per_page=entries_per_page)

                # validate the requested page
                current_page = utilities.validate_page_request(
                    request.args.get('page', 1), pages)

                # manually paginate
                start_index = entries_per_page * (current_page - 1)
                end_index = start_index + entries_per_page
                paginated_matches = matches[start_index:end_index]

                # list of allowed absolute datetime comparisons
                absolute_comparisons = \
                    constants.filter_comparisons['datetime_absolute']

                # if the filter is for a relative datetime, parse the value
                relative_value = None
                if filter.comparison in \
                        constants.filter_comparisons['datetime_relative']:
                    # filter.compare_to is a list [year, month, etc..]
                    # the first non-zero value will be used
                    for i in range(len(filter.compare_to['value'])):
                        if filter.compare_to['value'][i] != 0:
                            break
                    # values in the compare_to list
                    periods = ['year', 'month', 'week', 'day', 'hour'
                        , 'minute']
                    # see if we need to make it plural
                    suffix = ''
                    if filter.compare_to['value'][i] > 1:
                        suffix = 's'
                    relative_value = '%s %s%s' % (
                        filter.compare_to['value'][i], periods[i], suffix)

                return render_template('filter.html', filter=filter
                    , entries=paginated_matches
                    , total_matches=len(matches)
                    , project_count=project_count
                    , current_page=current_page
                    , number_of_pages=pages
                    , absolute_datetime_comparisons=absolute_comparisons
                    , relative_datetime_value=relative_value)

        if request.args.get('create', '') == 'true':
            # create a new filter
            # CSRF validation
            token = request.args.get('token', '')
            if not verify_token(token):
                abort(403)

            try:
                filter_name = 'filter-%s' \
                    % utilities.generate_random_string(6)
                new_filter = Filter(
                    creation_time = datetime.datetime.utcnow()
                    , creator = user
                    , label = filter_name.lower()
                    , project = project
                    , name = filter_name
                )
                new_filter.save()
                app.logger.info('filter created by %s' % session['email'])
                flash('filter created; please change the defaults'
                    , 'success')
            except:
                app.logger.error('filter creation failed for %s'
                    % session['email'])
                flash('There was an error, sorry :/', 'error')
                return redirect(url_for('projects', org_label=org.label
                    , project_label=project.label))

            # redirect to the editing screen
            return redirect(url_for('filters', org_label=org.label
                , project_label=project.label
                , filter_label=new_filter.label, edit='true'))

        # no filter in particular was specified, show 'em all
        filters = Filter.objects(project=project)

        # list of allowed absolute datetime comparisons
        absolute_comparisons = \
            constants.filter_comparisons['datetime_absolute']

        relative_values = {}
        for filter in filters:
            # if the filter is for a relative datetime, parse the value
            relative_value = None
            if filter.comparison in \
                    constants.filter_comparisons['datetime_relative']:
                # filter.compare_to is a list [year, month, etc..]
                # the first non-zero value will be used
                for i in range(len(filter.compare_to['value'])):
                    if filter.compare_to['value'][i] != 0:
                        break
                # values in the compare_to list
                periods = ['year', 'month', 'week', 'day', 'hour'
                    , 'minute']
                # see if we need to make it plural
                suffix = ''
                if filter.compare_to['value'][i] > 1:
                    suffix = 's'
                relative_value = '%s %s%s' % (
                    filter.compare_to['value'][i], periods[i], suffix)
            relative_values[filter.name] = relative_value

        return render_template('project_filters.html', project=project
            , filters=filters
            , absolute_datetime_comparisons=absolute_comparisons
            , relative_datetime_values=relative_values)
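
# The view above delegates matching to utilities.apply_filter. A minimal
# sketch of such a helper is below; the comparison names ('equal_to',
# 'greater_than', 'less_than') are assumptions -- the real
# constants.filter_comparisons also defines the absolute and relative
# datetime comparisons handled elsewhere in this module.
def _apply_filter_sketch(filter, entries):
    '''return the entries whose value for the filter's header satisfies
    the filter's comparison'''
    header_name = filter.header.name
    compare_to = filter.compare_to['value']
    matches = []
    for entry in entries:
        value = entry.values.get(header_name)
        if value is None:
            # the entry has no value for this header
            continue
        if filter.comparison == 'equal_to' and value == compare_to:
            matches.append(entry)
        elif filter.comparison == 'greater_than' and value > compare_to:
            matches.append(entry)
        elif filter.comparison == 'less_than' and value < compare_to:
            matches.append(entry)
    return matches
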
def connect_to_source(connection_id):
    ''' connects to commcare and pulls new data
    need to handle auth failures
    '''
    connections = Connection.objects(id=connection_id)
    if not connections:
        # the connection was deleted while the job waited
        return False
    connection = connections[0]
    project = connection.project

    # validate the schedule config
    if not ready_to_connect(connection):
        print 'bad connection config for "%s"' % connection.name
        return False

    # create a new log entry
    new_log_entry = ConnectionLogEntry(
        initiation_time = datetime.datetime.utcnow()
        , project = project
        , service_type = connection.service_type)
    new_log_entry.save()

    # gather data from the service
    # assumed to be commcare at the moment
    url = 'https://www.commcarehq.org/a/%s/reports/export/' \
        % connection.domain
    payload = {
        'export_tag': '"%s"' % connection.export_tag
        , 'format': 'json'
    }
    if connection.include_admin:
        payload['ufilter'] = '2'

    # include a token in the next request and commcare will only send updates
    if connection.latest_export_token:
        payload['previous_export'] = connection.latest_export_token

    credentials = HTTPDigestAuth(connection.credentials['username']
        , connection.credentials['password'])

    # start the actual request
    r = requests.get(url, auth=credentials, params=payload)

    # for the log, track how many entries we receive
    records_received = 0

    # if there is no new data, r.json will be None
    if r.status_code == 200 and r.json:
        data = r.json['#']
        headers = data['headers']

        # convert invalid characters in headers
        converted_headers = []
        for header in headers:
            header = header.replace('.', '__')
            header = header.replace('$', '__')
            converted_headers.append(header)

        if converted_headers != headers:
            print ('Invalid characters such as "$" and "." were converted '
                'to "__".')

        # carry on with the converted headers
        headers = converted_headers[:]

        project_headers = [header.name for header in project.ordered_schema]

        # check if the schema of the response is a subset of the project
        # schema
        # suppose project.ordered_schema is [1,2,3]
        # an upload with schema [1,2] is accepted
        # .. [5,6,7] is rejected
        # .. [1,2,3,4] is accepted with a warning that data from '4' will
        # be discarded
        if project.ordered_schema:
            if not connection.include_metadata:
                # response headers that are metadata will be disregarded
                scrubbed_headers = []
                for header in headers:
                    # returns None if the header should be discarded
                    header = _clean_commcare_metadata_header(header)
                    if header:
                        scrubbed_headers.append(header)
            else:
                scrubbed_headers = headers

            discarded_headers = [h for h in scrubbed_headers
                if h not in project_headers]

            if len(scrubbed_headers) == len(discarded_headers):
                # all of the headers are going to be discarded
                # reject this response
                print ('Data that was received does not match the existing '
                    'project schema and cannot be saved.')
                # abort
                return False

            if discarded_headers:
                # some headers will be ignored
                discards = ', '.join('"%s"' % h for h in discarded_headers)
                print ('The data in this response has some column headings '
                    'that are not currently in the project. Data for these '
                    'headers will not be added to the project: %s.'
                    % discards)
        else:
            # no project.ordered_schema
            discarded_headers = []

        # capture entries
        for row in data['rows']:
            values = {}
            # each row is just a list
            for i, value in enumerate(row):
                header = headers[i]
                # check to see if we should include metadata
                if not connection.include_metadata:
                    # drop all headers that are commcare metadata;
                    # returns None if the header is metadata
                    header = _clean_commcare_metadata_header(header)
                    if not header:
                        continue
                # ignore headers that were not in the ordered_schema
                if header in discarded_headers:
                    continue
                # attempt to convert values to a more sane type
                values[header] = utilities.smart_cast(value)
            if values:
                new_entry = Entry(
                    project = project
                    , connection_log = new_log_entry
                    , values = values
                )
                new_entry.save()

        # for the log, track how many entries we've received
        records_received = len(data['rows'])

        # prepare for de-duplication
        # get the set of headers from the keys of the last 'values'
        sorted_headers = sorted(values.keys())

        # create a hash of the entry values for later de-duping
        for entry in Entry.objects(connection_log=new_log_entry):
            m = hashlib.md5()
            for header in sorted_headers:
                value = entry.values[header]
                if isinstance(value, unicode):
                    m.update(value.encode('utf8'))
                else:
                    m.update(str(value))
            entry.update(set__value_hash=m.hexdigest())

        if project.ordered_schema:
            # data has been uploaded before
            # so check this submission for duplicates
            uniques = Entry.objects(project=project, unique=True)
            unique_hashes = [u['value_hash'] for u in uniques]

            # also see if any newly-uploaded entries should be hidden
            hidden_entries = Entry.objects(project=project, visible=False)
            hidden_hashes = [h['value_hash'] for h in hidden_entries]

            connection_entries = Entry.objects(connection_log=new_log_entry)
            for entry in connection_entries:
                if entry['value_hash'] not in unique_hashes:
                    entry.update(set__unique=True)
                else:
                    entry.update(set__unique=False)
                if entry['value_hash'] in hidden_hashes:
                    entry.update(set__visible=False)
        else:
            # no project schema has been defined
            # so this is the first dataset
            for entry in Entry.objects(connection_log=new_log_entry):
                entry.update(set__unique=True)

            # ('headers' at this stage is not metadata-less;
            # it has only had invalid chars removed)
            scrubbed_headers = []
            for header in headers:
                # check to see if we should include metadata headers
                if not connection.include_metadata:
                    # drop all headers that are commcare metadata;
                    # returns None if the header is metadata
                    header = _clean_commcare_metadata_header(header)
                    if not header:
                        continue
                scrubbed_headers.append(header)

            # continue with this set
            headers = scrubbed_headers[:]

            # define a default schema for the project
            ordered_schema = []

            # try to guess the data type for each header
            # take the first entry and base guesses off of this data
            entry = Entry.objects(connection_log=new_log_entry)[0]
            for header in headers:
                # attempt to cast the value
                value = utilities.smart_cast(entry.values[header])
                # compare the result of casting to other types
                if isinstance(value, datetime.datetime):
                    data_type = 'datetime'
                elif isinstance(value, float):
                    data_type = 'number'
                else:
                    data_type = 'string'
                new_header = Header(
                    data_type = data_type
                    , display = True
                    , name = header
                    , project = project
                    , label = header
                )
                new_header.save()
                ordered_schema.append(new_header)

            # save the results
            project.update(set__ordered_schema=ordered_schema)

        # save the response's export token for next time
        # it allows us to only get updates
        if 'x-commcarehq-export-token' in r.headers:
            connection.update(set__latest_export_token =
                r.headers['x-commcarehq-export-token'])

    elif r.json is None:
        # a token was included in the request and there are no updates,
        # so no json was sent
        records_received = 0

    # update the log entry
    new_log_entry.update(set__http_status_code=str(r.status_code))
    new_log_entry.update(set__records_received=records_received)
    new_log_entry.update(set__completion_time=datetime.datetime.utcnow())

    # bump the schedule's last run time
    repeating_task = connection.schedule
    repeating_task.update(set__last_run_time=datetime.datetime.utcnow())

    # and schedule the next job
    update_scheduled_connection(connection)
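
# connect_to_source relies on _clean_commcare_metadata_header to drop
# CommCare's bookkeeping columns. A minimal sketch is below; the set of
# metadata prefixes is an assumption based on typical CommCare exports and
# may not match the real helper.
def _clean_commcare_metadata_header_sketch(header):
    '''return None if the header looks like commcare metadata,
    otherwise return the header unchanged'''
    # assumed metadata prefixes; note that headers have already had '.'
    # and '$' converted to '__' by the time this is called
    metadata_prefixes = ('form__meta__', 'form__case__', 'info__')
    for prefix in metadata_prefixes:
        if header.startswith(prefix):
            return None
    return header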