Example #1
def process_uploaded_file(upload_id, absolute_filename):
    ''' parse uploaded files
    validate schema if data exists in the project
    save entries to mongodb
    save file itself to s3
    delete local copy
    pass status and errors to the upload itself
    '''
    # get handles for some objects
    upload = Upload.objects(id=upload_id)[0]
    project = upload.project
    filename = os.path.basename(absolute_filename)

    # begin parsing the file
    spreadsheet = xlrd.open_workbook(absolute_filename)
    worksheet = spreadsheet.sheet_by_index(0)

    # capture headers at row 0
    original_headers = []
    for col in range(worksheet.ncols):
        original_headers.append(worksheet.cell(0, col).value)

    # convert invalid characters
    upload_header_names = []
    for header in original_headers:
        header = header.replace('.', '__')
        header = header.replace('$', '__')
        upload_header_names.append(header)

    if upload_header_names != original_headers:
        message = {
            'status': 'info'
            , 'message': 'Invalid characters such as "$" and "." were '
            'converted to "__".'
        }
        upload.update(push__worker_messages = message)

    # check if imported file schema is subset of project schema
    # suppose project.ordered_schema is [1,2,3]
    # an upload with schema [1,2] is accepted
    # .. [5,6,7] is rejected
    # .. [1,2,3,4] is accepted with a warning that data from '4' 
    # will be discarded
    if project.ordered_schema:
        existing_header_names = [header.name for header in
            project.ordered_schema]

        discarded_header_names = [h for h in upload_header_names
            if h not in existing_header_names]

        if len(upload_header_names) == len(discarded_header_names):
            # all of the headers are going to be discarded
            # reject this upload
            message = {
                'status': 'error'
                , 'message': 'The file "%s" has nothing in common with the '
                    'existing project data and cannot be uploaded' 
                    % filename
            }
            upload.update(push__worker_messages = message)

            # clear these values to show the upload failed
            upload.update(set__filename = None)
            upload.update(set__uploaded_by = None)

            return False

        if discarded_header_names:
            # some headers will be ignored but the upload can 
            # proceed
            discards = ', '.join('"%s"' % h for h in \
                discarded_header_names)
            message = {
                'status': 'warning'
                , 'message': 'The data in "%s" has some column headings '
                'that are not currently in the project.  Data for the '
                'following headings will not be added to the project: %s.' 
                % (filename, discards)
            }
            upload.update(push__worker_messages = message)
    else:
        # no project.ordered_schema exists yet
        discarded_header_names = []
        existing_header_names = list(upload_header_names)

    # capture entries (row 0 holds the headers)
    for row in range(1, worksheet.nrows):
        values = {}
        for col in range(worksheet.ncols):
            # skip headers that were not in the existing schema
            header = upload_header_names[col]
            if header in discarded_header_names:
                continue

            cell = worksheet.cell(row, col)
            # attempt to convert dates to datetime or float
            value = utilities.smart_cast(cell.value)

            values[header] = value

        if values:
            new_entry = Entry(
                project = project
                , upload = upload
                , values = values
            )
            new_entry.save()

    # create a hash of the entry values for later de-duping
    sorted_headers = sorted(existing_header_names)
    for entry in Entry.objects(upload=upload):
        m = hashlib.md5()
        for header in sorted_headers:
            if header not in upload_header_names:
                continue
            value = entry.values[header]
            if isinstance(value, unicode):
                m.update(value.encode('utf8'))
            else:
                m.update(str(value))
        entry.update(set__value_hash = m.hexdigest())
    
    if project.ordered_schema:
        # not first upload so check project for duplicate entries
        uniques = Entry.objects(project=project
                , unique=True).only('value_hash')
        unique_hashes = [u['value_hash'] for u in uniques]
        
        # also see if any newly-uploaded entries should be hidden
        hidden_entries = Entry.objects(project=project
                , visible=False).only('value_hash')
        hidden_hashes = [h['value_hash'] for h in hidden_entries]

        upload_entries = Entry.objects(upload=upload)
        dupe_count, hidden_count = 0, 0
        for entry in upload_entries:
            if entry['value_hash'] not in unique_hashes:
                entry.update(set__unique = True)
            else:
                entry.update(set__unique = False)
                dupe_count += 1
            
            if entry['value_hash'] in hidden_hashes:
                entry.update(set__visible = False)
                hidden_count += 1

        if dupe_count:
            message = {
                'status': 'warning'
                , 'message': 'Of the %s entries in the uploaded file, '
                    '%s were duplicates.' % (len(upload_entries), dupe_count)
            }
            upload.update(push__worker_messages = message)

        if hidden_count:
            message = {
                'status': 'warning'
                , 'message': 'Of the %s entries in the uploaded file, '
                    '%s were automatically hidden based on prior edits.' 
                    % (len(upload_entries), hidden_count)
            }
            upload.update(push__worker_messages = message)

    else:
        # no schema, so this is the first upload to the project

        # all entries /should/ be unique..
        # unless there are dupes within the upload
        upload_hashes = []
        dupe_count = 0
        for entry in Entry.objects(upload=upload):
            if entry['value_hash'] not in upload_hashes:
                entry.update(set__unique = True)
            else:
                dupe_count += 1 
                entry.update(set__unique = False)

            upload_hashes.append(entry['value_hash'])

        if dupe_count:
            message = {
                'status': 'warning'
                , 'message': 'There were %s duplicate entries in the'
                    ' uploaded file.' % dupe_count
            }
            upload.update(push__worker_messages = message)
    
        # define a default schema for the project
        ordered_schema = []
        # try to guess the data type for each header
        # take the first entry and base guesses off of this data
        entry = Entry.objects(upload=upload)[0]
        for header in upload_header_names:
            value = utilities.smart_cast(entry.values[header])

            # compare result of casting to other types
            if type(value) == datetime.datetime:
                data_type = 'datetime'
            elif type(value) == float:
                data_type = 'number'
            else:
                data_type = 'string'

            new_header = Header(
                data_type = data_type
                , display = True
                , name = header
                , project = project
                , label = header
            )
            new_header.save()
            ordered_schema.append(new_header)
        
        # save results
        project.update(set__ordered_schema = ordered_schema)
        message = {
            'status': 'warning'
            , 'message': "Please verify the data types in this project's "
                "schema."
        }
        upload.update(push__worker_messages = message)
    
    # send the uploaded file to s3
    connection = boto.connect_s3(
        aws_access_key_id = app.config['AWS']['access_key_id']
        , aws_secret_access_key = app.config['AWS']['secret_access_key'])
    
    bucket_name = '%s-%s' % (app.config['UPLOADS_BUCKET']
        , app.config['AWS']['access_key_id'])
    bucket = connection.create_bucket(bucket_name.lower())

    s3_key = S3_Key(bucket)
    s3_key.key = '%s-%s' % (filename, utilities.generate_random_string(6))
    s3_key.set_contents_from_filename(absolute_filename)

    # save the data
    upload.update(set__extension = filename.rsplit('.', 1)[1])
    upload.update(set__s3_key = s3_key.key)
    upload.update(set__upload_time = datetime.datetime.utcnow())

    upload.update(set__headers = existing_header_names)

    # remove the uploaded file from the local temp location
    os.unlink(absolute_filename)
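
All three examples lean on utilities.smart_cast to coerce raw cell values into datetimes or floats before they are stored. That helper is not shown on this page; the sketch below illustrates one plausible shape for it, and the date formats it tries are assumptions rather than the project's actual behavior.

import datetime

def smart_cast(value):
    ''' best-effort conversion of a raw cell value; a sketch only, the real
    utilities.smart_cast may behave differently '''
    # numbers first: xlrd already reports numeric cells as floats
    try:
        return float(value)
    except (TypeError, ValueError):
        pass

    # then a couple of common date formats (assumed, not taken from the project)
    for date_format in ('%Y-%m-%d', '%B %d, %Y %I:%M %p'):
        try:
            return datetime.datetime.strptime(value, date_format)
        except (TypeError, ValueError):
            continue

    # anything else is left untouched (typically a string)
    return value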
Example #2
def filters(org_label, project_label, filter_label):
    ''' creating filters for various purposes
    /organizations/aquaya/projects/water-quality/filters?create=true
        : create a new filter config, immediately redirect to editing
    /organizations/aquaya/projects/water-quality/filters/big-cities
        : view a filter
    /organizations/aquaya/projects/water-quality/filters/big-cities?edit=true
        : edit a filter; accepts GET or POST
    '''
    user = User.objects(email=session['email'])[0]
    
    orgs = Organization.objects(label=org_label)
    if not orgs:
        flash('Organization "%s" not found, sorry!' % org_label, 'warning')
        return redirect(url_for('organizations'))
    org = orgs[0]

    # permission-check
    if org not in user.organizations and not user.admin_rights:
        app.logger.error('%s tried to view a project but was \
            denied for want of admin rights' % session['email'])
        abort(404)
    
    # find the project
    projects = Project.objects(label=project_label, organization=org) 
    if not projects:
        flash('Project "%s" not found, sorry!' % project_label, 'warning')
        return redirect(url_for('organizations', org_label=org.label))
    project = projects[0]

    if request.method == 'POST':
        # we have a filter_label
        filters = Filter.objects(label=filter_label, project=project)
        if not filters:
            abort(404)
        # should try to avoid overriding the builtin..
        filter = filters[0]

        form_type = request.form.get('form_type', '')
        if form_type == 'info':
            if filter.name != request.form.get('name', ''):
                name = request.form.get('name', '')
                filter.name = name

                filters = Filter.objects(project=project).only('label')
                labels = [f.label for f in filters]
                filter.label = utilities.generate_label(name, labels)

            filter.description = request.form.get('description', '')
            filter.comparison = request.form.get('comparison', '')
            
            # header of the format header_id__4abcd0001
            header_id = request.form.get('header', '')
            header_id = header_id.split('header_id__')[1]
            header = Header.objects(id=header_id)[0]
            filter.header = header

            if header.data_type == 'datetime':
                # check if relative or absolute datetime comparison
                if filter.comparison in \
                    constants.filter_comparisons['datetime_absolute']:
                    compare_to = '%s %s' % (
                        request.form.get('date_compare_to', '')
                        , request.form.get('time_compare_to', ''))
                    try:
                        dt = datetime.datetime.strptime(
                            compare_to, '%B %d, %Y %I:%M %p')
                        filter.compare_to = {'value': dt}
                    except:
                        flash("didn't understand that date formatting; make \
                            sure it looks like 'June 21, 2012' and \
                            '02:45 PM'", 'error')
                        return redirect(url_for('filters', org_label=org.label
                            , project_label=project.label
                            , filter_label=filter.label
                            , edit='true'))
                elif filter.comparison in \
                    constants.filter_comparisons['datetime_relative']:

                    inputs = [request.form.get('relative_years', '')
                            , request.form.get('relative_months', '')
                            , request.form.get('relative_weeks', '')
                            , request.form.get('relative_days', '')
                            , request.form.get('relative_hours', '')
                            , request.form.get('relative_minutes', '')]

                    parsed_inputs = []
                    for i in inputs:
                        if i:
                            parsed_inputs.append(i)
                        else:
                            parsed_inputs.append(0)

                    # verify that we have numbers
                    try:
                        values = [int(i) for i in parsed_inputs]
                    except:
                        flash('input times must be whole numbers', 'error')
                        return redirect(url_for('filters', org_label=org.label
                            , project_label=project.label
                            , filter_label=filter.label
                            , edit='true'))

                    filter.compare_to = {'value': values}

                else:
                    app.logger.info('unknown comparison "%s" for project %s' \
                        % (filter.comparison, project.name))
                    abort(404)

            else:
                # not datetime; cast the value we're comparing against
                compare_to = request.form.get('string_compare_to', '')
                filter.compare_to = {'value': utilities.smart_cast(compare_to)}

        elif form_type == 'admin':
            # delete the filter
            name = filter.name

            # pull filters out of the statistics
            statistics = Statistic.objects(filters=filter)
            for statistic in statistics:
                statistic.update(pull__filters=filter)

            # pull filters out of the graphs
            graphs = Graph.objects(filters=filter)
            for graph in graphs:
                graph.update(pull__filters=filter)

            filter.delete()
            app.logger.info('%s deleted filter "%s"' % \
                (session['email'], name))
            flash('filter "%s" was deleted successfully' % name, 'success')
            return redirect(url_for('filters', org_label=org.label
                , project_label=project.label))
        
        else:
            # bad 'form_type'
            abort(404)
       
        try:
            filter.save()
            flash('changes saved successfully', 'success')
            return redirect(url_for('filters', org_label=org.label
                , project_label=project.label, filter_label=filter.label))
        except:
            app.logger.error('%s experienced an error saving info about %s' % (
                session['email'], request.form['name']))
            flash('Error saving changes -- make sure filter names are unique.'
                , 'error')
            return redirect(url_for('filters', org_label=org.label
                , project_label=project.label, filter_label=filter_label
                , edit='true'))
        
    
    if request.method == 'GET':
        if filter_label:
            filters = Filter.objects(label=filter_label, project=project)
            if not filters:
                app.logger.error('%s tried to access a filter that does not \
                    exist' % session['email'])
                flash('Filter "%s" not found, sorry!' % filter_label
                    , 'warning')
                return redirect(url_for('projects', org_label=org.label
                    , project_label=project.label))
            filter = filters[0]

            if request.args.get('edit', '') == 'true':
                # valid comparisons
                comparisons = json.dumps(constants.filter_comparisons)

                # list of allowed absolute datetime comparisons
                absolute_comparisons = \
                    constants.filter_comparisons['datetime_absolute']

                relative_values = None
                if filter.comparison in \
                    constants.filter_comparisons['datetime_relative']:
                    relative_values = filter.compare_to['value']

                return render_template('filter_edit.html', filter=filter
                    , comparisons=comparisons
                    , absolute_datetime_comparisons=absolute_comparisons
                    , relative_datetime_compare_values=relative_values)
            else:
                # apply the filter; start with some defaults
                conditions = {
                    'project': project
                    , 'unique': True
                    , 'visible': True
                }
                entries = Entry.objects(**conditions)
                project_count = len(entries)
                
                if not filter.comparison:
                    return render_template('filter.html', filter=filter
                        , project_count=project_count)
                
                # find all matches for this filter to display
                matches = utilities.apply_filter(filter, entries)

                # sort the matches
                matches = utilities.sort_entries(project, matches)
                
                # pagination
                entries_per_page = 10
                pages = utilities.calculate_pages(len(matches)
                    , entries_per_page=entries_per_page)

                # validate the requested page
                current_page = utilities.validate_page_request(
                    request.args.get('page', 1), pages)
                
                # manually paginate
                start_index = entries_per_page * (current_page - 1)
                end_index = start_index + entries_per_page
                paginated_matches = matches[start_index:end_index]

                # list of allowed absolute datetime comparisons
                absolute_comparisons = \
                    constants.filter_comparisons['datetime_absolute']
                
                # if the filter is for a relative datetime, parse the value
                relative_value = None
                if filter.comparison in \
                    constants.filter_comparisons['datetime_relative']:
                    
                    # filter.compare_to is a list [year, month, etc..]
                    # first non zero value will be used
                    for i in range(len(filter.compare_to['value'])):
                        if filter.compare_to['value'][i] != 0:
                            break
                    # values in the compare_to list
                    periods = ['year', 'month', 'week', 'day', 'hour'
                        , 'minute']
                    # see if we need to make it plural
                    suffix = ''
                    if filter.compare_to['value'][i] > 1:
                        suffix = 's'
                    
                    relative_value = '%s %s%s' \
                        % (filter.compare_to['value'][i], periods[i], suffix)

                return render_template('filter.html', filter=filter
                    , entries = paginated_matches
                    , total_matches = len(matches)
                    , project_count=project_count
                    , current_page = current_page
                    , number_of_pages = pages
                    , absolute_datetime_comparisons=absolute_comparisons
                    , relative_datetime_value=relative_value)

        if request.args.get('create', '') == 'true':
            # create a new filter 

            # CSRF validation
            token = request.args.get('token', '')
            if not verify_token(token):
                abort(403)

            try:
                filter_name = 'filter-%s' % utilities.generate_random_string(6)
                new_filter = Filter(
                    creation_time = datetime.datetime.utcnow()
                    , creator = user
                    , label = filter_name.lower()
                    , project = project
                    , name = filter_name
                )
                new_filter.save() 
                app.logger.info('filter created by %s' % session['email'])
                flash('filter created; please change the defaults', 'success')
            except:
                app.logger.error('filter creation failed for %s' % \
                    session['email'])
                flash('There was an error, sorry :/', 'error')
                return redirect(url_for('projects', org_label=org.label
                    , project_label=project.label))
            
            # redirect to the editing screen
            return redirect(url_for('filters', org_label=org.label
                , project_label=project.label, filter_label=new_filter.label
                , edit='true'))
        
        # no filter in particular was specified, show 'em all
        filters = Filter.objects(project=project)
        
        # list of allowed absolute datetime comparisons
        absolute_comparisons = \
            constants.filter_comparisons['datetime_absolute']
        
        relative_values = {}
        for filter in filters:
            # if the filter is for a relative datetime, parse the value
            relative_value = None
            if filter.comparison in \
                constants.filter_comparisons['datetime_relative']:
                
                # filter.compare_to is a list [year, month, etc..]
                # first non zero value will be used
                for i in range(len(filter.compare_to['value'])):
                    if filter.compare_to['value'][i] != 0:
                        break
                # values in the compare_to list
                periods = ['year', 'month', 'week', 'day', 'hour'
                    , 'minute']
                # see if we need to make it plural
                suffix = ''
                if filter.compare_to['value'][i] > 1:
                    suffix = 's'
                
                relative_value = '%s %s%s' % (
                    filter.compare_to['value'][i], periods[i], suffix)

            relative_values[filter.name] = relative_value

        return render_template('project_filters.html', project=project
            , filters=filters
            , absolute_datetime_comparisons=absolute_comparisons
            , relative_datetime_values=relative_values)
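
The relative-datetime formatting above is written out twice, once for the single-filter view and once for the project-wide listing. A small helper along the lines of the sketch below could hold that shared logic; the [years, months, weeks, days, hours, minutes] ordering of compare_to['value'] comes from the comments in the view, but the function name is hypothetical.

def describe_relative_value(values):
    ''' format a [years, months, weeks, days, hours, minutes] list as a
    display string such as "3 days", using the first non-zero component;
    a sketch mirroring the inline logic in the filters view '''
    periods = ['year', 'month', 'week', 'day', 'hour', 'minute']
    for amount, period in zip(values, periods):
        if amount != 0:
            suffix = 's' if amount > 1 else ''
            return '%s %s%s' % (amount, period, suffix)
    # every component was zero; nothing meaningful to display
    return None

Each call site would then reduce to something like relative_value = describe_relative_value(filter.compare_to['value']).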
Example #3
def connect_to_source(connection_id):
    ''' connects to commcare and pulls new data
    need to handle auth failures
    '''

    connections = Connection.objects(id=connection_id)
    if not connections:
        # the connection was deleted while the job waited
        return False
    connection = connections[0]

    project = connection.project
    
    # validate the schedule config
    if not ready_to_connect(connection):
        print 'bad connection config for "%s"' % connection.name
        return False
    
    # create a new log entry
    new_log_entry = ConnectionLogEntry(
        initiation_time = datetime.datetime.utcnow()
        , project = project
        , service_type = connection.service_type)
    new_log_entry.save()

    # gather data from the service
    # assumed to be commcare at the moment
    url = 'https://www.commcarehq.org/a/%s/reports/export/' % connection.domain
    payload = {
        'export_tag': '"%s"' % connection.export_tag
        , 'format': 'json'
    }
    if connection.include_admin:
        payload['ufilter'] = '2'
    
    # include a token in the next request and commcare will only send updates
    if connection.latest_export_token:
        payload['previous_export'] = connection.latest_export_token

    credentials = HTTPDigestAuth(connection.credentials['username']
        , connection.credentials['password'])
    
    # start the actual request
    r = requests.get(url, auth=credentials, params=payload)
    
    # if there is no new data, r.json will be None
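    # (note: r.json is used as a property below, which assumes a pre-1.0
    # version of requests; newer releases expose it as the r.json() method)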
    if r.status_code == 200 and r.json:
        data = r.json['#']
        headers = data['headers']

        # convert invalid characters in headers
        converted_headers = []
        for header in headers:
            header = header.replace('.', '__')
            header = header.replace('$', '__')
            converted_headers.append(header)
        
        if converted_headers != headers:
            print 'Invalid characters such as "$" and "." were converted to \
                "__".'
        
        # carry on with the converted headers
        headers = converted_headers[:]
            
        project_headers = [header.name for header in project.ordered_schema]
        
        # check if schema of response is subset of project schema
        # suppose project.ordered_schema is [1,2,3]
        # an upload with schema [1,2] is accepted
        # .. [5,6,7] is rejected
        # .. [1,2,3,4] is accepted with a warning that data from '4' will be \
        # discarded
        if project.ordered_schema:
            if not connection.include_metadata:
                # response headers that are metadata will be disregarded
                scrubbed_headers = []
                for header in headers:
                    # returns None if header should be discarded
                    header = _clean_commcare_metadata_header(header)
                    if header:
                        scrubbed_headers.append(header)
            else:
                scrubbed_headers = headers

            # headers
            discarded_headers = [h for h in scrubbed_headers \
                if h not in project_headers]

            if len(scrubbed_headers) == len(discarded_headers):
                # all of the headers are going to be discarded
                # reject this response
                print 'Data that was received does not match the existing \
                    project schema and cannot be saved.'
                # abort
                return False

            if discarded_headers:
                # some headers will be ignored
                discards = ', '.join('"%s"' % h for h in discarded_headers)
                print 'The data in this response has some column headings \
                    that are not currently in the project.  Data for these \
                    headers will not be added to the project: %s.' % discards
        else:
            # no project.ordered_schema
            discarded_headers = []

        # capture entries
        for row in data['rows']:
            values = {}
            # each row is just a list
            for i, value in enumerate(row):
                header = headers[i]
                
                # check to see if we should include metadata
                if not connection.include_metadata:
                    # drop all headers that are commcare metadata
                    header = _clean_commcare_metadata_header(header)
                    # returns None if header is metadata
                    if not header:
                        continue

                # ignore headers that were not in the ordered_schema
                if header in discarded_headers:
                    continue
                
                # attempt to convert values to a more sane type
                values[header] = utilities.smart_cast(value)

            if values:
                new_entry = Entry(
                    project = project
                    , connection_log = new_log_entry
                    , values = values
                )
                new_entry.save()
        
        # for the log, track how many entries we've received
        records_received = len(data['rows'])
       
        # prepare for de-duplication
        # get set of headers from the keys of the last 'values'
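        # note: 'values' still refers to the dict built for the last row in
        # the loop above, so this assumes the response contained at least one row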
        sorted_headers = values.keys()
        sorted_headers.sort()
        # create a hash of the entry values for later de-duping
        for entry in Entry.objects(connection_log=new_log_entry):
            m = hashlib.md5()

            for header in sorted_headers: 
                value = entry.values[header]
                if isinstance(value, unicode):
                    m.update(value.encode('utf8'))
                else:
                    m.update(str(value))
            entry.update(set__value_hash = m.hexdigest())
       

        if project.ordered_schema:
            # data has been uploaded before
            # so check this submission for duplicates
            uniques = Entry.objects(project=project, unique=True)
            unique_hashes = [u['value_hash'] for u in uniques]
            
            # also see if any newly-uploaded entries should be hidden
            hidden_entries = Entry.objects(project=project
                , visible=False)
            hidden_hashes = [h['value_hash'] for h in hidden_entries]

            connection_entries = Entry.objects(connection_log=new_log_entry)
            for entry in connection_entries:
                if entry['value_hash'] not in unique_hashes:
                    entry.update(set__unique = True)
                else:
                    entry.update(set__unique = False)
                
                if entry['value_hash'] in hidden_hashes:
                    entry.update(set__visible = False)

        else:
            # no project schema has been defined
            # so this is the first dataset
            for entry in Entry.objects(connection_log=new_log_entry):
                entry.update(set__unique = True)
        
            # ('headers' at this stage is not metadata-less
            # it has only had invalid chars removed)
            scrubbed_headers = []
            for header in headers:
                # check to see if we should include metadata headers
                if not connection.include_metadata:
                    # drop all headers that are commcare metadata
                    header = _clean_commcare_metadata_header(header)
                    # returns None if header is metadata
                    if not header:
                        continue
                scrubbed_headers.append(header)
            # continue with this set
            headers = scrubbed_headers[:]

            # define a default schema for the project
            ordered_schema = []
            # try to guess the data type for each header
            # take the first entry and base guesses off of this data
            entry = Entry.objects(connection_log=new_log_entry)[0]
            for header in headers:
                # attempt to cast the value
                value = utilities.smart_cast(entry.values[header])

                # compare result of casting to other types
                if isinstance(value, datetime.datetime):
                    data_type = 'datetime'
                elif isinstance(value, float):
                    data_type = 'number'
                else:
                    data_type = 'string'

                new_header = Header(
                    data_type = data_type
                    , display = True
                    , name = header
                    , project = project
                    , label = header
                )
                new_header.save()
                ordered_schema.append(new_header)

            # save results
            project.update(set__ordered_schema = ordered_schema)

        
        # save the response export token for next time
        # allows us to only get updates
        if 'x-commcarehq-export-token' in r.headers.keys():
            connection.update(set__latest_export_token = \
                r.headers['x-commcarehq-export-token'])

    else:
        # either the request failed outright, or a token was included in the
        # request and there are no updates (in which case no json is sent);
        # set a default so the log entry below can still be written
        records_received = 0

    # update the log entry
    new_log_entry.update(set__http_status_code = str(r.status_code))
    new_log_entry.update(set__records_received = records_received)
    new_log_entry.update(set__completion_time = datetime.datetime.utcnow())

    # bump the schedule's last run time
    repeating_task = connection.schedule
    repeating_task.update(set__last_run_time = datetime.datetime.utcnow())
    
    # and schedule the next job
    update_scheduled_connection(connection)
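
Example #1 and this connector compute the same MD5 value_hash inline. The sketch below pulls that shared logic into a standalone helper; the function name is hypothetical, but the hashing scheme (sorted headers, UTF-8 encoding for unicode values, str() for everything else) is the one used in both examples.

import hashlib

def compute_value_hash(values, headers):
    ''' hash an entry's values over a sorted list of headers so that identical
    rows produce identical digests; a sketch of the inline logic above '''
    m = hashlib.md5()
    for header in sorted(headers):
        if header not in values:
            continue
        value = values[header]
        if isinstance(value, unicode):
            m.update(value.encode('utf8'))
        else:
            m.update(str(value))
    return m.hexdigest()

With such a helper, both call sites would reduce to something like entry.update(set__value_hash=compute_value_hash(entry.values, sorted_headers)).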