Пример #1
0
class ExportConfiguration(DocumentSchema):
    """
    Just a way to configure a single export. Used in the group export config.

    ``name`` and ``format`` together determine ``filename``; ``index`` is the
    JSON value used to derive ``type``.
    """
    index = JsonProperty()
    name = StringProperty()
    format = StringProperty()

    @property
    def filename(self):
        """Return ``"<name>.<extension>"`` for the configured format."""
        return "%s.%s" % (self.name, Format.from_format(self.format).extension)

    @property
    def type(self):
        """
        Return ``'form'`` or ``'case'`` based on the second element of
        ``self.index``; falls back to ``'form'`` when that element cannot be
        inspected.
        """
        # hack - make this backwards compatible with form/case categorization
        # these might only exist in the care-bihar domain or wherever else
        # they've been manually created in the DB.
        try:
            return 'form' if 'http:' in self.index[1] else 'case'
        except (IndexError, TypeError):
            # IndexError: index is too short.
            # TypeError: index is unset (None) or its second element does not
            # support ``in``.
            # arbitrarily choose default so it doesn't stay hidden from the UI forever.
            return 'form'

    def __repr__(self):
        """Return ``"<name> (<index>)"`` as the native string type."""
        text = '%s (%s)' % (self.name, self.index)
        if str is bytes:
            # Python 2: __repr__ must return a byte string, so encode any unicode.
            return text.encode('utf-8')
        # Python 3: returning bytes from __repr__ raises TypeError.
        return text
Пример #2
0
class SavedExportSchema(BaseSavedExportSchema, UnicodeMixIn):
    """
    Lets you save an export format with a schema and list of columns
    and display names.
    """

    name = StringProperty()
    default_format = StringProperty()

    is_safe = BooleanProperty(default=False)  # Is the export de-identified?
    # self.index should always match self.schema.index
    # needs to be here so we can use in couch views
    index = JsonProperty()

    # id of an ExportSchema for checkpointed schemas
    schema_id = StringProperty()

    # user-defined table configuration
    tables = SchemaListProperty(ExportTable)

    # For us right now, 'form' or 'case'
    type = StringProperty()

    # ID of the new style export that it was converted to
    converted_saved_export_id = StringProperty()

    def __unicode__(self):
        return "%s (%s)" % (self.name, self.index)

    def transform(self, doc):
        """Per-document hook run before export; returns ``doc`` unchanged here."""
        return doc

    @property
    def global_transform_function(self):
        """Function handed to each table's ``trim``; ``identity`` here."""
        # will be called on every value in the doc during export
        return identity

    @property
    @memoized
    def schema(self):
        """The ``ExportSchema`` document referenced by ``schema_id`` (memoized)."""
        return ExportSchema.get(self.schema_id)

    @property
    def table_name(self):
        """``sheet_name`` when it is set, otherwise this document's ``_id`` as a string."""
        return self.sheet_name if self.sheet_name else "%s" % self._id

    @classmethod
    def default(cls, schema, name="", type='form'):
        """
        Build an unsaved instance for ``schema`` containing a single default
        table created from the index of the schema's first table.
        """
        return cls(name=name,
                   index=schema.index,
                   schema_id=schema.get_id,
                   tables=[ExportTable.default(schema.tables[0][0])],
                   type=type)

    @property
    @memoized
    def tables_by_index(self):
        """Mapping of table index -> configured ``ExportTable`` (memoized)."""
        return {t.index: t for t in self.tables}

    def get_table_configuration(self, index):
        """
        Return a dict describing the table at ``index`` with the keys
        ``index``, ``display``, ``column_configuration`` and ``selected``.

        Tables that are not configured on this export are reported as
        unselected, with an empty display name and every schema column
        unselected.
        """
        def column_configuration():
            columns = self.schema.get_columns(index)
            if index in self.tables_by_index:
                return list(
                    self.tables_by_index[index].get_column_configuration(
                        columns))
            else:
                return [
                    ExportColumn(index=c,
                                 display='').to_config_format(selected=False)
                    for c in columns
                ]

        def display():
            if index in self.tables_by_index:
                return self.tables_by_index[index].display
            else:
                return ''

        return {
            "index": index,
            "display": display(),
            "column_configuration": column_configuration(),
            "selected": index in self.tables_by_index
        }

    def get_table_headers(self, override_name=False):
        """
        Return a generator of ``(table identifier, [header row])`` pairs, one
        per configured table. With ``override_name`` the first table is
        identified by ``table_name`` instead of its index.
        """
        return ((self.table_name if override_name and i == 0 else t.index,
                 [t.get_headers_row()]) for i, t in enumerate(self.tables))

    @property
    def table_configuration(self):
        """List of ``get_table_configuration`` results for every table in the schema."""
        return [
            self.get_table_configuration(index)
            for index, _cols in self.schema.tables
        ]

    def update_schema(self):
        """
        Update the schema for this object to include the latest columns from
        any relevant docs.

        Does NOT save the doc, just updates the in-memory object.
        """
        from couchexport.schema import build_latest_schema
        schema = build_latest_schema(self.index)
        if schema:
            self.set_schema(schema)

    def set_schema(self, schema):
        """
        Set the schema for this object.

        Does NOT save the doc, just updates the in-memory object.
        """
        self.schema_id = schema.get_id

    def trim(self, document_table, doc, apply_transforms=True):
        """
        Keep only the tables configured on this export and let each
        configured table trim its own rows.

        Returns a list of ``(table_index, trimmed_rows)`` tuples.
        """
        tables = []
        for table_index, data in document_table:
            if table_index in self.tables_by_index:
                # todo: currently (index, rows) instead of (display, rows); where best to convert to display?
                tables.append(
                    (table_index, self.tables_by_index[table_index].trim(
                        data, doc, apply_transforms,
                        self.global_transform_function)))
        return tables

    def get_export_components(self, previous_export_id=None, filter=None):
        """
        Build the couchexport configuration for this export.

        Returns ``(config, updated_schema, export_schema_checkpoint)``.
        """
        # local import: this name differs from the module-level
        # ExportConfiguration used by to_export_config
        from couchexport.export import ExportConfiguration

        database = get_db()

        config = ExportConfiguration(database, self.index, previous_export_id,
                                     self.filter & filter)

        # get and checkpoint the latest schema
        updated_schema = config.get_latest_schema()
        export_schema_checkpoint = config.create_new_checkpoint()
        return config, updated_schema, export_schema_checkpoint

    @staticmethod
    def _remove_temp_file(path):
        """Best-effort removal of a temp file that will not be handed to a caller."""
        try:
            os.remove(path)
        except OSError:
            # already gone or not removable; cleanup must never mask the
            # result or the exception that is in flight
            pass

    def get_export_files(self,
                         format=None,
                         previous_export=None,
                         filter=None,
                         process=None,
                         max_column_size=None,
                         apply_transforms=True,
                         limit=0,
                         **kwargs):
        """
        Write the export to a temporary file.

        :param format: export format; defaults to ``default_format`` or
            ``Format.XLS_2007``.
        :param previous_export: id of a previous export checkpoint, passed to
            ``get_export_components``.
        :param filter: extra filter combined with ``self.filter``.
        :param process: when given, progress is reported through
            ``DownloadBase.set_progress``.
        :param max_column_size: passed through to the writer.
        :param apply_transforms: whether ``transform`` and the table-level
            transforms are applied.
        :param limit: maximum number of docs to write; ``0`` means no limit.
        :returns: the writer's preview for ``Format.PYTHON_DICT``, otherwise
            an ``ExportFiles`` pointing at the temporary file.
        """
        from couchexport.export import get_writer, get_formatted_rows
        if not format:
            format = self.default_format or Format.XLS_2007

        config, updated_schema, export_schema_checkpoint = self.get_export_components(
            previous_export, filter)

        # transform docs onto output and save
        writer = get_writer(format)

        # open the doc and the headers
        formatted_headers = list(self.get_table_headers())
        fd, path = tempfile.mkstemp()
        try:
            with os.fdopen(fd, 'wb') as tmp:
                writer.open(formatted_headers,
                            tmp,
                            max_column_size=max_column_size,
                            table_titles={table.index: table.display
                                          for table in self.tables
                                          if table.display})

                total_docs = len(config.potentially_relevant_ids)
                if process:
                    DownloadBase.set_progress(process, 0, total_docs)
                for i, doc in config.enum_docs():
                    # i counts docs already written (progress below reports
                    # i + 1), so stop once `limit` docs are out
                    if limit and i >= limit:
                        break
                    if self.transform and apply_transforms:
                        doc = self.transform(doc)
                    formatted_tables = self.trim(
                        get_formatted_rows(doc, updated_schema, separator="."),
                        doc,
                        apply_transforms=apply_transforms)
                    writer.write(formatted_tables)
                    if process:
                        DownloadBase.set_progress(process, i + 1, total_docs)

                writer.close()
        except Exception:
            # the path never reaches the caller on failure; don't leak the file
            self._remove_temp_file(path)
            raise

        if format == Format.PYTHON_DICT:
            # only the preview is returned, so the temp file is unreachable
            # after this point and can be removed
            try:
                return writer.get_preview()
            finally:
                self._remove_temp_file(path)

        return ExportFiles(path, export_schema_checkpoint, format)

    def get_preview_data(self, export_filter, limit=50):
        """Return the ``Format.PYTHON_DICT`` preview of at most ``limit`` docs."""
        return self.get_export_files(Format.PYTHON_DICT,
                                     None,
                                     export_filter,
                                     limit=limit)

    def download_data(self,
                      format="",
                      previous_export=None,
                      filter=None,
                      limit=0):
        """
        If there is data, return an HTTPResponse with the appropriate data.
        If there is not data returns None.
        """
        from couchexport.shortcuts import export_response
        files = self.get_export_files(format,
                                      previous_export,
                                      filter,
                                      limit=limit)
        if files is None:
            # honour the documented "no data" contract instead of failing on
            # attribute access below
            return None
        return export_response(files.file, files.format, self.name)

    def to_export_config(self):
        """
        Return an ExportConfiguration object that represents this.
        """
        # confusingly, the index isn't the actual index property,
        # but is the index appended with the id to this document.
        # this is to avoid conflicts among multiple exports
        if isinstance(self.index, six.string_types):
            index = "%s-%s" % (self.index, self._id)
        else:
            # self.index required to be a string or list
            index = self.index + [self._id]
        return ExportConfiguration(index=index,
                                   name=self.name,
                                   format=self.default_format)

    def custom_validate(self):
        """Raise ``CustomExportValidationError`` if an XLS export has a table with more than 255 columns."""
        if self.default_format == Format.XLS:
            for table in self.tables:
                if len(table.columns) > 255:
                    raise CustomExportValidationError(
                        "XLS files can only have 255 columns")

    # replaces `sheet_name = StringProperty()`
    def __get_sheet_name(self):
        return self.tables[0].display

    def __set_sheet_name(self, value):
        self.tables[0].display = value

    # sheet_name is stored as the display name of the first table
    sheet_name = property(__get_sheet_name, __set_sheet_name)

    @classmethod
    def wrap(cls, data):
        """Wrap raw ``data``, first dropping any legacy ``sheet_name`` key."""
        # since this is a property now, trying to wrap it will fail hard
        if 'sheet_name' in data:
            del data['sheet_name']
        return super(SavedExportSchema, cls).wrap(data)
Пример #3
0
class ExportSchema(Document, UnicodeMixIn):
    """
    An export schema that can store intermittent contents of the export so
    that the entire doc list doesn't have to be used to generate the export
    """
    index = JsonProperty()
    schema = DictProperty()
    timestamp = TimeStampProperty()

    def __unicode__(self):
        return "%s: %s" % (json.dumps(self.index), self.timestamp)

    @classmethod
    def wrap(cls, data):
        """
        Wrap raw ``data``, rewriting timestamps that start with ``'1-01-01'``
        to ``'1970-01-01T00:00:00Z'`` first.
        """
        # `or ''` also covers a timestamp key that is present but None,
        # which would otherwise fail on .startswith
        timestamp = data.get('timestamp') or ''
        if timestamp.startswith('1-01-01'):
            data['timestamp'] = '1970-01-01T00:00:00Z'

        return super(ExportSchema, cls).wrap(data)

    @classmethod
    def last(cls, index):
        """
        Return the last checkpoint (in view key order) for ``index`` from the
        ``couchexport/schema_checkpoints`` view, or whatever ``.one()`` yields
        when there is none.
        """
        return cls.view(
            "couchexport/schema_checkpoints",
            startkey=[json.dumps(index), {}],
            endkey=[json.dumps(index)],
            descending=True,
            limit=1,
            include_docs=True,
            reduce=False,
        ).one()

    @classmethod
    def get_all_checkpoints(cls, index):
        """Yield every checkpoint for ``index``, wrapped as ``cls``."""
        doc_ids = [
            result["id"] for result in cls.get_db().view(
                "couchexport/schema_checkpoints",
                startkey=[json.dumps(index)],
                endkey=[json.dumps(index), {}],
                reduce=False,
            )
        ]
        for doc in iter_docs(cls.get_db(), doc_ids):
            yield cls.wrap(doc)

    # per-instance cache for `tables`; None means "not computed yet"
    _tables = None

    @property
    def tables(self):
        """
        List of ``(table index, first header row)`` pairs derived from
        ``self.schema``; computed once and cached on the instance.
        """
        if self._tables is None:
            from couchexport.export import get_headers
            headers = get_headers(self.schema, separator=".")
            self._tables = [(index, row[0]) for index, row in headers]
        return self._tables

    @property
    def table_dict(self):
        """``tables`` as a dict keyed by table index."""
        return dict(self.tables)

    def get_columns(self, index):
        """Return ``['id']`` followed by the header data of the table at ``index``."""
        return ['id'] + self.table_dict[index].data

    def get_all_ids(self, database=None):
        """
        Return the set of doc ids in the ``couchexport/schema_index`` view for
        this schema's index, read from ``database`` (default: this class's db).
        """
        database = database or self.get_db()
        return {
            result['id'] for result in database.view(
                "couchexport/schema_index",
                reduce=False,
                **get_schema_index_view_keys(self.index)).all()
        }

    def get_new_ids(self, database=None):
        """
        Return the set of doc ids in ``couchexport/schema_index`` whose key is
        at or after this checkpoint's timestamp.

        Raises ``AssertionError`` when the checkpoint has no timestamp.
        """
        database = database or self.get_db()
        assert self.timestamp, 'exports without timestamps are no longer supported.'
        tag_as_list = force_tag_to_list(self.index)
        startkey = tag_as_list + [self.timestamp.isoformat()]
        endkey = tag_as_list + [{}]
        return {
            result['id']
            for result in database.view("couchexport/schema_index",
                                        reduce=False,
                                        startkey=startkey,
                                        endkey=endkey)
        }

    def get_new_docs(self, database=None):
        """Iterate over the docs whose ids are returned by ``get_new_ids``."""
        database = database or self.get_db()
        # iter_docs takes the database first, as in get_all_checkpoints
        return iter_docs(database, self.get_new_ids(database))
Пример #4
0
class DefaultExportSchema(BaseSavedExportSchema):
    """
    Export whose name and only index are both taken from its ``index``
    property.
    """
    index = JsonProperty()
    type = StringProperty()

    @property
    def name(self):
        """The export's name is simply its index."""
        return self.index

    @property
    def indices(self):
        """Single-element list containing ``self.index``."""
        return [self.index]

    def parse_headers(self, headers):
        """Return one ``(table_name, header)`` pair built from the first entry of ``headers``."""
        first_header = headers[0][1]
        return [(self.table_name, first_header)]

    def remap_tables(self, tables):
        """Hook applied to both headers and rows before writing; identity here."""
        # can be overridden to rename/remove default stuff from exports
        return tables

    def get_export_components(self, previous_export_id=None, filter=None):
        """Delegate to ``couchexport.export.get_export_components`` with ``self.filter & filter``."""
        from couchexport.export import get_export_components
        return get_export_components(self.index,
                                     previous_export_id,
                                     filter=self.filter & filter)

    def get_export_files(self,
                         format='',
                         previous_export_id=None,
                         filter=None,
                         use_cache=True,
                         max_column_size=2000,
                         separator='|',
                         process=None,
                         **kwargs):
        """
        Write the export to a temporary file and return it.

        :param format: export format handed to ``get_writer``.
        :param previous_export_id: id of a previous export checkpoint.
        :param filter: optional filter; the cache is only read when it is None.
        :param use_cache: read from / write to the Django cache.
        :param max_column_size: passed through to the writer.
        :param separator: separator used when building headers and rows.
        :param process: when given, progress is reported through
            ``DownloadBase.set_progress``.
        :returns: ``ExportFiles(path, checkpoint)``, or ``None`` when no
            checkpoint was produced.
        """
        # the APIs of how these methods are broken down suck, but at least
        # it's DRY
        from couchexport.export import get_writer, get_export_components, get_headers, get_formatted_rows
        from django.core.cache import cache
        import hashlib

        export_tag = self.index

        CACHE_TIME = 1 * 60 * 60  # cache for 1 hour, in seconds

        def _build_cache_key(tag, prev_export_id, format, max_column_size):
            # md5 only shortens/normalizes the key; it is not used for security
            readable_key = "couchexport_:%s:%s:%s:%s" % (
                tag, prev_export_id, format, max_column_size)
            return hashlib.md5(readable_key.encode('utf-8')).hexdigest()

        # check cache, only supported for filterless queries, currently
        cache_key = _build_cache_key(export_tag, previous_export_id, format,
                                     max_column_size)
        if use_cache and filter is None:
            cached_data = cache.get(cache_key)
            if cached_data:
                (tmp, checkpoint) = cached_data
                return ExportFiles(tmp, checkpoint)

        fd, path = tempfile.mkstemp()
        with os.fdopen(fd, 'wb') as tmp:
            schema_index = export_tag
            # NOTE(review): this calls the module-level helper with the raw
            # `filter`, not `self.filter & filter` as self.get_export_components
            # does -- confirm that is intended.
            config, updated_schema, export_schema_checkpoint = get_export_components(
                schema_index, previous_export_id, filter)
            if config:
                writer = get_writer(format)

                # get cleaned up headers
                formatted_headers = self.remap_tables(
                    get_headers(updated_schema, separator=separator))
                writer.open(formatted_headers,
                            tmp,
                            max_column_size=max_column_size)

                total_docs = len(config.potentially_relevant_ids)
                if process:
                    DownloadBase.set_progress(process, 0, total_docs)
                for i, doc in config.enum_docs():
                    if self.transform:
                        doc = self.transform(doc)

                    writer.write(
                        self.remap_tables(
                            get_formatted_rows(doc,
                                               updated_schema,
                                               include_headers=False,
                                               separator=separator)))
                    if process:
                        DownloadBase.set_progress(process, i + 1, total_docs)
                writer.close()

            checkpoint = export_schema_checkpoint

        if checkpoint:
            if use_cache:
                cache.set(cache_key, (path, checkpoint), CACHE_TIME)
            return ExportFiles(path, checkpoint)

        # Nothing is returned to the caller, so nobody will ever learn `path`:
        # remove the temp file instead of leaking it.
        try:
            os.remove(path)
        except OSError:
            # best-effort cleanup; the "no data" result is still valid
            pass
        return None
Пример #5
0
class ExportSchema(Document, UnicodeMixIn):
    """
    An export schema that can store intermittent contents of the export so
    that the entire doc list doesn't have to be used to generate the export
    """
    index = JsonProperty()
    seq = StringProperty()  # semi-deprecated
    schema = DictProperty()
    timestamp = TimeStampProperty()

    def __unicode__(self):
        return "%s: %s" % (json.dumps(self.index), self.seq)

    @property
    def is_bigcouch(self):
        """True when ``seq`` cannot be parsed as an integer."""
        try:
            int(self.seq)
            return False
        except ValueError:
            return True

    @classmethod
    def wrap(cls, data):
        """
        Wrap raw ``data``, coercing an integer ``seq`` to a string.

        Checkpoints without a timestamp are reported through
        ``notify_exception``; their ``seq`` is reset to ``"0"`` (and saved)
        when it is greater than the database's current ``update_seq``.

        Raises ``Exception`` when such a checkpoint has a non-integer seq.
        """
        # NOTE: `long` and `unicode` are Python 2 builtins
        if isinstance(data.get('seq'), (int, long)):
            data['seq'] = unicode(data['seq'])
        ret = super(ExportSchema, cls).wrap(data)
        if not ret.timestamp:
            # these won't work on bigcouch so we want to know if this happens
            notify_exception(
                None, 'an export without a timestamp was accessed! %s (%s)' %
                (ret.index, ret._id))
            # this isn't the cleanest nor is it perfect but in the event
            # this doc traversed databases somehow and now has a bad seq
            # id, make sure to just reset it to 0.
            # This won't catch if the seq is bad but not greater than the
            # current one).
            current_seq = cls.get_db().info()["update_seq"]
            try:
                if int(current_seq) < int(ret.seq):
                    ret.seq = "0"
                    ret.save()
            except ValueError:
                # seqs likely weren't ints (e.g. bigcouch)
                # this should never be possible (anything on bigcouch should
                # have a timestamp) so let's fail hard
                raise Exception(
                    'export %s is in a bad state (no timestamp or integer seq)'
                    % ret._id)
        # TODO? handle seq -> datetime migration
        return ret

    @classmethod
    def last(cls, index):
        """
        Return the last checkpoint for ``index`` from the ``by_timestamp``
        rows of ``couchexport/schema_checkpoints``; when that checkpoint has
        no timestamp, return the last one from the ``by_seq`` rows instead.
        """
        # search first by timestamp, then fall back to seq id
        shared_kwargs = {
            'descending': True,
            'limit': 1,
            'include_docs': True,
            'reduce': False,
        }
        ret = cls.view("couchexport/schema_checkpoints",
                       startkey=['by_timestamp',
                                 json.dumps(index), {}],
                       endkey=['by_timestamp',
                               json.dumps(index)],
                       **shared_kwargs).one()
        if ret and not ret.timestamp:
            # we found a bunch of old checkpoints but they only
            # had seq ids, so use those instead
            ret = cls.view("couchexport/schema_checkpoints",
                           startkey=['by_seq', json.dumps(index), {}],
                           endkey=['by_seq', json.dumps(index)],
                           **shared_kwargs).one()
        return ret

    @classmethod
    def get_all_indices(cls):
        """
        Yield every distinct index found in the ``by_timestamp`` rows of
        ``couchexport/schema_checkpoints``, decoded from JSON; keys that are
        not valid JSON are skipped.
        """
        ret = cls.get_db().view("couchexport/schema_checkpoints",
                                startkey=['by_timestamp'],
                                endkey=['by_timestamp', {}],
                                reduce=True,
                                group=True,
                                group_level=2)
        for row in ret:
            index = row['key'][1]
            try:
                yield json.loads(index)
            except ValueError:
                # ignore this for now - should just be garbage data
                # print "poorly formatted index key %s" % index
                pass

    @classmethod
    def get_all_checkpoints(cls, index):
        """Return the view result of all ``by_timestamp`` checkpoints for ``index``."""
        return cls.view("couchexport/schema_checkpoints",
                        startkey=['by_timestamp',
                                  json.dumps(index)],
                        endkey=['by_timestamp',
                                json.dumps(index), {}],
                        include_docs=True,
                        reduce=False)

    # per-instance cache for `tables`; None means "not computed yet"
    _tables = None

    @property
    def tables(self):
        """
        List of ``(table index, first header row)`` pairs derived from
        ``self.schema``; computed once and cached on the instance.
        """
        if self._tables is None:
            from couchexport.export import get_headers
            headers = get_headers(self.schema, separator=".")
            self._tables = [(index, row[0]) for index, row in headers]
        return self._tables

    @property
    def table_dict(self):
        """``tables`` as a dict keyed by table index."""
        return dict(self.tables)

    def get_columns(self, index):
        """Return ``['id']`` followed by the header data of the table at ``index``."""
        return ['id'] + self.table_dict[index].data

    def get_all_ids(self, database=None):
        """
        Return the set of doc ids in the ``couchexport/schema_index`` view for
        this schema's index, read from ``database`` (default: this class's db).
        """
        database = database or self.get_db()
        return {
            result['id'] for result in database.view(
                "couchexport/schema_index",
                reduce=False,
                **get_schema_index_view_keys(self.index)).all()
        }

    def get_new_ids(self, database=None):
        """
        Return the set of doc ids considered new since this checkpoint, using
        the timestamp when there is one and the seq otherwise.
        """
        # TODO: deprecate/remove old way of doing this
        database = database or self.get_db()
        if self.timestamp:
            return self._ids_by_timestamp(database)
        else:
            return self._ids_by_seq(database)

    def _ids_by_seq(self, database):
        """
        Return the ids reported by the changes consumer since ``self.seq``
        that also belong to this index; all ids when seq is ``"0"`` or unset.
        """
        if self.seq == "0" or self.seq is None:
            # query the database we were given, not the class default
            return self.get_all_ids(database)

        consumer = Consumer(database)
        view_results = consumer.fetch(since=self.seq)
        if view_results:
            include_ids = {res["id"] for res in view_results["results"]}
            return include_ids.intersection(self.get_all_ids(database))
        else:
            # sometimes this comes back empty. I think it might be a bug
            # in couchdbkit, but it's impossible to consistently reproduce.
            # For now, just assume this is fine.
            return set()

    def _ids_by_timestamp(self, database):
        """
        Return the set of doc ids in ``couchexport/schema_index`` whose key is
        at or after this checkpoint's timestamp.
        """
        tag_as_list = force_tag_to_list(self.index)
        startkey = tag_as_list + [self.timestamp.isoformat()]
        endkey = tag_as_list + [{}]
        return {
            result['id']
            for result in database.view("couchexport/schema_index",
                                        reduce=False,
                                        startkey=startkey,
                                        endkey=endkey)
        }

    def get_new_docs(self, database=None):
        """Iterate over the docs whose ids are returned by ``get_new_ids``."""
        database = database or self.get_db()
        # iter_docs is assumed to take the database first, then the ids
        # (database, ids) -- TODO confirm against its definition
        return iter_docs(database, self.get_new_ids(database))