class ExportSchema(Document, UnicodeMixIn):
    """
    An export schema that can store intermittent contents of the export so
    that the entire doc list doesn't have to be used to generate the export
    """
    index = JsonProperty()           # export index (tag); JSON-serialized for view keys
    schema = DictProperty()          # accumulated schema of the exported docs
    timestamp = TimeStampProperty()  # checkpoint time; view keys sort on its isoformat

    def __unicode__(self):
        return "%s: %s" % (json.dumps(self.index), self.timestamp)

    @classmethod
    def wrap(cls, data):
        # Repair corrupted year-1 timestamps (e.g. '1-01-01...') by
        # resetting them to the unix epoch before normal wrapping.
        if data.get('timestamp', '').startswith('1-01-01'):
            data['timestamp'] = '1970-01-01T00:00:00Z'
        return super(ExportSchema, cls).wrap(data)

    @classmethod
    def last(cls, index):
        """Return the most recent checkpoint for ``index``, or None."""
        return cls.view(
            "couchexport/schema_checkpoints",
            startkey=[json.dumps(index), {}],
            endkey=[json.dumps(index)],
            descending=True,
            limit=1,
            include_docs=True,
            reduce=False,
        ).one()

    @classmethod
    def get_all_checkpoints(cls, index):
        """Yield every checkpoint document saved for ``index``."""
        doc_ids = [
            result["id"]
            for result in cls.get_db().view(
                "couchexport/schema_checkpoints",
                startkey=[json.dumps(index)],
                endkey=[json.dumps(index), {}],
                reduce=False,
            )
        ]
        for doc in iter_docs(cls.get_db(), doc_ids):
            yield cls.wrap(doc)

    _tables = None  # lazy cache for the `tables` property

    @property
    def tables(self):
        # Lazily computed [(table_index, header_row), ...] derived from the
        # stored schema; import is local to avoid a circular dependency.
        if self._tables is None:
            from couchexport.export import get_headers
            headers = get_headers(self.schema, separator=".")
            self._tables = [(index, row[0]) for index, row in headers]
        return self._tables

    @property
    def table_dict(self):
        return dict(self.tables)

    def get_columns(self, index):
        # 'id' is always the implicit first column of every exported table.
        return ['id'] + self.table_dict[index].data

    def get_all_ids(self, database=None):
        """Return the set of all doc ids matching this export's index."""
        database = database or self.get_db()
        return set(
            result['id'] for result in database.view(
                "couchexport/schema_index",
                reduce=False,
                **get_schema_index_view_keys(self.index)).all()
        )

    def get_new_ids(self, database=None):
        """Return ids of docs indexed at or after this checkpoint's timestamp."""
        database = database or self.get_db()
        assert self.timestamp, 'exports without timestamps are no longer supported.'
        tag_as_list = force_tag_to_list(self.index)
        startkey = tag_as_list + [self.timestamp.isoformat()]
        endkey = tag_as_list + [{}]
        return set(
            result['id'] for result in database.view(
                "couchexport/schema_index",
                reduce=False,
                startkey=startkey,
                endkey=endkey,
            )
        )

    def get_new_docs(self, database=None):
        """Yield full docs that are new since this checkpoint.

        Bug fix: ``iter_docs`` takes ``(db, ids)`` — see the call in
        ``get_all_checkpoints`` — but the database argument was omitted here.
        """
        database = database or self.get_db()
        return iter_docs(database, self.get_new_ids(database))
class ExportSchema(Document, UnicodeMixIn):
    """
    An export schema that can store intermittent contents of the export so
    that the entire doc list doesn't have to be used to generate the export
    """
    index = JsonProperty()           # export index (tag); JSON-serialized for view keys
    seq = StringProperty()           # semi-deprecated couch update sequence checkpoint
    schema = DictProperty()          # accumulated schema of the exported docs
    timestamp = TimeStampProperty()  # preferred checkpoint marker (replaces seq)

    def __unicode__(self):
        return "%s: %s" % (json.dumps(self.index), self.seq)

    @property
    def is_bigcouch(self):
        # bigcouch sequence ids are opaque strings rather than integers
        try:
            int(self.seq)
            return False
        except ValueError:
            return True

    @classmethod
    def wrap(cls, data):
        # Normalize legacy integer seqs to strings before wrapping.
        if isinstance(data.get('seq'), (int, long)):
            data['seq'] = unicode(data['seq'])
        ret = super(ExportSchema, cls).wrap(data)
        if not ret.timestamp:
            # these won't work on bigcouch so we want to know if this happens
            notify_exception(
                None,
                'an export without a timestamp was accessed! %s (%s)' % (
                    ret.index, ret._id))
            # this isn't the cleanest nor is it perfect but in the event
            # this doc traversed databases somehow and now has a bad seq
            # id, make sure to just reset it to 0.
            # (This won't catch a seq that is bad but not greater than the
            # current one.)
            current_seq = cls.get_db().info()["update_seq"]
            try:
                if int(current_seq) < int(ret.seq):
                    ret.seq = "0"
                    ret.save()
            except ValueError:
                # seqs likely weren't ints (e.g. bigcouch)
                # this should never be possible (anything on bigcouch should
                # have a timestamp) so let's fail hard
                raise Exception(
                    'export %s is in a bad state (no timestamp or integer seq)' % ret._id)
        # TODO? handle seq -> datetime migration
        return ret

    @classmethod
    def last(cls, index):
        """Most recent checkpoint for ``index``: by timestamp, else by seq."""
        # search first by timestamp, then fall back to seq id
        shared_kwargs = {
            'descending': True,
            'limit': 1,
            'include_docs': True,
            'reduce': False,
        }
        ret = cls.view(
            "couchexport/schema_checkpoints",
            startkey=['by_timestamp', json.dumps(index), {}],
            endkey=['by_timestamp', json.dumps(index)],
            **shared_kwargs).one()
        if ret and not ret.timestamp:
            # we found a bunch of old checkpoints but they only
            # had seq ids, so use those instead
            ret = cls.view(
                "couchexport/schema_checkpoints",
                startkey=['by_seq', json.dumps(index), {}],
                endkey=['by_seq', json.dumps(index)],
                **shared_kwargs).one()
        return ret

    @classmethod
    def get_all_indices(cls):
        """Yield every distinct export index that has a checkpoint."""
        ret = cls.get_db().view(
            "couchexport/schema_checkpoints",
            startkey=['by_timestamp'],
            endkey=['by_timestamp', {}],
            reduce=True,
            group=True,
            group_level=2)
        for row in ret:
            index = row['key'][1]
            try:
                yield json.loads(index)
            except ValueError:
                # ignore this for now - should just be garbage data
                pass

    @classmethod
    def get_all_checkpoints(cls, index):
        """Return a view of every checkpoint document saved for ``index``."""
        return cls.view(
            "couchexport/schema_checkpoints",
            startkey=['by_timestamp', json.dumps(index)],
            endkey=['by_timestamp', json.dumps(index), {}],
            include_docs=True,
            reduce=False)

    _tables = None  # lazy cache for the `tables` property

    @property
    def tables(self):
        # Lazily computed [(table_index, header_row), ...] derived from the
        # stored schema; import is local to avoid a circular dependency.
        if self._tables is None:
            from couchexport.export import get_headers
            headers = get_headers(self.schema, separator=".")
            self._tables = [(index, row[0]) for index, row in headers]
        return self._tables

    @property
    def table_dict(self):
        return dict(self.tables)

    def get_columns(self, index):
        # 'id' is always the implicit first column of every exported table.
        return ['id'] + self.table_dict[index].data

    def get_all_ids(self, database=None):
        """Return the set of all doc ids matching this export's index."""
        database = database or self.get_db()
        return set(
            result['id'] for result in database.view(
                "couchexport/schema_index",
                reduce=False,
                **get_schema_index_view_keys(self.index)).all()
        )

    def get_new_ids(self, database=None):
        """Return ids new since this checkpoint (timestamp or legacy seq)."""
        # TODO: deprecate/remove old way of doing this
        database = database or self.get_db()
        if self.timestamp:
            return self._ids_by_timestamp(database)
        else:
            return self._ids_by_seq(database)

    def _ids_by_seq(self, database):
        # Legacy path: diff against the couch _changes feed since `seq`.
        if self.seq == "0" or self.seq is None:
            return self.get_all_ids()
        consumer = Consumer(database)
        view_results = consumer.fetch(since=self.seq)
        if view_results:
            include_ids = set(res["id"] for res in view_results["results"])
            return include_ids.intersection(self.get_all_ids())
        else:
            # sometimes this comes back empty. I think it might be a bug
            # in couchdbkit, but it's impossible to consistently reproduce.
            # For now, just assume this is fine.
            return set()

    def _ids_by_timestamp(self, database):
        # Preferred path: range query on [index..., timestamp] view keys.
        tag_as_list = force_tag_to_list(self.index)
        startkey = tag_as_list + [self.timestamp.isoformat()]
        endkey = tag_as_list + [{}]
        return set(
            result['id'] for result in database.view(
                "couchexport/schema_index",
                reduce=False,
                startkey=startkey,
                endkey=endkey,
            )
        )

    def get_new_docs(self, database=None):
        """Yield full docs that are new since this checkpoint.

        Bug fix: ``iter_docs`` takes ``(db, ids)`` — the database argument
        was omitted here, unlike the correct two-argument call pattern used
        elsewhere in this module.
        """
        database = database or self.get_db()
        return iter_docs(database, self.get_new_ids(database))