def waitForDBChange(self, since=0):
    getLogger(self).debug("Watching for changes")
    while True:
        last_seq = max(self.getSeqNumber(), since)
        self.stream = ChangesStream(
            self.db,
            feed="continuous",
            since=last_seq,
            heartbeat=True)
        try:
            for change in self.stream:
                if not self.changes_callback:
                    return
                if not change.get('last_seq', None):
                    if change['seq'] > self.getSeqNumber():
                        self.setSeqNumber(change['seq'])
                        if not change['id'].startswith('_design'):
                            getLogger(self).debug(
                                "Changes from another instance")
                            deleted = bool(change.get('deleted', False))
                            revision = change.get("changes")[-1].get('rev')
                            obj_id = change.get('id')
                            if not deleted:
                                # update cache
                                doc = self.db.get(obj_id)
                                self.addDoc(doc)
                            self.changes_callback(obj_id, revision, deleted)
        except Exception as e:
            getLogger(self).info("Some exception happened while waiting for changes")
            getLogger(self).info("  The exception was: %s" % e)
def main(argv):
    """
    Main method. This method performs the following tasks:
    1. Parse command line arguments
    2. Retrieve credentials and connect to Cloudant and WebHDFS
    3. Connect to the Cloudant `_changes` feed for checkpointed document consumption
    4. Process each change individually
    5. Upon an exception, store the latest checkpoint to a local file and exit
    """
    # add options into the parser
    parser = configureOptions()
    (options, args) = parser.parse_args()
    checkRequiredArguments(options, parser)
    print(options)

    # configurations
    last_seq = options.last_seq

    # get credentials
    perm_file = '%s/.clou' % os.environ['HOME']
    creds = get_creds(perm_file)

    # connect to source database
    s = Server('https://%s:%s@%s' % (
        creds['cloudant_user'], creds['cloudant_pwd'], options.uri))
    db = s[options.dbname]
    #print db.info()

    # connect to target hdfs cluster
    hdfs = PyWebHdfsClient(host=options.hdfs_host, port=options.hdfs_port,
                           user_name=creds['hdfs_user'])
    hdfs.make_dir(options.hdfs_path)

    # and here we consume the cloudant `_changes` feed
    counter = 0
    changestream = ChangesStream(db, include_docs=True, heartbeat=True,
                                 since=last_seq)
    for c in changestream:
        #print c
        try:
            if counter % 100 == 0:
                checkpoint(last_seq)
            seq = processChange(hdfs, c, options.hdfs_path)
            if seq:  # protect against the last line being blank
                last_seq = seq
                counter += 1
        except Exception:
            traceback.print_exc()
            checkpoint(last_seq)
            os._exit(1)
    checkpoint(last_seq)
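# main() above calls a checkpoint() helper that is not shown in this snippet;
# per the docstring it persists the latest `_changes` sequence to a local file.
# A minimal sketch under that assumption (the `.checkpoint` file name and the
# `path` parameter are assumptions, not taken from the original code):
def checkpoint(last_seq, path='.checkpoint'):
    # Persist the last processed sequence so a restart can resume from it.
    with open(path, 'w') as f:
        f.write(str(last_seq))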
def waitForDBChange(self, db_name, since=0, timeout=15000):
    """Be warned: this will return after the database has a change; if
    there was one before the call, it will return immediately with the
    changes already made."""
    changes = []
    last_seq = max(self.getLastChangeSeq(db_name), since)
    db = self._getDb(db_name)
    with ChangesStream(db, feed="longpoll", since=last_seq,
                       timeout=timeout) as stream:
        for change in stream:
            if change['seq'] > self.getLastChangeSeq(db_name):
                self.setLastChangeSeq(db_name, change['seq'])
                if not change['id'].startswith('_design'):
                    # fake doc type for deleted objects
                    doc = {
                        'type': 'unknown',
                        '_deleted': 'False',
                        '_rev': [0]
                    }
                    if not change.get('deleted'):
                        doc = self.getDocument(db_name, change['id'])
                    changes.append(change_factory.create(doc))
    if len(changes):
        getLogger(self).debug("Changes from another instance")
    return changes
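# Because waitForDBChange() uses a longpoll feed, it returns once per batch
# of changes (or on timeout), so callers poll it in a loop. A hedged usage
# sketch; `manager` and `process` are assumed names, not from the source:
def followChanges(manager, db_name, process):
    seq = 0
    while True:
        for change in manager.waitForDBChange(db_name, since=seq):
            process(change)
        seq = manager.getLastChangeSeq(db_name)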
def poll_once():
    # Note: relies on `self` and `extra_args` being available from an
    # enclosing scope; this reads like a nested helper inside a method.
    changes_stream = ChangesStream(
        db=self._couch_db,
        since=self._last_processed_seq,
        include_docs=True,
        **extra_args)
    for couch_change in changes_stream:
        change = change_from_couch_row(
            couch_change, document_store=self._document_store)
        populate_change_metadata(change, SOURCE_COUCH, self._couch_db.dbname)
        yield change
        self._last_processed_seq = couch_change.get('seq', None)
def run_burst(self):
    """
    Use this for testing pillows. Will run through the changes stream once.
    """
    changes_stream = ChangesStream(
        db=self.couch_db,
        since=self.since,
        filter=self.couch_filter,
        include_docs=self.include_docs,
        **self.extra_args)
    for change in changes_stream:
        if change:
            self.processor(change)
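# Per its docstring, run_burst() makes a single pass: without a continuous
# feed the changes stream terminates once it catches up, so the method
# drains the pending backlog and returns. A hypothetical test usage
# (MyPillow is an assumed class, not from the source):
#
#     pillow = MyPillow(couch_db=db, since=0)
#     pillow.run_burst()  # processes pending changes, then returns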
def iter_changes(self, since, forever):
    extra_args = {'feed': 'continuous'} if forever else {}
    extra_args.update(self._extra_couch_view_params)
    changes_stream = ChangesStream(
        db=self._couch_db,
        heartbeat=True,
        since=since,
        filter=self._couch_filter,
        include_docs=self._include_docs,
        **extra_args)
    for couch_change in changes_stream:
        yield change_from_couch_row(couch_change, document_store=self._document_store)
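# Usage sketch: `forever` toggles between a finite catch-up pass (default
# feed, the stream ends at the current sequence) and an endless continuous
# feed. `feed` and `handle` are assumed names, not from the source:
#
#     for change in feed.iter_changes(since=0, forever=False):  # catch up, then stop
#         handle(change)
#
#     for change in feed.iter_changes(since=0, forever=True):   # never returns normally
#         handle(change)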
def waitForDBChange(self, since=0):
    """Listen to the stream of changes provided by CouchDbKit. Process
    these changes accordingly. If there's an exception while listening
    to the changes, return immediately."""
    # XXX: the while True found here shouldn't be necessary because
    # ChangesStream already keeps listening 'forever'. In a few tests
    # I ran, this hypothesis was confirmed, but with our current setup
    # I'm afraid I may be missing something. In any case, it works
    # as it is, but this definitely needs revision.
    getLogger(self).debug("Watching for changes")
    while True:
        last_seq = max(self.getSeqNumber(), since)
        self.stream = ChangesStream(
            self.db,
            feed="continuous",
            since=last_seq,
            heartbeat=True)
        try:
            for change in self.stream:
                if not self.changes_callback:
                    return
                if not change.get('last_seq', None):
                    if change['seq'] > self.getSeqNumber():
                        self.setSeqNumber(change['seq'])
                        if not change['id'].startswith('_design'):
                            getLogger(self).debug(
                                "Changes from another instance")
                            deleted = bool(change.get('deleted', False))
                            revision = change.get("changes")[-1].get('rev')
                            obj_id = change.get('id')
                            if not deleted:
                                # update cache
                                doc = self.db.get(obj_id)
                                self.addDoc(doc)
                            self.changes_callback(obj_id, revision, deleted)
        except ResourceNotFound:
            getLogger(self).info("The database couldn't be found")
            self.no_workspace_callback()
            return False
        except Exception as e:
            getLogger(self).info("Some exception happened while waiting for changes")
            getLogger(self).info("  The exception was: %s" % e)
            return False  # kill thread, it's failed... in reconnection
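# The changes_callback invoked above receives three positional arguments,
# as its call site shows. A minimal sketch of a compatible callback
# (the name on_change is an assumption):
def on_change(obj_id, revision, deleted):
    if deleted:
        print("document %s was deleted" % obj_id)
    else:
        print("document %s changed, now at rev %s" % (obj_id, revision))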
def waitForDBChange(self, db_name, since=0, timeout=15000):
    """Be warned: this will return after the database has a change; if
    there was one before the call, it will return immediately with the
    changes already made."""
    changes = []
    last_seq = max(self.getLastChangeSeq(db_name), since)
    db = self.__getDb(db_name)
    with ChangesStream(db, feed="longpoll", since=last_seq,
                       timeout=timeout) as stream:
        for change in stream:
            if change['seq'] > self.getLastChangeSeq(db_name):
                changes.append(change)
        # take the highest seq seen (on Python 3, reduce lives in functools)
        last_seq = reduce(lambda x, y: max(y['seq'], x), changes,
                          self.getLastChangeSeq(db_name))
        self.setLastChangeSeq(db_name, last_seq)
    return changes
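# The reduce() above simply takes the maximum sequence number across the
# collected changes and the stored one. An equivalent, arguably clearer form:
#
#     last_seq = max([c['seq'] for c in changes] + [self.getLastChangeSeq(db_name)])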
def iter_changes(self, since, forever):
    from corehq.apps.change_feed.data_sources import SOURCE_COUCH
    extra_args = {'feed': 'continuous'} if forever else {}
    extra_args.update(self._extra_couch_view_params)
    self._last_processed_seq = since
    changes_stream = ChangesStream(
        db=self._couch_db,
        heartbeat=True,
        since=since,
        filter=self._couch_filter,
        include_docs=True,
        **extra_args)
    for couch_change in changes_stream:
        change = change_from_couch_row(
            couch_change, document_store=self._document_store)
        populate_change_metadata(change, SOURCE_COUCH, self._couch_db.dbname)
        yield change
        self._last_processed_seq = couch_change.get('seq', None)
def stream_changes(db, since, limit):
    for change in ChangesStream(db=db, since=since, limit=limit):
        yield CouchChange(
            id=change['id'],
            rev=change['changes'][0]['rev'],
            deleted=change.get('deleted', False),
            seq=change.get('seq'))
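# stream_changes() assumes a CouchChange container type that is not defined
# in this snippet. A minimal sketch, with field names taken from the call
# above (the namedtuple form is an assumption):
from collections import namedtuple

CouchChange = namedtuple('CouchChange', ['id', 'rev', 'deleted', 'seq'])

# Usage sketch, assuming `db` is a couchdbkit database handle:
#
#     for c in stream_changes(db, since=0, limit=100):
#         print(c.id, c.rev, c.deleted, c.seq)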