def update_from_remote(doc_id, url_remote_couchdb=None, couchdb_remote=None,
                       couchdb_env=None):
    '''Update the environment's couchdb from a remote couchdb document'''
    msg = None
    if not couchdb_remote:
        couchdb_remote = get_couchdb(url_remote_couchdb)
    if not couchdb_env:
        couchdb_env = get_couchdb()
    doc = couchdb_remote.get(doc_id)
    # need to remove the revision data, as it will be different
    del doc['_rev']
    # if doc exists, need to update metadata for the existing document
    # and then save that, due to revision number in couch
    doc_in_target = couchdb_env.get(doc_id)
    if doc_in_target:
        doc_in_target.update(doc)
        couchdb_env[doc_id] = doc_in_target
        msg = "updated {}".format(doc_id)
    else:
        doc_no_rev = doc.copy()
        couchdb_env[doc_id] = doc_no_rev
        msg = "created {}".format(doc_id)
    print >> sys.stderr, msg
    return msg
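# A minimal usage sketch for update_from_remote; the module path and the
# remote URL below are assumptions for illustration, not confirmed by this
# code:
from harvester.couchdb_sync_db_by_collection import update_from_remote

msg = update_from_remote(
    'a-doc-id',  # hypothetical document id
    url_remote_couchdb='https://remote-couch.example.org')
# msg reads "created a-doc-id" or "updated a-doc-id"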
def update_collection_from_remote(url_remote_couchdb,
                                  url_api_collection,
                                  delete_first=True):
    '''Update a collection from a remote couchdb.'''
    if delete_first:
        delete_collection(url_api_collection.rsplit('/', 2)[1])
    collection = Collection(url_api_collection)
    # guard against updating production for not ready_for_publication
    # collections
    if 'prod' in environ.get('DATA_BRANCH', ''):
        if not collection.ready_for_publication:
            raise Exception(
                'In PRODUCTION ENV and collection {} not ready for '
                'publication'.format(collection.id))
    doc_ids = get_collection_doc_ids(collection.id, url_remote_couchdb)
    couchdb_remote = get_couchdb(url_remote_couchdb)
    couchdb_env = get_couchdb()
    created = 0
    updated = 0
    for doc_id in doc_ids:
        msg = update_from_remote(
            doc_id, couchdb_remote=couchdb_remote, couchdb_env=couchdb_env)
        if 'created' in msg:
            created += 1
        else:
            updated += 1
    return len(doc_ids), updated, created
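# Hedged sketch: sync a whole collection from a remote CouchDB and report
# counts. Both URLs are placeholders; the registry URL shape matches the
# rsplit('/', 2) parsing above, so '123' is the collection id extracted:
total, updated, created = update_collection_from_remote(
    'https://remote-couch.example.org',
    'https://registry.example.org/api/v1/collection/123/')
print >> sys.stderr, '{} docs: {} updated, {} created'.format(
    total, updated, created)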
def main(user_email, cid, url_couchdb_src, field_list, url_couchdb_dest=None):
    worker = CouchDBWorker()
    timeout = 100000
    cdb_src = get_couchdb(url=url_couchdb_src, username=False, password=False)
    if url_couchdb_dest:
        cdb_dest = get_couchdb(url=url_couchdb_dest)
    else:
        cdb_dest = get_couchdb()
    worker.run_by_collection(cid, copy_fields_for_doc, cdb_src, field_list,
                             cdb_dest)
def main(cid):
    worker = CouchDBWorker()
    enq = CouchDBJobEnqueue()
    timeout = 100000
    cdb = get_couchdb()
    worker.run_by_collection(cid, delete_field_and_queue_image_harvest,
                             'object', cdb, enq)
def main(user_email, cid, field_list):
    worker = CouchDBWorker()
    timeout = 100000
    cdb = get_couchdb()
    worker.run_by_collection(cid, delete_field_list, field_list, cdb)
def delete_collection(cid):
    print >> sys.stderr, "DELETING COLLECTION: {}".format(cid)
    _couchdb = get_couchdb()
    rows = CouchDBCollectionFilter(collection_key=cid, couchdb_obj=_couchdb)
    ids = [row['id'] for row in rows]
    num_deleted, deleted_docs = delete_id_list(ids, _couchdb=_couchdb)
    publish_to_harvesting(
        'Deleted CouchDB Collection {}'.format(cid),
        'Deleted {} documents from CouchDB collection {}'.format(
            num_deleted, cid))
    return num_deleted, deleted_docs
def get_collection_doc_ids(collection_id, url_couchdb_source=None):
    '''Use the by_provider_name view to get doc ids for a given collection'''
    _couchdb = get_couchdb(url=url_couchdb_source)
    v = CouchDBCollectionFilter(
        couchdb_obj=_couchdb,
        collection_key=str(collection_id),
        include_docs=False)
    doc_ids = []
    for r in v:
        doc_ids.append(r.id)
    return doc_ids
def delete_collection(cid):
    print >> sys.stderr, "DELETING COLLECTION: {}".format(cid)
    _couchdb = get_couchdb()
    rows = CouchDBCollectionFilter(collection_key=cid, couchdb_obj=_couchdb)
    ids = [row['id'] for row in rows]
    num_deleted, deleted_docs = delete_id_list(ids, _couchdb=_couchdb)
    subject = format_results_subject(cid,
                                     'Deleted documents from CouchDB {env} ')
    publish_to_harvesting(
        subject,
        'Deleted {} documents from CouchDB collection CID: {}'.format(
            num_deleted, cid))
    return num_deleted, deleted_docs
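# Quick sketch of deleting a collection by registry id ('123' is made up);
# the return values allow auditing what was removed:
num_deleted, deleted_docs = delete_collection('123')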
def run_on_couchdb_doc(docid, func):
    '''Run on a doc, by doc id.

    func is a dotted-path string (e.g. "module.submodule.fn"); it is
    imported and called with the doc. If it returns a modified doc, the
    doc is saved back to couch and True is returned.
    '''
    _couchdb = get_couchdb()
    doc = _couchdb[docid]
    mod_name, func_name = func.rsplit('.', 1)
    fmod = importlib.import_module(mod_name)
    ffunc = getattr(fmod, func_name)
    doc_new = ffunc(doc)
    if doc_new and doc_new != doc:
        _couchdb.save(doc_new)
        return True
    return False
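# Sketch of run_on_couchdb_doc with a dotted-path function. The module
# "mymodule" and its add_flag helper are hypothetical; any importable
# function that takes a doc and returns the modified doc (or None) works:
#
#   # mymodule.py
#   def add_flag(doc):
#       doc['flagged'] = True
#       return doc
changed = run_on_couchdb_doc('27414--http://example.org/item/1',
                             'mymodule.add_flag')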
def harvest_image_for_doc(doc_id,
                          url_couchdb=None,
                          object_auth=None,
                          get_if_object=False):
    """Wrapper to call from rqworker.
    Creates ImageHarvester object & then calls harvest_image_for_doc
    """
    harvester = ImageHarvester(url_couchdb=url_couchdb,
                               object_auth=object_auth,
                               get_if_object=get_if_object)
    # get doc from couchdb
    couchdb = get_couchdb(url=url_couchdb)
    doc = couchdb[doc_id]
    if not get_if_object and "object" in doc:
        print >> sys.stderr, "Skipping {}, has object field".format(doc["_id"])
    else:
        harvester.harvest_image_for_doc(doc)
def update_couch_docs_by_collection(cid, fieldName, newValue):
    print >> sys.stderr, "UPDATING DOCS FOR COLLECTION: {}".format(cid)
    _couchdb = get_couchdb()
    rows = CouchDBCollectionFilter(collection_key=cid, couchdb_obj=_couchdb)
    ids = [row['id'] for row in rows]
    num_updated, updated_docs = update_by_id_list(
        ids, fieldName, newValue, _couchdb=_couchdb)
    subject = format_results_subject(cid,
                                     'Updated documents from CouchDB {env} ')
    publish_to_harvesting(
        subject,
        'Updated {} documents from CouchDB collection CID: {}'.format(
            num_updated, cid))
    return num_updated, updated_docs
def main(url_solr=URL_SOLR, url_couchdb=None, couchdb_db=None):
    solr_db = solr.Solr(url_solr)
    db = get_couchdb(url=url_couchdb, dbname=couchdb_db)
    v = couchdb_pager(db, include_docs='true')
    # update or create new solr doc for each couchdb doc
    for r in v:
        doc_couch = r.doc
        if '_design' not in doc_couch['_id']:
            try:
                if not isinstance(doc_couch['originalRecord']['collection'],
                                  list):
                    doc_couch['originalRecord']['collection'] = [
                        doc_couch['originalRecord']['collection'],
                    ]
                print("orgRec.Collection: {}".format(
                    doc_couch['originalRecord']['collection']))
            except KeyError:
                pass
            try:
                if not isinstance(doc_couch['sourceResource']['collection'],
                                  list):
                    doc_couch['sourceResource']['collection'] = [
                        doc_couch['sourceResource']['collection'],
                    ]
                print("srcRes.Collection: {}".format(
                    doc_couch['sourceResource']['collection']))
            except KeyError:
                pass
            try:
                subject = doc_couch['sourceResource'].get('subject', None)
                if not isinstance(subject, list):
                    subject = [subject]
                subjects_norm = []
                for sub in subject:
                    if not isinstance(sub, dict):
                        subjects_norm.append({'name': sub})
                    else:
                        subjects_norm.append(sub)
                doc_couch['sourceResource']['subject'] = subjects_norm
            except KeyError:
                pass
            db.save(doc_couch)
            try:
                doc_solr = push_doc_to_solr(
                    map_couch_to_solr_doc(doc_couch), solr_db=solr_db)
                print("PUSHED {} to solr".format(doc_couch['_id']))
            except TypeError:
                pass
    solr_db.commit()
def update_couch_docs_by_collection(cid, fieldName, newValue, substring):
    print >> sys.stderr, "UPDATING DOCS FOR COLLECTION: {}".format(cid)
    _couchdb = get_couchdb()
    rows = CouchDBCollectionFilter(collection_key=cid, couchdb_obj=_couchdb)
    ids = [row['id'] for row in rows]
    num_updated, updated_docs = update_by_id_list(
        ids, fieldName, newValue, substring, _couchdb=_couchdb)
    subject = format_results_subject(cid,
                                     'Updated documents from CouchDB {env} ')
    publish_to_harvesting(
        subject,
        'Updated {} documents from CouchDB collection CID: {}'.format(
            num_updated, cid))
    return num_updated, updated_docs
def __init__(self, rq_queue=None):
    self._config = config()
    self._couchdb = get_couchdb()
    self._redis = Redis(
        host=self._config['redis_host'],
        port=self._config['redis_port'],
        password=self._config['redis_password'],
        socket_connect_timeout=self._config['redis_connect_timeout'])
    self.rqname = self._config['rq_queue']
    if rq_queue:
        self.rqname = rq_queue
    if not self.rqname:
        raise ValueError(''.join(
            ('Must set RQ_QUEUE env var', ' or pass in rq_queue to ',
             'CouchDBJobEnqueue')))
    self._rQ = Queue(self.rqname, connection=self._redis)
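# Minimal sketch: construct the enqueuer with an explicit RQ queue name
# ('normal-stage' is a placeholder); omitting it falls back to the
# rq_queue config value, and a missing value raises ValueError:
enq = CouchDBJobEnqueue(rq_queue='normal-stage')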
def __init__(self,
             cdb=None,
             url_couchdb=None,
             couchdb_name=None,
             couch_view=COUCHDB_VIEW,
             bucket_bases=BUCKET_BASES,
             object_auth=None,
             get_if_object=False,
             url_cache=None,
             hash_cache=None,
             harvested_object_cache=None):
    self._config = config()
    if cdb:
        self._couchdb = cdb
    else:
        if not url_couchdb:
            url_couchdb = self._config["couchdb_url"]
        self._couchdb = get_couchdb(url=url_couchdb, dbname=couchdb_name)
    self._bucket_bases = bucket_bases
    self._view = couch_view
    # auth is a tuple of username, password
    self._auth = object_auth
    self.get_if_object = get_if_object  # if object field exists, get
    self._redis = Redis(
        host=self._config["redis_host"],
        port=self._config["redis_port"],
        password=self._config["redis_password"],
        socket_connect_timeout=self._config["redis_connect_timeout"])
    self._url_cache = url_cache if url_cache is not None else \
        redis_collections.Dict(
            key="ucldc-image-url-cache", redis=self._redis)
    self._hash_cache = hash_cache if hash_cache is not None else \
        redis_collections.Dict(
            key="ucldc-image-hash-cache", redis=self._redis)
    self._object_cache = harvested_object_cache if harvested_object_cache \
        else redis_collections.Dict(
            key="ucldc:harvester:harvested-images", redis=self._redis)
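# Sketch: build an ImageHarvester against an explicit CouchDB, with basic
# auth for the object store (the URL, db name and credentials are
# placeholders, not real endpoints):
harvester = ImageHarvester(
    url_couchdb='https://couch.example.org',
    couchdb_name='ucldc',
    object_auth=('a-user', 'a-password'),
    get_if_object=False)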
def main(collection_key):
    v = CouchDBCollectionFilter(
        couchdb_obj=get_couchdb(), collection_key=collection_key)
    solr_db = Solr(URL_SOLR)
    results = []
    for r in v:
        dt_start = dt_end = datetime.datetime.now()
        try:
            doc = fill_in_title(r.doc)
            has_required_fields(r.doc)
        except KeyError as e:
            print(e.message)
            continue
        solr_doc = map_couch_to_solr_doc(r.doc)
        results.append(solr_doc)
        solr_doc = push_doc_to_solr(solr_doc, solr_db=solr_db)
        dt_end = datetime.datetime.now()
def sync_couch_collection_to_solr(collection_key):
    # This works from inside an environment with default URLs for couch & solr
    URL_SOLR = os.environ.get('URL_SOLR', None)
    collection_key = str(collection_key)  # Couch needs string keys
    v = CouchDBCollectionFilter(
        couchdb_obj=get_couchdb(), collection_key=collection_key)
    solr_db = Solr(URL_SOLR)
    results = []
    for r in v:
        try:
            fill_in_title(r.doc)
            has_required_fields(r.doc)
        except KeyError as e:
            print(e.message)
            continue
        solr_doc = map_couch_to_solr_doc(r.doc)
        results.append(solr_doc)
        solr_doc = push_doc_to_solr(solr_doc, solr_db=solr_db)
def harvest_image_for_doc(doc_id,
                          url_couchdb=None,
                          object_auth=None,
                          get_if_object=False,
                          force=False):
    '''Wrapper to call from rqworker.
    Creates ImageHarvester object & then calls harvest_image_for_doc
    '''
    harvester = ImageHarvester(
        url_couchdb=url_couchdb,
        object_auth=object_auth,
        get_if_object=get_if_object)
    # get doc from couchdb
    couchdb = get_couchdb(url=url_couchdb)
    doc = couchdb[doc_id]
    if not get_if_object and 'object' in doc and not force:
        print >> sys.stderr, 'Skipping {}, has object field'.format(doc['_id'])
    else:
        harvester.harvest_image_for_doc(doc, force=force)
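# Sketch: force a re-harvest of one document's image (the doc id and URL
# are made up); this is the function rqworker jobs point at, but it can
# also be called inline:
harvest_image_for_doc('27414--http://example.org/item/1',
                      url_couchdb='https://couch.example.org',
                      force=True)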
def harvest_image_for_doc(doc_id,
                          url_couchdb=None,
                          object_auth=None,
                          get_if_object=False,
                          ignore_content_type=False,
                          force=False):
    '''Wrapper to call from rqworker.
    Creates ImageHarvester object & then calls harvest_image_for_doc
    '''
    harvester = ImageHarvester(
        url_couchdb=url_couchdb,
        object_auth=object_auth,
        get_if_object=get_if_object,
        ignore_content_type=ignore_content_type)
    # get doc from couchdb
    couchdb = get_couchdb(url=url_couchdb)
    doc = couchdb[doc_id]
    if not get_if_object and 'object' in doc and not force:
        print >> sys.stderr, 'Skipping {}, has object field'.format(doc['_id'])
    else:
        harvester.harvest_image_for_doc(doc, force=force)
def __init__(self,
             cdb=None,
             url_couchdb=None,
             couchdb_name=None,
             couch_view=COUCHDB_VIEW,
             bucket_bases=BUCKET_BASES,
             object_auth=None,
             get_if_object=False,
             ignore_content_type=False,
             url_cache=None,
             hash_cache=None,
             harvested_object_cache=None):
    self._config = config()
    if cdb:
        self._couchdb = cdb
    else:
        if not url_couchdb:
            url_couchdb = self._config['couchdb_url']
        self._couchdb = get_couchdb(url=url_couchdb, dbname=couchdb_name)
    self._bucket_bases = bucket_bases
    self._view = couch_view
    # auth is a tuple of username, password
    self._auth = object_auth
    self.get_if_object = get_if_object  # if object field exists, get
    # Don't check content-type in headers
    self.ignore_content_type = ignore_content_type
    self._redis = Redis(
        host=self._config['redis_host'],
        port=self._config['redis_port'],
        password=self._config['redis_password'],
        socket_connect_timeout=self._config['redis_connect_timeout'])
    self._url_cache = url_cache if url_cache is not None else \
        redis_collections.Dict(
            key='ucldc-image-url-cache', redis=self._redis)
    self._hash_cache = hash_cache if hash_cache is not None else \
        redis_collections.Dict(
            key='ucldc-image-hash-cache', redis=self._redis)
    self._object_cache = harvested_object_cache if harvested_object_cache \
        else redis_collections.Dict(
            key='ucldc:harvester:harvested-images', redis=self._redis)
def sync_couch_collection_to_solr(collection_key):
    # This works from inside an environment with default URLs for couch & solr
    URL_SOLR = os.environ.get('URL_SOLR', None)
    collection_key = str(collection_key)  # Couch needs string keys
    v = CouchDBCollectionFilter(
        couchdb_obj=get_couchdb(), collection_key=collection_key)
    solr_db = Solr(URL_SOLR)
    updated_docs = []
    num_added = 0
    report = defaultdict(int)
    for r in v:
        try:
            fill_in_title(r.doc)
            has_required_fields(r.doc)
        except KeyError as e:
            report[e.dict_key] += 1
            print(e.message, file=sys.stderr)
            continue
        except ValueError as e:
            report[e.dict_key] += 1
            print(e.message, file=sys.stderr)
            continue
        solr_doc = map_couch_to_solr_doc(r.doc)
        # TODO: here is where to check if existing and compare collection vals
        try:
            check_nuxeo_media(solr_doc)
        except ValueError as e:
            print(e.message, file=sys.stderr)
            report[e.dict_key] += 1
            continue
        updated_docs.append(solr_doc)
        num_added += push_doc_to_solr(solr_doc, solr_db=solr_db)
    solr_db.commit()
    publish_to_harvesting(
        'Synced collection {} to solr'.format(collection_key),
        harvesting_report(collection_key, updated_docs, num_added, report))
    return updated_docs, report
def __init__(self,
             collection_key=None,
             couchdb_obj=None,
             url_couchdb=None,
             couchdb_name=None,
             couch_view=COUCHDB_VIEW,
             include_docs=True):
    if not collection_key:
        collection_key = '{}'
    if couchdb_obj is None:
        if not url_couchdb or not couchdb_name:
            raise ValueError('Need url and name to couch database')
        self._couchdb = get_couchdb(url=url_couchdb, dbname=couchdb_name)
    else:
        self._couchdb = couchdb_obj
    self._view = couch_view
    self._view_iter = couchdb_pager(
        self._couchdb,
        self._view,
        key=collection_key,
        include_docs='true' if include_docs else 'false')
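# Sketch: iterate one collection's docs through the filter (the collection
# id '26094' is illustrative); each row exposes .id and, when include_docs
# is set, .doc:
v = CouchDBCollectionFilter(
    collection_key='26094', couchdb_obj=get_couchdb())
for row in v:
    print >> sys.stderr, row.doc['_id']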
def run_on_couchdb_by_collection(func, collection_key=None):
    '''Run func on every doc in a collection.

    If collection_key is None, try to grab all docs and modify them
    (can take a long time - not recommended).
    func is a function that takes a couchdb doc in and returns it modified.
    It should return the new document, or None if no changes were made.
    '''
    _couchdb = get_couchdb()
    v = _couchdb.view(COUCHDB_VIEW, include_docs='true', key=collection_key) \
        if collection_key else _couchdb.view(COUCHDB_VIEW,
                                             include_docs='true')
    doc_ids = []
    n = 0
    for r in v:
        n += 1
        doc_new = func(r.doc)
        if doc_new and doc_new != r.doc:
            _couchdb.save(doc_new)
            doc_ids.append(r.doc['_id'])
        if n % 100 == 0:
            print '{} docs ran. Last doc:{}\n'.format(n, r.doc['_id'])
    return doc_ids
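# Sketch of run_on_couchdb_by_collection with an inline transform. The
# lowercase_title helper is hypothetical and assumes title is a plain
# string; it returns the doc only when it changed something, so unchanged
# docs are not re-saved:
def lowercase_title(doc):
    title = doc.get('sourceResource', {}).get('title')
    if title and title != title.lower():
        doc['sourceResource']['title'] = title.lower()
        return doc
    return None

changed_ids = run_on_couchdb_by_collection(lowercase_title,
                                           collection_key='26094')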
def sync_couch_collection_to_solr(collection_key):
    # This works from inside an environment with default URLs for couch & solr
    delete_solr_collection(collection_key)
    URL_SOLR = os.environ.get('URL_SOLR', None)
    collection_key = str(collection_key)  # Couch needs string keys
    v = CouchDBCollectionFilter(
        couchdb_obj=get_couchdb(), collection_key=collection_key)
    solr_db = Solr(URL_SOLR)
    updated_docs = []
    num_added = 0
    report = defaultdict(int)
    for r in v:
        try:
            fill_in_title(r.doc)
            has_required_fields(r.doc)
        except KeyError as e:
            report[e.dict_key] += 1
            print(e.message, file=sys.stderr)
            continue
        except ValueError as e:
            report[e.dict_key] += 1
            print(e.message, file=sys.stderr)
            continue
        solr_doc = map_couch_to_solr_doc(r.doc)
        # TODO: here is where to check if existing and compare collection vals
        try:
            check_nuxeo_media(solr_doc)
        except ValueError as e:
            print(e.message, file=sys.stderr)
            report[e.dict_key] += 1
            continue
        updated_docs.append(solr_doc)
        num_added += push_doc_to_solr(solr_doc, solr_db=solr_db)
    solr_db.commit()
    publish_to_harvesting(
        'Synced collection {} to solr'.format(collection_key),
        harvesting_report(collection_key, updated_docs, num_added, report))
    return updated_docs, report
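# Sketch: full resync of one collection into solr (the id is illustrative),
# then inspect the skip report, whose keys are the field names that failed
# validation:
updated_docs, report = sync_couch_collection_to_solr(26094)
for field, count in report.items():
    print('{}: {} docs skipped'.format(field, count))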
parser = argparse.ArgumentParser(
    description='Make csv report of indexed collections')
parser.add_argument('auth_token', help='Authentication token')
parser.add_argument('--solr_url', help='Solr index url')
parser.add_argument('--couchdb_url', help='CouchDB url')
args = parser.parse_args()
solr_url = args.solr_url if args.solr_url else SOLR_URL
print "SOLR_URL:{}".format(solr_url)
SOLR = solr.SearchHandler(
    solr.Solr(
        solr_url,
        post_headers={
            'X-Authentication-Token': args.auth_token,
        },
    ),
    "/query")
if args.couchdb_url:
    cdb = get_couchdb(url=args.couchdb_url, dbname='ucldc')
else:
    cdb = get_couchdb(dbname='ucldc')
collections = get_indexed_collection_list(SOLR)
date_to_minute = datetime.datetime.now().strftime('%Y%m%d-%H%M')
fname = 'indexed_collections-{}.csv'.format(date_to_minute)
with open(fname, 'wb') as csvfile:
    csvwriter = UnicodeWriter(csvfile)
    csvwriter.writerow(
        ('Collection Name', 'Collection URL', 'Number in index',
         'Number in couchdb', 'Number in OAC', 'Couch missing in solr',
         'OAC missing in couch', 'Repository Name', 'Repository URL',
         'Campus'))
    for c_url, num in collections:
        try:
            c = Collection(c_url)
def main(url_couchdb=None, dbname=None, url_solr=None, all_docs=False,
         since=None):
    '''Use the _changes feed with a "since" parameter to only catch new
    changes to docs. The _changes feed will only have the *last* event on
    a document and does not retain intermediate changes.
    Setting the "since" to 0 will result in getting a _changes record for
    each document, essentially dumping the db to solr
    '''
    print('Solr update PID: {}'.format(os.getpid()))
    dt_start = datetime.datetime.now()
    print('Start time:{}'.format(dt_start))
    sys.stdout.flush()
    db = get_couchdb(url=url_couchdb, dbname=dbname)
    s3_seq_cache = CouchdbLastSeq_S3()
    if not since:
        since = s3_seq_cache.last_seq
    if all_docs:
        since = '0'
    print('Attempt to connect to {0} - db:{1}'.format(url_couchdb, dbname))
    print('Getting changes since:{}'.format(since))
    sys.stdout.flush()
    changes = db.changes(since=since)
    previous_since = since
    last_since = int(
        changes['last_seq'])  # get new last_since for changes feed
    results = changes['results']
    n_up = n_design = n_delete = 0
    solr_db = Solr(url_solr)
    start_time = datetime.datetime.now()
    for row in results:
        cur_id = row['id']
        if '_design' in cur_id:
            n_design += 1
            print("Skip {0}".format(cur_id))
            continue
        if row.get('deleted', False):
            # need to get the solr doc for this couch doc
            resp = solr_db.select(q=''.join(('harvest_id_s:"', cur_id, '"')))
            if resp.numFound == 1:
                sdoc = resp.results[0]
                print('====DELETING: {0} -- {1}'.format(cur_id, sdoc['id']))
                solr_db.delete(id=sdoc['id'])
                n_delete += 1
            else:
                print("-----DELETION of {} - FOUND {} docs".format(
                    cur_id, resp.numFound))
        else:
            doc = db.get(cur_id)
            try:
                doc = fill_in_title(doc)
                has_required_fields(doc)
            except KeyError as e:
                print(e.message)
                continue
            except ValueError as e:
                print(e.message)
                continue
            try:
                try:
                    solr_doc = map_couch_to_solr_doc(doc)
                except OldCollectionException:
                    print('---- ERROR: OLD COLLECTION FOR:{}'.format(cur_id))
                    continue
                try:
                    check_nuxeo_media(solr_doc)
                except ValueError as e:
                    print(e.message)
                    continue
                solr_doc = push_doc_to_solr(solr_doc, solr_db=solr_db)
            except TypeError as e:
                print('TypeError for {0} : {1}'.format(cur_id, e))
                continue
            n_up += 1
            if n_up % 1000 == 0:
                elapsed_time = datetime.datetime.now() - start_time
                print("Updated {} so far in {}".format(n_up, elapsed_time))
    solr_db.commit()
    if not all_docs:
        s3_seq_cache.last_seq = last_since
    print("UPDATED {0} DOCUMENTS. DELETED:{1}".format(n_up, n_delete))
    print("PREVIOUS SINCE:{0}".format(previous_since))
    print("LAST SINCE:{0}".format(last_since))
    run_time = datetime.datetime.now() - dt_start
    print("RUN TIME:{}".format(run_time))
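# Sketch: dump the whole db to solr by ignoring the cached sequence number
# (all_docs=True forces since='0'); the URLs are placeholders:
main(url_couchdb='https://couch.example.org', dbname='ucldc',
     url_solr='https://solr.example.org/solr/dc-collection', all_docs=True)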
def __init__(self):
    self._couchdb = get_couchdb()
def main(doc_id, enrichment, port=8889):
    '''Run akara_enrich_doc for one document and save result'''
    _couchdb = get_couchdb()
    indoc = _couchdb.get(doc_id)
    doc = akara_enrich_doc(indoc, enrichment, port)
    _couchdb[doc_id] = doc
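# Sketch: re-run one enrichment on a single doc and save the result. Both
# the doc id and the akara enrichment string are illustrative placeholders:
main('26094--0001', '/select-id?prop=identifier', port=8889)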
'''one time script to populate redis with harvested image object data'''
from harvester.config import config
from harvester.couchdb_init import get_couchdb
from harvester.couchdb_pager import couchdb_pager
from redis import Redis
import redis_collections

_config = config()
_redis = Redis(
    host=_config['redis_host'],
    port=_config['redis_port'],
    password=_config['redis_password'],
    socket_connect_timeout=_config['redis_connect_timeout'])
object_cache = redis_collections.Dict(
    key='ucldc:harvester:harvested-images', redis=_redis)
_couchdb = get_couchdb(url=_config['couchdb_url'], dbname='ucldc')
v = couchdb_pager(_couchdb, include_docs='true')
for r in v:
    doc = r.doc
    if 'object' in doc:
        did = doc['_id']
        if 'object_dimensions' not in doc:
            print "NO DIMS for {} -- not caching".format(did)
        else:
            object_cache[did] = [doc['object'], doc['object_dimensions']]
            print "OBJECT CACHE : {} === {}".format(did, object_cache[did])
import sys
import argparse

from harvester.couchdb_init import get_couchdb
from harvester.couchdb_sync_db_by_collection import delete_id_list

if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='Delete all documents in a given id list')
    parser.add_argument('id_list', help='File with ids in it, one per line')
    args = parser.parse_args(sys.argv[1:])
    ids = []
    with open(args.id_list) as id_file:
        ids = [l.strip() for l in id_file.readlines()]
    _couchdb = get_couchdb()
    num_deleted, delete_ids = delete_id_list(ids, _couchdb=_couchdb)
    print 'Deleted {} documents'.format(num_deleted)

# Copyright © 2016, Regents of the University of California
# All rights reserved.
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
# - Redistributions of source code must retain the above copyright notice,
#   this list of conditions and the following disclaimer.
# - Redistributions in binary form must reproduce the above copyright notice,
#   this list of conditions and the following disclaimer in the documentation
#   and/or other materials provided with the distribution.
# - Neither the name of the University of California nor the names of its
#   contributors may be used to endorse or promote products derived from this
#   software without specific prior written permission.