def by_collection(self, collection_key=None):
    '''Harvest images for every document in a collection.

    If collection_key is None, walk the entire database via the _all_docs
    view (not recommended for large databases).

    :param collection_key: collection id used as the view start/end key
    :returns: tuple (doc_ids, report_errors) where doc_ids is the list of
        processed document ids and report_errors maps an error key to a
        list of (doc_id, message) pairs
    '''
    if collection_key:
        v = couchdb_pager(
            self._couchdb,
            view_name=self._view,
            startkey='"{0}"'.format(collection_key),
            endkey='"{0}"'.format(collection_key),
            include_docs='true')
    else:
        # use _all_docs view
        v = couchdb_pager(self._couchdb, include_docs='true')
    doc_ids = []
    report_errors = defaultdict(list)
    for r in v:
        dt_start = dt_end = datetime.datetime.now()
        try:
            # return value intentionally discarded (was bound to an unused
            # local); failures are collected in report_errors below
            self.harvest_image_for_doc(r.doc)
        except ImageHarvestError as e:
            report_errors[e.dict_key].append((e.doc_id, str(e)))
        doc_ids.append(r.doc['_id'])
        dt_end = datetime.datetime.now()
        # throttle: pause for as long as the harvest took
        time.sleep((dt_end - dt_start).total_seconds())
    report_list = [
        ' : '.join((key, str(val))) for key, val in report_errors.items()
    ]
    report_msg = '\n'.join(report_list)
    subject = format_results_subject(collection_key,
                                     'Image harvest to CouchDB {env}')
    publish_to_harvesting(subject, ''.join(
        ('Processed {} documents\n'.format(len(doc_ids)), report_msg)))
    return doc_ids, report_errors
def by_collection(self, collection_key=None):
    """Harvest images for every document in a collection.

    If collection_key is None, walk the entire database via the _all_docs
    view (not recommended for large databases).

    :param collection_key: collection id used as the view start/end key
    :returns: list of processed document ids
    """
    if collection_key:
        v = couchdb_pager(
            self._couchdb,
            view_name=self._view,
            startkey='"{0}"'.format(collection_key),
            endkey='"{0}"'.format(collection_key),
            include_docs="true",
        )
    else:
        # use _all_docs view
        v = couchdb_pager(self._couchdb, include_docs="true")
    doc_ids = []
    for r in v:
        dt_start = dt_end = datetime.datetime.now()
        # return value intentionally discarded (was bound to an unused local)
        self.harvest_image_for_doc(r.doc)
        doc_ids.append(r.doc["_id"])
        dt_end = datetime.datetime.now()
        # throttle: pause for as long as the harvest took
        time.sleep((dt_end - dt_start).total_seconds())
    publish_to_harvesting(
        "Image harvested {}".format(collection_key),
        "Processed {} documents".format(len(doc_ids))
    )
    return doc_ids
def main(collection_key=None, url_couchdb=SERVER_COUCHDB):
    '''Fetch the isShownBy image for couchdb docs and stash it in s3.

    If collection_key is None, try to grab all of the images
    (not recommended).

    :param collection_key: view key limiting docs to one collection
    :param url_couchdb: couchdb server url
    '''
    s = couchdb.Server(url=url_couchdb)
    db = s[DB_COUCHDB]
    if collection_key:
        v = couchdb_pager(
            db, view_name=COUCH_VIEW, include_docs='true',
            key=collection_key)
    else:
        v = couchdb_pager(db, view_name=COUCH_VIEW, include_docs='true')
    for r in v:
        doc = r.doc
        msg = doc['_id']
        if 's3://' in doc.get('object', ''):
            # already downloaded
            msg = ' '.join((msg, 'already fetched image'))
            continue
        try:
            # BUG FIX: dict.get evaluates its default eagerly, so the old
            # doc.get('isShownBy', get_isShownBy(doc)) ran get_isShownBy for
            # every doc; only compute it when the field is actually missing.
            if 'isShownBy' not in doc:
                doc['isShownBy'] = get_isShownBy(doc)
        except Exception as e:
            print("ERROR: Can't get isShownBy for {} : {}".format(
                doc['_id'], e))
            continue  # next doc
        try:
            url_image = doc['isShownBy']['src']
            dt_start = dt_end = datetime.datetime.now()
            # stash the image in s3, keyed by md5 of its content
            report = md5s3stash(url_image, bucket_base=BUCKET_BASE)
            dt_end = datetime.datetime.now()
            doc['object'] = report.s3_url
            db.save(doc)
            msg = ' '.join((msg, doc['object']))
        except KeyError:
            # isShownBy present but lacks a 'src' entry
            # NOTE(review): msg is accumulated but never emitted in this
            # visible span -- confirm against the rest of the file
            msg = ' '.join((msg, "ERROR: No isShownBy field"))
def by_collection(self, collection_key=None):
    '''Harvest images for each document in a collection.

    With collection_key=None every document in the database is walked
    via _all_docs (not recommended).

    Returns (doc_ids, report_errors): the processed document ids and a
    mapping of error key -> list of (doc_id, message) pairs.
    '''
    if collection_key:
        key_json = '"{0}"'.format(collection_key)
        rows = couchdb_pager(
            self._couchdb,
            view_name=self._view,
            startkey=key_json,
            endkey=key_json,
            include_docs='true')
    else:
        # no key given: fall back to the _all_docs view
        rows = couchdb_pager(self._couchdb, include_docs='true')
    doc_ids = []
    report_errors = defaultdict(list)
    for row in rows:
        started = finished = datetime.datetime.now()
        try:
            reports = self.harvest_image_for_doc(row.doc)
        except ImageHarvestError as err:
            report_errors[err.dict_key].append((err.doc_id, str(err)))
        doc_ids.append(row.doc['_id'])
        finished = datetime.datetime.now()
        # throttle: sleep for as long as the harvest took
        time.sleep((finished - started).total_seconds())
    report_msg = '\n'.join(
        ' : '.join((key, str(val)))
        for key, val in report_errors.items())
    subject = format_results_subject(collection_key,
                                     'Image harvest to CouchDB {env}')
    body = ''.join(
        ('Processed {} documents\n'.format(len(doc_ids)), report_msg))
    publish_to_harvesting(subject, body)
    return doc_ids, report_errors
def main(collection_key=None, url_couchdb=SERVER_COUCHDB):
    '''Fetch the isShownBy image for couchdb docs and stash it in s3.

    If collection_key is None, try to grab all of the images
    (not recommended).
    '''
    s = couchdb.Server(url=url_couchdb)
    db = s[DB_COUCHDB]
    # v = db.view(COUCH_VIEW, include_docs='true', key=collection_key) if collection_key else db.view(COUCH_VIEW, include_docs='true')
    v = couchdb_pager(db, view_name=COUCH_VIEW, include_docs='true', key=collection_key) if collection_key else couchdb_pager(db, view_name=COUCH_VIEW, include_docs='true')
    for r in v:
        doc = r.doc
        msg = doc['_id']
        if 's3://' in doc.get('object', ''):
            # already downloaded
            msg = ' '.join((msg, 'already fetched image'))
            continue
        try:
            # NOTE(review): dict.get evaluates its default eagerly, so
            # get_isShownBy(doc) runs even when 'isShownBy' already exists
            doc['isShownBy'] = doc.get('isShownBy', get_isShownBy(doc))
        except Exception, e:  # Python 2 except syntax
            print("ERROR: Can't get isShownBy for {} : {}".format(doc['_id'], e))
            continue
            # next doc
        try:
            url_image = doc['isShownBy']['src']
            dt_start = dt_end = datetime.datetime.now()
            # stash the image in s3, keyed by md5 of its content
            report = md5s3stash(url_image, bucket_base=BUCKET_BASE)
            dt_end = datetime.datetime.now()
            doc['object'] = report.s3_url
            db.save(doc)
            msg = ' '.join((msg, doc['object']))
        except KeyError, e:
            # isShownBy present but lacks a 'src' entry
            # NOTE(review): msg is accumulated but never emitted in this
            # visible span -- confirm against the rest of the file
            msg = ' '.join((msg, "ERROR: No isShownBy field"))
def main(url_solr=URL_SOLR, url_couchdb=None, couchdb_db=None):
    '''Sync every document in a couchdb database into solr.

    Normalizes originalRecord.collection and sourceResource.collection to
    lists and sourceResource.subject to a list of {'name': ...} dicts,
    saves the normalized doc back to couchdb, then pushes it to solr.

    :param url_solr: solr server url
    :param url_couchdb: couchdb server url
    :param couchdb_db: couchdb database name
    '''
    solr_db = solr.Solr(url_solr)
    db = get_couchdb(url=url_couchdb, dbname=couchdb_db)
    v = couchdb_pager(db, include_docs='true')
    # update or create new solr doc for each couchdb doc
    for r in v:
        doc_couch = r.doc
        if '_design' not in doc_couch['_id']:
            try:
                # wrap a bare originalRecord.collection value in a list
                if not isinstance(doc_couch['originalRecord']['collection'],
                                  list):
                    doc_couch['originalRecord']['collection'] = [
                        doc_couch['originalRecord']['collection'],
                    ]
                # BUG FIX: previously printed sourceResource.collection
                # under the orgRec label; report the field just normalized
                print("orgRec.Collection: {}".format(
                    doc_couch['originalRecord']['collection']))
            except KeyError:
                pass
            try:
                # wrap a bare sourceResource.collection value in a list
                if not isinstance(doc_couch['sourceResource']['collection'],
                                  list):
                    doc_couch['sourceResource']['collection'] = [
                        doc_couch['sourceResource']['collection'],
                    ]
                # BUG FIX: previously printed sourceResource.subject under
                # the srcRes.Collection label; report the normalized field
                print("srcRes.Collection: {}".format(
                    doc_couch['sourceResource']['collection']))
            except KeyError:
                pass
            try:
                # normalize subjects to a list of {'name': ...} dicts
                subject = doc_couch['sourceResource'].get('subject', None)
                if not isinstance(subject, list):
                    subject = [subject]
                subjects_norm = []
                for sub in subject:
                    if not isinstance(sub, dict):
                        subjects_norm.append({'name': sub})
                    else:
                        subjects_norm.append(sub)
                doc_couch['sourceResource']['subject'] = subjects_norm
            except KeyError:
                pass
            # persist the normalized doc, then index it in solr
            db.save(doc_couch)
            try:
                doc_solr = push_doc_to_solr(map_couch_to_solr_doc(doc_couch),
                                            solr_db=solr_db)
                print("PUSHED {} to solr".format(doc_couch['_id']))
            except TypeError:
                pass
    solr_db.commit()
def main(url_solr=URL_SOLR, url_couchdb=None, couchdb_db=None):
    '''Sync couchdb documents into solr, normalizing collection and
    subject fields to lists first.
    '''
    solr_db = solr.Solr(url_solr)
    db = get_couchdb(url=url_couchdb, dbname=couchdb_db)
    v = couchdb_pager(db, include_docs='true')
    # update or create new solr doc for each couchdb doc
    for r in v:
        doc_couch = r.doc
        if '_design' not in doc_couch['_id']:
            try:
                # wrap a bare originalRecord.collection value in a list
                if not isinstance(doc_couch['originalRecord']['collection'], list):
                    doc_couch['originalRecord']['collection'] = [
                        doc_couch['originalRecord']['collection'],
                    ]
                # NOTE(review): label says orgRec.Collection but this prints
                # sourceResource.collection -- looks like a copy/paste slip
                print("orgRec.Collection: {}".format(doc_couch['sourceResource']['collection']))
            except KeyError:
                pass
            try:
                # wrap a bare sourceResource.collection value in a list
                if not isinstance(doc_couch['sourceResource']['collection'], list):
                    doc_couch['sourceResource']['collection'] = [
                        doc_couch['sourceResource']['collection'],
                    ]
                # NOTE(review): label says srcRes.Collection but this prints
                # sourceResource.subject -- looks like a copy/paste slip
                print("srcRes.Collection: {}".format(doc_couch['sourceResource']['subject']))
            except KeyError:
                pass
            try:
                # normalize subjects to a list of {'name': ...} dicts
                subject = doc_couch['sourceResource'].get('subject', None)
                if not isinstance(subject, list):
                    subject = [subject]
                subjects_norm = []
                for sub in subject:
                    if not isinstance(sub, dict):
                        subjects_norm.append({'name': sub})
                    else:
                        subjects_norm.append(sub)
                doc_couch['sourceResource']['subject'] = subjects_norm
            except KeyError:
                pass
            # persist the normalized doc, then index it in solr
            db.save(doc_couch)
            try:
                doc_solr = push_doc_to_solr(map_couch_to_solr_doc(doc_couch), solr_db=solr_db)
                print("PUSHED {} to solr".format(doc_couch['_id']))
            except TypeError:
                pass
    solr_db.commit()
def __init__(self, collection_key=None, couchdb_obj=None, url_couchdb=None,
             couchdb_name=None, couch_view=COUCHDB_VIEW, include_docs=True):
    '''Set up a pager over the couchdb view for one collection.

    Either pass an existing database in couchdb_obj, or give
    url_couchdb and couchdb_name so one can be opened here.

    :raises ValueError: when couchdb_obj is None and url/name are missing
    '''
    # '{}' is the fallback view key when no collection is given
    collection_key = collection_key or '{}'
    if couchdb_obj is not None:
        self._couchdb = couchdb_obj
    else:
        if not (url_couchdb and couchdb_name):
            raise ValueError('Need url and name to couch database')
        self._couchdb = get_couchdb(url=url_couchdb, dbname=couchdb_name)
    self._view = couch_view
    # the pager takes the include_docs flag as the strings 'true'/'false'
    docs_flag = 'true' if include_docs else 'false'
    self._view_iter = couchdb_pager(
        self._couchdb, self._view, key=collection_key,
        include_docs=docs_flag)
def __init__(self, collection_key=None, couchdb_obj=None, url_couchdb=None,
             couchdb_name=None, couch_view=COUCHDB_VIEW, include_docs=True
             ):
    '''Set up a pager over the couchdb view for one collection.

    :param collection_key: view key; falsy values fall back to the
        literal string '{}' (sentinel for the view -- TODO confirm)
    :param couchdb_obj: existing couchdb database object; if None, one
        is opened from url_couchdb/couchdb_name
    :param couch_view: name of the view to page over
    :param include_docs: whether the pager returns full documents
    :raises ValueError: when couchdb_obj is None and url/name are missing
    '''
    if not collection_key:
        collection_key = '{}'
    if couchdb_obj is None:
        if not url_couchdb or not couchdb_name:
            raise ValueError('Need url and name to couch database')
        self._couchdb = get_couchdb(url=url_couchdb, dbname=couchdb_name)
    else:
        self._couchdb = couchdb_obj
    self._view = couch_view
    # the pager takes the include_docs flag as the strings 'true'/'false'
    self._view_iter = couchdb_pager(
        self._couchdb, self._view, key=collection_key,
        include_docs='true' if include_docs else 'false')
'''One-time script to populate redis with harvested image object data.

Walks every doc in the ucldc couchdb database and caches, per doc id,
the pair [object url, object dimensions] in a redis-backed dict.
'''
from harvester.config import config
from harvester.couchdb_init import get_couchdb
from harvester.couchdb_pager import couchdb_pager
from redis import Redis
import redis_collections

_config = config()
_redis = Redis(
    host=_config['redis_host'],
    port=_config['redis_port'],
    password=_config['redis_password'],
    socket_connect_timeout=_config['redis_connect_timeout'])
# redis-backed dict: doc _id -> [object url, object dimensions]
object_cache = redis_collections.Dict(
    key='ucldc:harvester:harvested-images', redis=_redis)
_couchdb = get_couchdb(url=_config['couchdb_url'], dbname='ucldc')
v = couchdb_pager(_couchdb, include_docs='true')
for r in v:
    doc = r.doc
    if 'object' in doc:
        did = doc['_id']
        if 'object_dimensions' not in doc:
            # skip docs lacking dimensions; the cache needs both values
            # BUG FIX: py2 print statements replaced with print() calls
            # (valid under both Python 2 and 3)
            print("NO DIMS for {} -- not caching".format(did))
        else:
            object_cache[did] = [doc['object'], doc['object_dimensions']]
            print("OBJECT CACHE : {} === {}".format(did, object_cache[did]))