def testConfig(self):
    '''Exercise config() before and after the redis env vars are set.

    First config(redis_required=True) must raise a KeyError whose message
    asks for REDIS_PASSWORD. Then REDIS_HOST and REDIS_PASSWORD are put
    into os.environ and the values returned by config() are checked.
    '''
    expected_error = ('Please set environment variable REDIS_PASSWORD to redis '
                      'password!')
    with self.assertRaises(KeyError) as raised:
        config(redis_required=True)
    # The message can only be inspected once the context manager has
    # captured the exception, i.e. after the with-block.
    self.assertEqual(str(raised.exception.message), expected_error)

    os.environ['REDIS_HOST'] = 'redis_host_ip'
    os.environ['REDIS_PASSWORD'] = '******'
    settings = config()
    expected_values = (
        ('redis_host', 'redis_host_ip'),
        ('redis_port', '6380'),
        ('redis_password', 'XX'),
        ('redis_connect_timeout', 10),
    )
    for key, expected in expected_values:
        self.assertEqual(settings[key], expected)
def get_couchdb(url=None, dbname=None, username=None, password=None):
    '''Return the database called dbname from a couchdb server.

    url, username, password -- handed unchanged to get_couch_server().
    dbname -- database to look up; when falsy, the 'couchdb_dbname'
              value from config() is used and, failing that, 'ucldc'.

    Returns the result of indexing the server object with the name.
    '''
    settings = config()
    # `or` short-circuits, so the config lookup only happens when the
    # caller gave no name, and 'ucldc' is the last resort.
    name = dbname or settings.get('couchdb_dbname', None) or 'ucldc'
    server = get_couch_server(url, username, password)
    return server[name]
def testConfigValues(self):
    '''Check each key returned by config.config() against its test_* value.'''
    cfg = config.config()
    # (config key, expected value) pairs, asserted in this order.
    expected_pairs = (
        ('redis_host', 'test_redis_host'),
        ('redis_port', 'test_redis_port'),
        ('redis_connect_timeout', 'test_redis_timeout'),
        ('redis_password', 'test_redis_password'),
        ('couchdb_url', 'test_couchdb_url'),
        ('couchdb_username', 'test_couchdb_user'),
        ('couchdb_password', 'test_couchdb_password'),
        ('couchdb_dbname', 'test_couchdb_dbname'),
        ('couchdb_dashboard', 'test_couchdb_dashname'),
    )
    for key, expected in expected_pairs:
        self.assertEqual(cfg[key], expected)
def testMain(self, mock_enrich_doc):
    '''Run main() against canned CouchDB responses.

    main in enrich_existing_couchd_doc takes a doc _id and the
    enrichment chain to run. It then downloads the doc, submits it for
    enrichment and then saves the resulting document.

    mock_enrich_doc -- mock standing in for the enrichment call
                       (presumably injected by a patch decorator that is
                       not visible here -- confirm).
    '''
    conf = config()
    self.url_couch_base = conf['couchdb_url']
    self.cdb = conf['couchdb_dbname']
    url_couchdb = os.path.join(self.url_couch_base, self.cdb)
    # Canned HEAD response for the database URL.
    httpretty.register_uri(httpretty.HEAD,
                           url_couchdb,
                           body='',
                           content_length='0',
                           content_type='text/plain; charset=utf-8',
                           connection='close',
                           server='CouchDB/1.5.0 (Erlang OTP/R16B03)',
                           cache_control='must-revalidate',
                           date='Mon, 24 Nov 2014 21:30:38 GMT')
    url_doc = os.path.join(
        url_couchdb,
        '5112--http%3A%2F%2Fark.cdlib.org%2Fark%3A%2F13030%2Fkt7580382j')
    # Read the fixtures through context managers so the file handles are
    # closed immediately instead of being left to the garbage collector.
    with open(DIR_FIXTURES + '/couchdb_doc.json') as doc_file:
        doc_returned = doc_file.read()
    with open(DIR_FIXTURES + '/akara_response.json') as akara_file:
        akara_response = akara_file.read()
    # Canned GET of the document to be enriched.
    httpretty.register_uri(
        httpretty.GET,
        url_doc,
        body=doc_returned,
        etag="2U5BW2TDDX9EHZJOO0DNE29D1",
        content_type='application/json',
        connection='close', )
    # Canned PUT response for saving the document.
    httpretty.register_uri(
        httpretty.PUT,
        url_doc,
        status=201,
        body=
        '{"ok":true, "id":"5112--http://ark.cdlib.org/ark:/13030/kt7580382j", "rev":"123456789"}',
        content_type='application/json',
        etag="123456789",
        connection='close', )
    # Canned response for the enrichment endpoint.
    httpretty.register_uri(
        httpretty.POST,
        'http://localhost:8889/enrich',
        body=akara_response, )
    mock_enrich_doc.return_value = json.loads(doc_returned)
    main('5112--http://ark.cdlib.org/ark:/13030/kt7580382j',
         '/select-oac-id,dpla_mapper?mapper_type=oac_dc')
    mock_enrich_doc.assert_called_with(
        json.loads(doc_returned),
        '/select-oac-id,dpla_mapper?mapper_type=oac_dc', 8889)
def __init__(self, rq_queue=None):
    '''Set up config, couchdb handle, redis connection and RQ queue.

    rq_queue -- optional queue name; when truthy it takes precedence
                over the 'rq_queue' value from config().

    Raises ValueError when neither source yields a queue name.
    '''
    self._config = config()
    self._couchdb = get_couchdb()
    cfg = self._config
    self._redis = Redis(
        host=cfg['redis_host'],
        port=cfg['redis_port'],
        password=cfg['redis_password'],
        socket_connect_timeout=cfg['redis_connect_timeout'])
    # The configured name is always read, even when an override is given.
    configured_queue = cfg['rq_queue']
    self.rqname = rq_queue if rq_queue else configured_queue
    if not self.rqname:
        raise ValueError('Must set RQ_QUEUE env var'
                         ' or pass in rq_queue to '
                         'CouchDBJobEnqueue')
    self._rQ = Queue(self.rqname, connection=self._redis)
def __init__(self, rq_queue=None):
    '''Create the couchdb handle, redis connection and RQ queue.

    rq_queue -- optional queue name overriding config()['rq_queue']
                when it is truthy.

    Raises ValueError if no queue name is available from either place.
    '''
    self._config = config()
    self._couchdb = get_couchdb()
    redis_kwargs = dict(
        host=self._config['redis_host'],
        port=self._config['redis_port'],
        password=self._config['redis_password'],
        socket_connect_timeout=self._config['redis_connect_timeout'])
    self._redis = Redis(**redis_kwargs)
    # Start from the configured name; an explicit argument wins.
    queue_name = self._config['rq_queue']
    if rq_queue:
        queue_name = rq_queue
    self.rqname = queue_name
    if not queue_name:
        message = ('Must set RQ_QUEUE env var'
                   ' or pass in rq_queue to '
                   'CouchDBJobEnqueue')
        raise ValueError(message)
    self._rQ = Queue(queue_name, connection=self._redis)
def __init__(
    self,
    cdb=None,
    url_couchdb=None,
    couchdb_name=None,
    couch_view=COUCHDB_VIEW,
    bucket_bases=BUCKET_BASES,
    object_auth=None,
    get_if_object=False,
    url_cache=None,
    hash_cache=None,
    harvested_object_cache=None,
):
    """Wire up the couchdb handle, redis connection and the three caches.

    cdb -- ready-made couchdb database object; when falsy one is obtained
        through get_couchdb().
    url_couchdb -- server URL for get_couchdb(); falls back to the
        'couchdb_url' config value.
    couchdb_name -- database name handed to get_couchdb().
    couch_view -- stored as self._view.
    bucket_bases -- stored as self._bucket_bases.
    object_auth -- tuple of (username, password), stored as self._auth.
    get_if_object -- stored as self.get_if_object.
    url_cache, hash_cache, harvested_object_cache -- caller-supplied
        mappings; each one that is None is replaced by a redis-backed
        redis_collections.Dict.
    """
    self._config = config()
    # NOTE(review): deliberately a truthiness test, as in the original;
    # some couchdb database objects may perform a request when
    # truth-tested -- confirm before changing this to `is not None`.
    if cdb:
        self._couchdb = cdb
    else:
        if not url_couchdb:
            url_couchdb = self._config["couchdb_url"]
        self._couchdb = get_couchdb(url=url_couchdb, dbname=couchdb_name)
    self._bucket_bases = bucket_bases
    self._view = couch_view
    # auth is a tuple of username, password
    self._auth = object_auth
    self.get_if_object = get_if_object  # if object field exists, get
    self._redis = Redis(
        host=self._config["redis_host"],
        port=self._config["redis_port"],
        password=self._config["redis_password"],
        socket_connect_timeout=self._config["redis_connect_timeout"],
    )
    self._url_cache = (
        url_cache
        if url_cache is not None
        else redis_collections.Dict(key="ucldc-image-url-cache", redis=self._redis)
    )
    self._hash_cache = (
        hash_cache
        if hash_cache is not None
        else redis_collections.Dict(key="ucldc-image-hash-cache", redis=self._redis)
    )
    # Compare against None like the two caches above: a truthiness test
    # would discard an empty (but valid) cache supplied by the caller.
    self._object_cache = (
        harvested_object_cache
        if harvested_object_cache is not None
        else redis_collections.Dict(key="ucldc:harvester:harvested-images", redis=self._redis)
    )
def testMain(self, mock_enrich_doc):
    '''Run main() against canned CouchDB responses.

    main in enrich_existing_couchd_doc takes a doc _id and the
    enrichment chain to run. It then downloads the doc, submits it for
    enrichment and then saves the resulting document.

    mock_enrich_doc -- mock standing in for the enrichment call
                       (presumably injected by a patch decorator that is
                       not visible here -- confirm).
    '''
    conf = config()
    self.url_couch_base = conf['couchdb_url']
    self.cdb = conf['couchdb_dbname']
    url_couchdb = os.path.join(self.url_couch_base, self.cdb)
    # Canned HEAD response for the database URL.
    httpretty.register_uri(httpretty.HEAD,
            url_couchdb,
            body='',
            content_length='0',
            content_type='text/plain; charset=utf-8',
            connection='close',
            server='CouchDB/1.5.0 (Erlang OTP/R16B03)',
            cache_control='must-revalidate',
            date='Mon, 24 Nov 2014 21:30:38 GMT'
            )
    url_doc = os.path.join(url_couchdb,
            '5112--http%3A%2F%2Fark.cdlib.org%2Fark%3A%2F13030%2Fkt7580382j')
    # Fixtures are read inside `with` so the handles are closed right
    # away rather than leaked until garbage collection.
    with open(DIR_FIXTURES+'/couchdb_doc.json') as fixture:
        doc_returned = fixture.read()
    with open(DIR_FIXTURES+'/akara_response.json') as fixture:
        enrich_response = fixture.read()
    # Canned GET of the document to be enriched.
    httpretty.register_uri(httpretty.GET,
            url_doc,
            body=doc_returned,
            etag="2U5BW2TDDX9EHZJOO0DNE29D1",
            content_type='application/json',
            connection='close',
            )
    # Canned PUT response for saving the document.
    httpretty.register_uri(httpretty.PUT,
            url_doc,
            status=201,
            body='{"ok":true, "id":"5112--http://ark.cdlib.org/ark:/13030/kt7580382j", "rev":"123456789"}',
            content_type='application/json',
            etag="123456789",
            connection='close',
            )
    # Canned response for the enrichment endpoint.
    httpretty.register_uri(httpretty.POST,
            'http://localhost:8889/enrich',
            body=enrich_response,
            )
    mock_enrich_doc.return_value = json.loads(doc_returned)
    main('5112--http://ark.cdlib.org/ark:/13030/kt7580382j',
            '/select-oac-id,dpla_mapper?mapper_type=oac_dc')
    mock_enrich_doc.assert_called_with(json.loads(doc_returned),
            '/select-oac-id,dpla_mapper?mapper_type=oac_dc', 8889)
def get_couch_server(url=None, username=None, password=None):
    '''Returns a couchdb library Server object

    url -- server URL; when falsy the 'couchdb_url' config value is used.
    username, password -- credentials; each one that is None is looked up
        in the config ('couchdb_username' / 'couchdb_password'). When a
        username is available the credentials are embedded in the URL.

    Raises ValueError when credentials must be embedded but the URL
    contains no "//" separator.
    '''
    env = config()
    if not url:
        url = env['couchdb_url']
    if username is None:
        username = env.get('couchdb_username', None)
    if password is None:
        password = env.get('couchdb_password', None)
    # URL shown on stdout; never carries the password added below.
    display_url = url
    if username:
        # Split on the first "//" only, so a URL containing a later "//"
        # no longer fails to unpack.
        schema, separator, uri = url.partition("//")
        if not separator:
            raise ValueError(
                'Cannot add credentials to URL without "//": {0}'.format(url))
        if password is None:
            # Do not embed the literal text "None" as a password.
            url = "{0}//{1}@{2}".format(schema, username, uri)
            display_url = url
        else:
            url = "{0}//{1}:{2}@{3}".format(schema, username, password, uri)
            display_url = "{0}//{1}:***@{2}".format(schema, username, uri)
    py_version = sys.version_info
    if py_version.major == 2 and py_version.minor == 7 and py_version.micro > 8:
        # disable ssl verification
        # NOTE: this replaces the process-wide default HTTPS context.
        import ssl
        ssl._create_default_https_context = ssl._create_unverified_context
    print("URL:{}".format(display_url))
    return couchdb.Server(url)
def setUp(self):
    '''Prepare config values, a canned HEAD response and a CouchDBWorker.

    Also stores a pass-through function on self.function that returns
    its arguments as (doc, args, kwargs).
    '''
    self.conf = config()
    self.url_couch_base = self.conf['couchdb_url']
    self.cdb = self.conf['couchdb_dbname']
    # Canned HEAD response for the database URL (presumably what lets
    # CouchDBWorker() be built without a live server -- confirm).
    head_response = dict(
        body='',
        content_length='0',
        content_type='text/plain; charset=utf-8',
        connection='close',
        server='CouchDB/1.5.0 (Erlang OTP/R16B03)',
        cache_control='must-revalidate',
        date='Mon, 24 Nov 2014 21:30:38 GMT',
    )
    httpretty.register_uri(
        httpretty.HEAD,
        os.path.join(self.url_couch_base, self.cdb),
        **head_response)
    self._cdbworker = CouchDBWorker()

    def func_for_test(doc, *args, **kwargs):
        return doc, args, kwargs

    self.function = func_for_test
def setUp(self):
    '''Load config, register a canned HEAD response, build the worker.

    self.function is set to a helper that returns (doc, args, kwargs)
    exactly as it received them.
    '''
    settings = config()
    self.conf = settings
    self.url_couch_base = settings['couchdb_url']
    self.cdb = settings['couchdb_dbname']
    database_url = os.path.join(self.url_couch_base, self.cdb)
    # Canned HEAD response for the database URL.
    httpretty.register_uri(
        httpretty.HEAD,
        database_url,
        body='',
        content_length='0',
        content_type='text/plain; charset=utf-8',
        connection='close',
        server='CouchDB/1.5.0 (Erlang OTP/R16B03)',
        cache_control='must-revalidate',
        date='Mon, 24 Nov 2014 21:30:38 GMT',
    )
    self._cdbworker = CouchDBWorker()

    def func_for_test(doc, *args, **kwargs):
        return doc, args, kwargs

    self.function = func_for_test
def __init__(self,
             cdb=None,
             url_couchdb=None,
             couchdb_name=None,
             couch_view=COUCHDB_VIEW,
             bucket_bases=BUCKET_BASES,
             object_auth=None,
             get_if_object=False,
             ignore_content_type=False,
             url_cache=None,
             hash_cache=None,
             harvested_object_cache=None):
    '''Wire up the couchdb handle, redis connection and the three caches.

    cdb -- ready-made couchdb database object; when falsy one is obtained
        through get_couchdb().
    url_couchdb -- server URL for get_couchdb(); falls back to the
        'couchdb_url' config value.
    couchdb_name -- database name handed to get_couchdb().
    couch_view -- stored as self._view.
    bucket_bases -- stored as self._bucket_bases.
    object_auth -- tuple of (username, password), stored as self._auth.
    get_if_object -- stored as self.get_if_object.
    ignore_content_type -- stored as self.ignore_content_type.
    url_cache, hash_cache, harvested_object_cache -- caller-supplied
        mappings; each one that is None is replaced by a redis-backed
        redis_collections.Dict.
    '''
    self._config = config()
    # NOTE(review): deliberately a truthiness test, as in the original;
    # some couchdb database objects may perform a request when
    # truth-tested -- confirm before changing this to `is not None`.
    if cdb:
        self._couchdb = cdb
    else:
        if not url_couchdb:
            url_couchdb = self._config['couchdb_url']
        self._couchdb = get_couchdb(url=url_couchdb, dbname=couchdb_name)
    self._bucket_bases = bucket_bases
    self._view = couch_view
    # auth is a tuple of username, password
    self._auth = object_auth
    self.get_if_object = get_if_object  # if object field exists, get
    # Don't check content-type in headers
    self.ignore_content_type = ignore_content_type
    self._redis = Redis(
        host=self._config['redis_host'],
        port=self._config['redis_port'],
        password=self._config['redis_password'],
        socket_connect_timeout=self._config['redis_connect_timeout'])
    if url_cache is not None:
        self._url_cache = url_cache
    else:
        self._url_cache = redis_collections.Dict(
            key='ucldc-image-url-cache', redis=self._redis)
    if hash_cache is not None:
        self._hash_cache = hash_cache
    else:
        self._hash_cache = redis_collections.Dict(
            key='ucldc-image-hash-cache', redis=self._redis)
    # Compare against None like the two caches above: a truthiness test
    # would discard an empty (but valid) cache supplied by the caller.
    if harvested_object_cache is not None:
        self._object_cache = harvested_object_cache
    else:
        self._object_cache = redis_collections.Dict(
            key='ucldc:harvester:harvested-images', redis=self._redis)
# Walk every page of the registry collection API and queue a harvest on
# the 'normal-production' queue for each collection that is ready for
# publication.
import requests
from harvester.config import config
from harvester.scripts.queue_harvest import main as queue_harvest

env = config()
c_prod = []     # collections with ready_for_publication set
c_harvest = []  # collections with a url_harvest value
url_reg = "https://registry.cdlib.org"
url_reg_api = '{}{}'.format(url_reg, "/api/v1/collection/")
url = '{}{}'.format(url_reg_api, "?format=json&limit=1000")
resp = requests.get(url)
api = resp.json()
nextpage = api['meta']['next']
print("NEXTPAGE:{}".format(nextpage))
# Handle the page in hand BEFORE deciding whether to fetch another one.
# Testing `nextpage` first skipped the final page (and everything, when
# the API returned a single page).
while True:
    for o in api['objects']:
        if o['ready_for_publication']:
            c_prod.append(o)
            url_api_collection = '{}{}/'.format(url_reg_api, o['id'])
            print(url_api_collection)
            queue_harvest('*****@*****.**',
                          url_api_collection,
                          redis_host=env['redis_host'],
                          redis_port=env['redis_port'],
                          redis_pswd=env['redis_password'],
                          rq_queue='normal-production')
        # NOTE(review): assumed to be independent of the
        # ready_for_publication check above -- confirm the intended nesting.
        if o['url_harvest']:
            c_harvest.append(o)
    if not nextpage:
        break
    # 'next' is joined onto the registry host, which is now taken from
    # url_reg rather than a second hard-coded copy.
    resp = requests.get(''.join((url_reg, nextpage)))
    api = resp.json()
    nextpage = api['meta']['next']
    print("NEXTPAGE:{}".format(nextpage))
'''one time script to populate redis with harvested image object data'''
from harvester.config import config
from harvester.couchdb_init import get_couchdb
from harvester.couchdb_pager import couchdb_pager
from redis import Redis
import redis_collections

_config = config()
_redis = Redis(
    host=_config['redis_host'],
    port=_config['redis_port'],
    password=_config['redis_password'],
    socket_connect_timeout=_config['redis_connect_timeout'],
)
# Redis-backed mapping: doc _id -> [object, object_dimensions]
object_cache = redis_collections.Dict(
    key='ucldc:harvester:harvested-images',
    redis=_redis,
)
_couchdb = get_couchdb(url=_config['couchdb_url'], dbname='ucldc')
v = couchdb_pager(_couchdb, include_docs='true')
for r in v:
    doc = r.doc
    # Only documents that carry an 'object' entry are of interest.
    if 'object' not in doc:
        continue
    did = doc['_id']
    if 'object_dimensions' not in doc:
        print("NO DIMS for {} -- not caching".format(did))
        continue
    object_cache[did] = [doc['object'], doc['object_dimensions']]
    # Read the entry back from the cache for the report line.
    print("OBJECT CACHE : {} === {}".format(did, object_cache[did]))
from harvester.collection_registry_client import ResourceIterator from harvester.collection_registry_client import url_base, api_path from harvester.config import config from harvester.scripts.queue_harvest import main as queue_harvest for c in ResourceIterator(url_base, api_path + 'collection', 'collection'): if c.harvest_type != 'X': print c.name, c.slug, c.harvest_type, c.url_harvest env = config() queue_harvest('*****@*****.**', url_base + c.resource_uri, redis_host=env['redis_host'], redis_port=env['redis_port'], redis_pswd=env['redis_password'], id_ec2_ingest=env['id_ec2_ingest'], id_ec2_solr=env['id_ec2_solr_build'], job_timeout=6000)