Exemplo n.º 1
0
 def testConfig(self):
     '''Exercise config() without and with the redis environment set.

     config(redis_required=True) is first expected to raise a KeyError
     carrying the "set REDIS_PASSWORD" message. REDIS_HOST and
     REDIS_PASSWORD are then written to os.environ and the values
     returned by config() are compared with the expected ones.
     '''
     with self.assertRaises(KeyError) as raised:
         config(redis_required=True)
     expected_msg = ('Please set environment variable REDIS_PASSWORD to '
                     'redis password!')
     self.assertEqual(str(raised.exception.message), expected_msg)
     # NOTE: this method does not remove these variables again.
     os.environ['REDIS_HOST'] = 'redis_host_ip'
     os.environ['REDIS_PASSWORD'] = '******'
     settings = config()
     expectations = (
         ('redis_host', 'redis_host_ip'),
         ('redis_port', '6380'),
         ('redis_password', 'XX'),
         ('redis_connect_timeout', 10),
     )
     for key, wanted in expectations:
         self.assertEqual(settings[key], wanted)
Exemplo n.º 2
0
 def testConfig(self):
     '''Exercise config() without and with the redis environment set.

     config(redis_required=True) is first expected to raise a KeyError
     carrying the "set REDIS_PASSWORD" message. REDIS_HOST and
     REDIS_PASSWORD are then written to os.environ and the values
     returned by config() are compared with the expected ones.
     '''
     with self.assertRaises(KeyError) as raised:
         config(redis_required=True)
     expected_msg = ('Please set environment variable REDIS_PASSWORD to '
                     'redis password!')
     self.assertEqual(str(raised.exception.message), expected_msg)
     # NOTE: this method does not remove these variables again.
     os.environ['REDIS_HOST'] = 'redis_host_ip'
     os.environ['REDIS_PASSWORD'] = '******'
     settings = config()
     expectations = (
         ('redis_host', 'redis_host_ip'),
         ('redis_port', '6380'),
         ('redis_password', 'XX'),
         ('redis_connect_timeout', 10),
     )
     for key, wanted in expectations:
         self.assertEqual(settings[key], wanted)
Exemplo n.º 3
0
def get_couchdb(url=None, dbname=None, username=None, password=None):
    '''Return the database named dbname from the couchdb server.

    The server object comes from get_couch_server(url, username, password)
    and is indexed with the database name. When dbname is empty the
    'couchdb_dbname' entry of config() is used, and when that is empty
    too the name falls back to 'ucldc'.
    '''
    env = config()
    # First truthy value wins: argument, then configuration, then default.
    dbname = dbname or env.get('couchdb_dbname', None) or 'ucldc'
    server = get_couch_server(url, username, password)
    return server[dbname]
Exemplo n.º 4
0
def get_couchdb(url=None, dbname=None, username=None, password=None):
    '''Return the database named dbname from the couchdb server.

    The server object comes from get_couch_server(url, username, password)
    and is indexed with the database name. When dbname is empty the
    'couchdb_dbname' entry of config() is used, and when that is empty
    too the name falls back to 'ucldc'.
    '''
    env = config()
    # First truthy value wins: argument, then configuration, then default.
    dbname = dbname or env.get('couchdb_dbname', None) or 'ucldc'
    server = get_couch_server(url, username, password)
    return server[dbname]
Exemplo n.º 5
0
 def testConfigValues(self):
     '''Compare the redis and couchdb entries returned by
     config.config() with the test_* values expected for them.
     '''
     cfg = config.config()
     expected_pairs = (
         ('redis_host', 'test_redis_host'),
         ('redis_port', 'test_redis_port'),
         ('redis_connect_timeout', 'test_redis_timeout'),
         ('redis_password', 'test_redis_password'),
         ('couchdb_url', 'test_couchdb_url'),
         ('couchdb_username', 'test_couchdb_user'),
         ('couchdb_password', 'test_couchdb_password'),
         ('couchdb_dbname', 'test_couchdb_dbname'),
         ('couchdb_dashboard', 'test_couchdb_dashname'),
     )
     # Checked in the listed order; the first mismatch fails the test.
     for key, wanted in expected_pairs:
         self.assertEqual(cfg[key], wanted)
Exemplo n.º 6
0
 def testMain(self, mock_enrich_doc):
     ''' main in enrich_existing_couchd_doc takes a doc _id and
     the enrichment chain to run. It then downloads the doc, submits it
     for enrichment and then saves the resulting document.

     The CouchDB HEAD/GET/PUT URLs and the enrichment POST URL are
     registered with httpretty. mock_enrich_doc (presumably injected by a
     patch decorator outside this view) returns the fixture document, and
     the test asserts that it was called with that document, the
     enrichment chain and port 8889.
     '''
     conf = config()
     self.url_couch_base = conf['couchdb_url']
     self.cdb = conf['couchdb_dbname']
     url_couchdb = os.path.join(self.url_couch_base, self.cdb)
     httpretty.register_uri(httpretty.HEAD,
                            url_couchdb,
                            body='',
                            content_length='0',
                            content_type='text/plain; charset=utf-8',
                            connection='close',
                            server='CouchDB/1.5.0 (Erlang OTP/R16B03)',
                            cache_control='must-revalidate',
                            date='Mon, 24 Nov 2014 21:30:38 GMT')
     url_doc = os.path.join(
         url_couchdb,
         '5112--http%3A%2F%2Fark.cdlib.org%2Fark%3A%2F13030%2Fkt7580382j')
     # Read the fixtures through context managers so the file handles are
     # closed right away instead of being left open until collected.
     with open(DIR_FIXTURES + '/couchdb_doc.json') as doc_fixture:
         doc_returned = doc_fixture.read()
     with open(DIR_FIXTURES + '/akara_response.json') as akara_fixture:
         akara_response = akara_fixture.read()
     httpretty.register_uri(
         httpretty.GET,
         url_doc,
         body=doc_returned,
         etag="2U5BW2TDDX9EHZJOO0DNE29D1",
         content_type='application/json',
         connection='close',
     )
     httpretty.register_uri(
         httpretty.PUT,
         url_doc,
         status=201,
         body=
         '{"ok":true, "id":"5112--http://ark.cdlib.org/ark:/13030/kt7580382j", "rev":"123456789"}',
         content_type='application/json',
         etag="123456789",
         connection='close',
     )
     httpretty.register_uri(
         httpretty.POST,
         'http://localhost:8889/enrich',
         body=akara_response,
     )
     mock_enrich_doc.return_value = json.loads(doc_returned)
     main('5112--http://ark.cdlib.org/ark:/13030/kt7580382j',
          '/select-oac-id,dpla_mapper?mapper_type=oac_dc')
     mock_enrich_doc.assert_called_with(
         json.loads(doc_returned),
         '/select-oac-id,dpla_mapper?mapper_type=oac_dc', 8889)
Exemplo n.º 7
0
 def __init__(self, rq_queue=None):
     '''Create the couchdb handle, the redis connection and the rq Queue.

     rq_queue -- optional queue name; when truthy it takes precedence
                 over the 'rq_queue' entry of config().

     Raises ValueError when neither source provides a queue name.
     '''
     self._config = config()
     self._couchdb = get_couchdb()
     cfg = self._config
     self._redis = Redis(
         host=cfg['redis_host'],
         port=cfg['redis_port'],
         password=cfg['redis_password'],
         socket_connect_timeout=cfg['redis_connect_timeout'])
     # The configured name is always looked up, even when overridden.
     configured_name = cfg['rq_queue']
     self.rqname = rq_queue if rq_queue else configured_name
     if not self.rqname:
         raise ValueError('Must set RQ_QUEUE env var or pass in rq_queue '
                          'to CouchDBJobEnqueue')
     self._rQ = Queue(self.rqname, connection=self._redis)
Exemplo n.º 8
0
 def __init__(self, rq_queue=None):
     '''Create the couchdb handle, the redis connection and the rq Queue.

     rq_queue -- optional queue name; when truthy it takes precedence
                 over the 'rq_queue' entry of config().

     Raises ValueError when neither source provides a queue name.
     '''
     self._config = config()
     self._couchdb = get_couchdb()
     cfg = self._config
     self._redis = Redis(
         host=cfg['redis_host'],
         port=cfg['redis_port'],
         password=cfg['redis_password'],
         socket_connect_timeout=cfg['redis_connect_timeout'])
     # The configured name is always looked up, even when overridden.
     configured_name = cfg['rq_queue']
     self.rqname = rq_queue if rq_queue else configured_name
     if not self.rqname:
         raise ValueError('Must set RQ_QUEUE env var or pass in rq_queue '
                          'to CouchDBJobEnqueue')
     self._rQ = Queue(self.rqname, connection=self._redis)
Exemplo n.º 9
0
 def __init__(
     self,
     cdb=None,
     url_couchdb=None,
     couchdb_name=None,
     couch_view=COUCHDB_VIEW,
     bucket_bases=BUCKET_BASES,
     object_auth=None,
     get_if_object=False,
     url_cache=None,
     hash_cache=None,
     harvested_object_cache=None,
 ):
     """Wire up the couchdb handle, the redis connection and the caches.

     cdb -- used as the couchdb handle when truthy; otherwise one is
         obtained from get_couchdb() using url_couchdb (or the
         'couchdb_url' config entry when that is empty) and couchdb_name.
     couch_view, bucket_bases -- stored for later use.
     object_auth -- tuple of username, password.
     get_if_object -- if the object field exists, get.
     url_cache, hash_cache, harvested_object_cache -- caches to use in
         place of the redis-backed defaults. Each one is used whenever it
         is not None, so an empty container is kept as passed.
     """
     self._config = config()
     if cdb:
         self._couchdb = cdb
     else:
         if not url_couchdb:
             url_couchdb = self._config["couchdb_url"]
         self._couchdb = get_couchdb(url=url_couchdb, dbname=couchdb_name)
     self._bucket_bases = bucket_bases
     self._view = couch_view
     # auth is a tuple of username, password
     self._auth = object_auth
     self.get_if_object = get_if_object  # if object field exists, get
     self._redis = Redis(
         host=self._config["redis_host"],
         port=self._config["redis_port"],
         password=self._config["redis_password"],
         socket_connect_timeout=self._config["redis_connect_timeout"],
     )
     self._url_cache = (
         url_cache
         if url_cache is not None
         else redis_collections.Dict(key="ucldc-image-url-cache", redis=self._redis)
     )
     self._hash_cache = (
         hash_cache
         if hash_cache is not None
         else redis_collections.Dict(key="ucldc-image-hash-cache", redis=self._redis)
     )
     # Compare against None like the two caches above: a truthiness test
     # would discard an empty cache supplied by the caller and silently
     # fall back to the redis-backed one.
     self._object_cache = (
         harvested_object_cache
         if harvested_object_cache is not None
         else redis_collections.Dict(key="ucldc:harvester:harvested-images", redis=self._redis)
     )
Exemplo n.º 10
0
 def testMain(self, mock_enrich_doc):
     ''' main in enrich_existing_couchd_doc takes a doc _id and
     the enrichment chain to run. It then downloads the doc, submits it
     for enrichment and then saves the resulting document.

     The CouchDB HEAD/GET/PUT URLs and the enrichment POST URL are
     registered with httpretty. mock_enrich_doc (presumably injected by a
     patch decorator outside this view) returns the fixture document, and
     the test asserts that it was called with that document, the
     enrichment chain and port 8889.
     '''
     conf = config()
     self.url_couch_base = conf['couchdb_url']
     self.cdb = conf['couchdb_dbname']
     url_couchdb = os.path.join(self.url_couch_base, self.cdb)
     httpretty.register_uri(httpretty.HEAD,
             url_couchdb,
             body='',
             content_length='0',
             content_type='text/plain; charset=utf-8',
             connection='close',
             server='CouchDB/1.5.0 (Erlang OTP/R16B03)',
             cache_control='must-revalidate',
             date='Mon, 24 Nov 2014 21:30:38 GMT'
             )
     url_doc = os.path.join(url_couchdb,
             '5112--http%3A%2F%2Fark.cdlib.org%2Fark%3A%2F13030%2Fkt7580382j')
     # Read the fixtures through context managers so the file handles are
     # closed right away instead of being left open until collected.
     with open(DIR_FIXTURES+'/couchdb_doc.json') as doc_fixture:
         doc_returned = doc_fixture.read()
     with open(DIR_FIXTURES+'/akara_response.json') as akara_fixture:
         akara_response = akara_fixture.read()
     httpretty.register_uri(httpretty.GET,
             url_doc,
             body=doc_returned,
             etag="2U5BW2TDDX9EHZJOO0DNE29D1",
             content_type='application/json',
             connection='close',
             )
     httpretty.register_uri(httpretty.PUT,
             url_doc,
             status=201,
             body='{"ok":true, "id":"5112--http://ark.cdlib.org/ark:/13030/kt7580382j", "rev":"123456789"}',
             content_type='application/json',
             etag="123456789",
             connection='close',
             )
     httpretty.register_uri(httpretty.POST,
             'http://localhost:8889/enrich',
             body=akara_response,
             )
     mock_enrich_doc.return_value = json.loads(doc_returned)
     main('5112--http://ark.cdlib.org/ark:/13030/kt7580382j',
         '/select-oac-id,dpla_mapper?mapper_type=oac_dc')
     mock_enrich_doc.assert_called_with(json.loads(doc_returned),
             '/select-oac-id,dpla_mapper?mapper_type=oac_dc', 8889)
Exemplo n.º 11
0
def get_couch_server(url=None, username=None, password=None):
    '''Returns a couchdb library Server object

    Any of url, username and password that is not supplied is looked up
    in config() ('couchdb_url', 'couchdb_username', 'couchdb_password').
    When a username is available the credentials are embedded in the URL
    as scheme//username:password@rest.

    The URL echoed to stdout has the embedded password masked so that the
    credential does not end up in console output.
    '''
    env = config()
    if not url:
        url = env['couchdb_url']
    if username is None:
        username = env.get('couchdb_username', None)
    if password is None:
        password = env.get('couchdb_password', None)
    url_to_show = url
    if username:
        # Split on the first "//" only; a second "//" later in the URL
        # would otherwise make the two-name unpacking raise ValueError.
        schema, uri = url.split("//", 1)
        url_template = "{0}//{1}:{2}@{3}"
        url = url_template.format(schema, username, password, uri)
        url_to_show = url_template.format(schema, username, '****', uri)
    py_version = sys.version_info
    if py_version.major == 2 and py_version.minor == 7 and py_version.micro > 8:
        # NOTE(security): this replaces the ssl module's default HTTPS
        # context factory, which disables certificate verification for
        # the whole process and not only for this server object.
        import ssl
        ssl._create_default_https_context = ssl._create_unverified_context
    print("URL:{}".format(url_to_show))
    return couchdb.Server(url)
Exemplo n.º 12
0
def get_couch_server(url=None, username=None, password=None):
    '''Returns a couchdb library Server object

    Any of url, username and password that is not supplied is looked up
    in config() ('couchdb_url', 'couchdb_username', 'couchdb_password').
    When a username is available the credentials are embedded in the URL
    as scheme//username:password@rest.

    The URL echoed to stdout has the embedded password masked so that the
    credential does not end up in console output.
    '''
    env = config()
    if not url:
        url = env['couchdb_url']
    if username is None:
        username = env.get('couchdb_username', None)
    if password is None:
        password = env.get('couchdb_password', None)
    url_to_show = url
    if username:
        # Split on the first "//" only; a second "//" later in the URL
        # would otherwise make the two-name unpacking raise ValueError.
        schema, uri = url.split("//", 1)
        url_template = "{0}//{1}:{2}@{3}"
        url = url_template.format(schema, username, password, uri)
        url_to_show = url_template.format(schema, username, '****', uri)
    py_version = sys.version_info
    if py_version.major == 2 and py_version.minor == 7 and py_version.micro > 8:
        # NOTE(security): this replaces the ssl module's default HTTPS
        # context factory, which disables certificate verification for
        # the whole process and not only for this server object.
        import ssl
        ssl._create_default_https_context = ssl._create_unverified_context
    print("URL:{}".format(url_to_show))
    return couchdb.Server(url)
Exemplo n.º 13
0
 def setUp(self):
     '''Load the couchdb settings, register a canned HEAD response for
     the database URL with httpretty, create the CouchDBWorker and
     store an echo function as self.function.
     '''
     self.conf = config()
     self.url_couch_base = self.conf['couchdb_url']
     self.cdb = self.conf['couchdb_dbname']
     db_url = os.path.join(self.url_couch_base, self.cdb)
     # Response headers for the faked HEAD request on the database URL.
     head_headers = dict(
         content_length='0',
         content_type='text/plain; charset=utf-8',
         connection='close',
         server='CouchDB/1.5.0 (Erlang OTP/R16B03)',
         cache_control='must-revalidate',
         date='Mon, 24 Nov 2014 21:30:38 GMT')
     httpretty.register_uri(httpretty.HEAD, db_url, body='',
                            **head_headers)
     self._cdbworker = CouchDBWorker()

     def func_for_test(doc, *args, **kwargs):
         # Hand the arguments straight back as a 3-tuple.
         return doc, args, kwargs

     self.function = func_for_test
Exemplo n.º 14
0
    def setUp(self):
        '''Load the couchdb settings, register a canned HEAD response for
        the database URL with httpretty, create the CouchDBWorker and
        store an echo function as self.function.
        '''
        self.conf = config()
        self.url_couch_base = self.conf['couchdb_url']
        self.cdb = self.conf['couchdb_dbname']
        db_url = os.path.join(self.url_couch_base, self.cdb)
        # Response headers for the faked HEAD request on the database URL.
        head_headers = dict(
            content_length='0',
            content_type='text/plain; charset=utf-8',
            connection='close',
            server='CouchDB/1.5.0 (Erlang OTP/R16B03)',
            cache_control='must-revalidate',
            date='Mon, 24 Nov 2014 21:30:38 GMT')
        httpretty.register_uri(httpretty.HEAD, db_url, body='',
                               **head_headers)
        self._cdbworker = CouchDBWorker()

        def func_for_test(doc, *args, **kwargs):
            # Hand the arguments straight back as a 3-tuple.
            return doc, args, kwargs

        self.function = func_for_test
Exemplo n.º 15
0
 def __init__(self,
              cdb=None,
              url_couchdb=None,
              couchdb_name=None,
              couch_view=COUCHDB_VIEW,
              bucket_bases=BUCKET_BASES,
              object_auth=None,
              get_if_object=False,
              ignore_content_type=False,
              url_cache=None,
              hash_cache=None,
              harvested_object_cache=None):
     '''Wire up the couchdb handle, the redis connection and the caches.

     cdb -- used as the couchdb handle when truthy; otherwise one is
         obtained from get_couchdb() using url_couchdb (or the
         'couchdb_url' config entry when that is empty) and couchdb_name.
     couch_view, bucket_bases -- stored for later use.
     object_auth -- tuple of username, password.
     get_if_object -- if the object field exists, get.
     ignore_content_type -- don't check content-type in headers.
     url_cache, hash_cache, harvested_object_cache -- caches to use in
         place of the redis-backed defaults. Each one is used whenever it
         is not None, so an empty container is kept as passed.
     '''
     self._config = config()
     if cdb:
         self._couchdb = cdb
     else:
         if not url_couchdb:
             url_couchdb = self._config['couchdb_url']
         self._couchdb = get_couchdb(url=url_couchdb, dbname=couchdb_name)
     self._bucket_bases = bucket_bases
     self._view = couch_view
     # auth is a tuple of username, password
     self._auth = object_auth
     self.get_if_object = get_if_object  # if object field exists, get
     self.ignore_content_type = ignore_content_type  # Don't check content-type in headers
     self._redis = Redis(
         host=self._config['redis_host'],
         port=self._config['redis_port'],
         password=self._config['redis_password'],
         socket_connect_timeout=self._config['redis_connect_timeout'])
     if url_cache is None:
         url_cache = redis_collections.Dict(key='ucldc-image-url-cache',
                                            redis=self._redis)
     self._url_cache = url_cache
     if hash_cache is None:
         hash_cache = redis_collections.Dict(key='ucldc-image-hash-cache',
                                             redis=self._redis)
     self._hash_cache = hash_cache
     # Compare against None like the two caches above: a truthiness test
     # would discard an empty cache supplied by the caller and silently
     # fall back to the redis-backed one.
     if harvested_object_cache is None:
         harvested_object_cache = redis_collections.Dict(
             key='ucldc:harvester:harvested-images',
             redis=self._redis)
     self._object_cache = harvested_object_cache
Exemplo n.º 16
0
 def __init__(self,
              cdb=None,
              url_couchdb=None,
              couchdb_name=None,
              couch_view=COUCHDB_VIEW,
              bucket_bases=BUCKET_BASES,
              object_auth=None,
              get_if_object=False,
              ignore_content_type=False,
              url_cache=None,
              hash_cache=None,
              harvested_object_cache=None):
     '''Wire up the couchdb handle, the redis connection and the caches.

     cdb -- used as the couchdb handle when truthy; otherwise one is
         obtained from get_couchdb() using url_couchdb (or the
         'couchdb_url' config entry when that is empty) and couchdb_name.
     couch_view, bucket_bases -- stored for later use.
     object_auth -- tuple of username, password.
     get_if_object -- if the object field exists, get.
     ignore_content_type -- don't check content-type in headers.
     url_cache, hash_cache, harvested_object_cache -- caches to use in
         place of the redis-backed defaults. Each one is used whenever it
         is not None, so an empty container is kept as passed.
     '''
     self._config = config()
     if cdb:
         self._couchdb = cdb
     else:
         if not url_couchdb:
             url_couchdb = self._config['couchdb_url']
         self._couchdb = get_couchdb(url=url_couchdb, dbname=couchdb_name)
     self._bucket_bases = bucket_bases
     self._view = couch_view
     # auth is a tuple of username, password
     self._auth = object_auth
     self.get_if_object = get_if_object  # if object field exists, get
     self.ignore_content_type = ignore_content_type  # Don't check content-type in headers
     self._redis = Redis(
         host=self._config['redis_host'],
         port=self._config['redis_port'],
         password=self._config['redis_password'],
         socket_connect_timeout=self._config['redis_connect_timeout'])
     if url_cache is None:
         url_cache = redis_collections.Dict(key='ucldc-image-url-cache',
                                            redis=self._redis)
     self._url_cache = url_cache
     if hash_cache is None:
         hash_cache = redis_collections.Dict(key='ucldc-image-hash-cache',
                                             redis=self._redis)
     self._hash_cache = hash_cache
     # Compare against None like the two caches above: a truthiness test
     # would discard an empty cache supplied by the caller and silently
     # fall back to the redis-backed one.
     if harvested_object_cache is None:
         harvested_object_cache = redis_collections.Dict(
             key='ucldc:harvester:harvested-images',
             redis=self._redis)
     self._object_cache = harvested_object_cache
import requests
from harvester.config import config
from harvester.scripts.queue_harvest import main as queue_harvest
# Walk the registry's paginated collection listing. Every collection that
# is ready for publication gets a harvest job queued on the
# 'normal-production' queue; collections with a harvest URL are recorded
# in c_harvest.
env = config()
c_prod = []
c_harvest = []
url_reg = "https://registry.cdlib.org"
url_reg_api = '{}{}'.format(url_reg, "/api/v1/collection/")
url = '{}{}'.format(url_reg_api, "?format=json&limit=1000")
# Fetch first, then process, then advance. The earlier loop tested
# `nextpage` before processing, so the final page (whose 'next' is empty)
# was never handled, and a listing that fit on one page queued nothing.
page_url = url
while page_url:
    resp = requests.get(page_url)
    api = resp.json()
    nextpage = api['meta']['next']
    print("NEXTPAGE:{}".format(nextpage))
    for o in api['objects']:
        if o['ready_for_publication']:
            c_prod.append(o)
            url_api_collection = '{}{}/'.format(url_reg_api, o['id'])
            print(url_api_collection)
            queue_harvest('*****@*****.**', url_api_collection,
                          redis_host=env['redis_host'],
                          redis_port=env['redis_port'],
                          redis_pswd=env['redis_password'],
                          rq_queue='normal-production')
        if o['url_harvest']:
            c_harvest.append(o)
    # 'next' is appended to the registry host to form the next page URL.
    page_url = ''.join((url_reg, nextpage)) if nextpage else None
'''one time script to populate redis with harvested image object data'''
from harvester.config import config
from harvester.couchdb_init import get_couchdb
from harvester.couchdb_pager import couchdb_pager
from redis import Redis
import redis_collections

_config = config()

# Redis connection built from the harvester configuration.
_redis = Redis(
    host=_config['redis_host'],
    port=_config['redis_port'],
    password=_config['redis_password'],
    socket_connect_timeout=_config['redis_connect_timeout'],
)

# Redis-backed dict that receives one entry per cached document.
object_cache = redis_collections.Dict(
    key='ucldc:harvester:harvested-images', redis=_redis)

_couchdb = get_couchdb(url=_config['couchdb_url'], dbname='ucldc')
v = couchdb_pager(_couchdb, include_docs='true')
for r in v:
    doc = r.doc
    # Documents without an 'object' field are skipped.
    if 'object' not in doc:
        continue
    did = doc['_id']
    if 'object_dimensions' in doc:
        object_cache[did] = [doc['object'], doc['object_dimensions']]
        print("OBJECT CACHE : {} === {}".format(did, object_cache[did]))
    else:
        print("NO DIMS for {} -- not caching".format(did))
Exemplo n.º 19
0
from harvester.collection_registry_client import ResourceIterator
from harvester.collection_registry_client import url_base, api_path
from harvester.config import config
from harvester.scripts.queue_harvest import main as queue_harvest

for c in ResourceIterator(url_base, api_path + 'collection', 'collection'):
    if c.harvest_type != 'X':
        print c.name, c.slug, c.harvest_type, c.url_harvest
        env = config()
        queue_harvest('*****@*****.**',
                      url_base + c.resource_uri,
                      redis_host=env['redis_host'],
                      redis_port=env['redis_port'],
                      redis_pswd=env['redis_password'],
                      id_ec2_ingest=env['id_ec2_ingest'],
                      id_ec2_solr=env['id_ec2_solr_build'],
                      job_timeout=6000)
'''one time script to populate redis with harvested image object data'''
from harvester.config import config
from harvester.couchdb_init import get_couchdb
from harvester.couchdb_pager import couchdb_pager
from redis import Redis
import redis_collections

_config = config()

# Redis connection built from the harvester configuration.
_redis = Redis(
    host=_config['redis_host'],
    port=_config['redis_port'],
    password=_config['redis_password'],
    socket_connect_timeout=_config['redis_connect_timeout'],
)

# Redis-backed dict that receives one entry per cached document.
object_cache = redis_collections.Dict(
    key='ucldc:harvester:harvested-images', redis=_redis)


_couchdb = get_couchdb(url=_config['couchdb_url'], dbname='ucldc')
v = couchdb_pager(_couchdb, include_docs='true')
for r in v:
    doc = r.doc
    # Documents without an 'object' field are skipped.
    if 'object' not in doc:
        continue
    did = doc['_id']
    if 'object_dimensions' in doc:
        object_cache[did] = [doc['object'], doc['object_dimensions']]
        print("OBJECT CACHE : {} === {}".format(did, object_cache[did]))
    else:
        print("NO DIMS for {} -- not caching".format(did))