示例#1
0
 def by_collection(self, collection_key=None):
     '''Harvest the image of every document the CouchDB pager yields.

     With a truthy collection_key the pager is run against self._view
     with startkey and endkey both set to the double-quoted key. With no
     key, no view name is passed, so every document is visited (not
     recommended).

     An ImageHarvestError raised for a document is recorded under the
     error's dict_key and processing continues; the document id is
     still added to the processed list. Any other exception propagates.

     Returns a (doc_ids, report_errors) tuple, where report_errors maps
     each dict_key to a list of (doc_id, message) tuples.
     '''
     pager_args = {'include_docs': 'true'}
     if collection_key:
         quoted_key = '"{0}"'.format(collection_key)
         pager_args['view_name'] = self._view
         pager_args['startkey'] = quoted_key
         pager_args['endkey'] = quoted_key
     # without a view_name the pager falls back to its default
     # (described as the _all_docs view by the original author)
     rows = couchdb_pager(self._couchdb, **pager_args)

     processed_ids = []
     errors_by_key = defaultdict(list)
     for row in rows:
         started = datetime.datetime.now()
         try:
             self.harvest_image_for_doc(row.doc)
         except ImageHarvestError as err:
             errors_by_key[err.dict_key].append((err.doc_id, str(err)))
         processed_ids.append(row.doc['_id'])
         # pause for as long as this document took to process
         elapsed = datetime.datetime.now() - started
         time.sleep(elapsed.total_seconds())

     # one "key : [errors]" line per error category
     error_lines = []
     for key, val in errors_by_key.items():
         error_lines.append(' : '.join((key, str(val))))
     subject = format_results_subject(collection_key,
                                      'Image harvest to CouchDB {env}')
     body = 'Processed {} documents\n'.format(len(processed_ids))
     body += '\n'.join(error_lines)
     publish_to_harvesting(subject, body)
     return processed_ids, errors_by_key
示例#2
0
 def by_collection(self, collection_key=None):
     """Harvest the image of every document the CouchDB pager yields.

     With a truthy collection_key the pager is run against self._view
     with startkey and endkey both set to the double-quoted key. With no
     key, no view name is passed, so every document is visited (not
     recommended).

     After each document the loop sleeps for as long as that document
     took to process. Exceptions from harvest_image_for_doc propagate.

     Returns the list of processed document ids.
     """
     if not collection_key:
         # no view name given (the _all_docs view, per the original author)
         rows = couchdb_pager(self._couchdb, include_docs="true")
     else:
         quoted_key = '"{0}"'.format(collection_key)
         rows = couchdb_pager(
             self._couchdb,
             view_name=self._view,
             startkey=quoted_key,
             endkey=quoted_key,
             include_docs="true",
         )

     harvested_ids = []
     for row in rows:
         began = datetime.datetime.now()
         self.harvest_image_for_doc(row.doc)
         harvested_ids.append(row.doc["_id"])
         took = datetime.datetime.now() - began
         time.sleep(took.total_seconds())

     subject = "Image harvested {}".format(collection_key)
     message = "Processed {} documents".format(len(harvested_ids))
     publish_to_harvesting(subject, message)
     return harvested_ids
示例#3
0
def main(collection_key=None, url_couchdb=SERVER_COUCHDB):
    '''Stash the image of each CouchDB document and record where it went.

    Documents come from COUCH_VIEW in the DB_COUCHDB database on the
    server at url_couchdb. When collection_key is truthy it is passed to
    the pager as the view key; otherwise the whole view is walked (not
    recommended).

    For each document:
      * it is skipped when its 'object' value already contains 's3://';
      * a missing 'isShownBy' is filled in with get_isShownBy(doc); if
        that raises, an error is printed and the document is skipped;
      * doc['isShownBy']['src'] is handed to md5s3stash, the returned
        report's s3_url is stored in doc['object'] and the document is
        saved back to CouchDB.
    '''
    s = couchdb.Server(url=url_couchdb)
    db = s[DB_COUCHDB]
    pager_kwargs = {'view_name': COUCH_VIEW, 'include_docs': 'true'}
    if collection_key:
        pager_kwargs['key'] = collection_key
    v = couchdb_pager(db, **pager_kwargs)
    for r in v:
        doc = r.doc
        # NOTE(review): msg and the dt_* timestamps below are built but
        # never consumed in this function as shown -- presumably meant
        # for logging/throttling; confirm before removing.
        msg = doc['_id']
        if 's3://' in doc.get('object', ''):  # already downloaded
            msg = ' '.join((msg, 'already fetched image'))
            continue
        # Only compute isShownBy when the document lacks it; passing the
        # call as a dict.get() default would run it for every document.
        if 'isShownBy' not in doc:
            try:
                doc['isShownBy'] = get_isShownBy(doc)
            except Exception as e:
                print("ERROR: Can't get isShownBy for {} : {}".format(
                    doc['_id'], e))
                continue  # next doc
        try:
            url_image = doc['isShownBy']['src']
            dt_start = dt_end = datetime.datetime.now()
            report = md5s3stash(url_image, bucket_base=BUCKET_BASE)
            dt_end = datetime.datetime.now()
            doc['object'] = report.s3_url
            db.save(doc)
            msg = ' '.join((msg, doc['object']))
        except KeyError:
            msg = ' '.join((msg, "ERROR: No isShownBy field"))
示例#4
0
 def by_collection(self, collection_key=None):
     '''Run the image harvest over one collection, or over everything.

     A truthy collection_key restricts the pager to self._view with
     startkey and endkey both equal to the double-quoted key. A falsy
     key calls the pager without a view name, which visits every
     document (not recommended).

     ImageHarvestError is caught per document and filed under the
     error's dict_key as a (doc_id, message) tuple; the document id is
     counted as processed either way. Other exceptions propagate.

     Returns (doc_ids, report_errors).
     '''
     if collection_key:
         bound = '"{0}"'.format(collection_key)
         pager = couchdb_pager(self._couchdb,
                               view_name=self._view,
                               startkey=bound,
                               endkey=bound,
                               include_docs='true')
     else:
         # no view name given (the _all_docs view, per the original author)
         pager = couchdb_pager(self._couchdb, include_docs='true')

     seen_ids = []
     failures = defaultdict(list)
     for item in pager:
         t0 = datetime.datetime.now()
         try:
             self.harvest_image_for_doc(item.doc)
         except ImageHarvestError as exc:
             failures[exc.dict_key].append((exc.doc_id, str(exc)))
         seen_ids.append(item.doc['_id'])
         t1 = datetime.datetime.now()
         # wait as long as the document took before starting the next
         time.sleep((t1 - t0).total_seconds())

     failure_report = '\n'.join(
         ' : '.join((key, str(val))) for key, val in failures.items())
     subject = format_results_subject(collection_key,
                                      'Image harvest to CouchDB {env}')
     count_line = 'Processed {} documents\n'.format(len(seen_ids))
     publish_to_harvesting(subject, count_line + failure_report)
     return seen_ids, failures
示例#5
0
def main(collection_key=None, url_couchdb=SERVER_COUCHDB):
    '''Stash the image of each CouchDB document and record where it went.

    Documents come from COUCH_VIEW in the DB_COUCHDB database on the
    server at url_couchdb. When collection_key is truthy it is passed to
    the pager as the view key; otherwise the whole view is walked (not
    recommended).

    For each document:
      * it is skipped when its 'object' value already contains 's3://';
      * a missing 'isShownBy' is filled in with get_isShownBy(doc); if
        that raises, an error is printed and the document is skipped;
      * doc['isShownBy']['src'] is handed to md5s3stash, the returned
        report's s3_url is stored in doc['object'] and the document is
        saved back to CouchDB.
    '''
    s = couchdb.Server(url=url_couchdb)
    db = s[DB_COUCHDB]
    pager_kwargs = {'view_name': COUCH_VIEW, 'include_docs': 'true'}
    if collection_key:
        pager_kwargs['key'] = collection_key
    v = couchdb_pager(db, **pager_kwargs)
    for r in v:
        doc = r.doc
        # NOTE(review): msg and the dt_* timestamps below are built but
        # never consumed in this function as shown -- presumably meant
        # for logging/throttling; confirm before removing.
        msg = doc['_id']
        if 's3://' in doc.get('object', ''):  # already downloaded
            msg = ' '.join((msg, 'already fetched image'))
            continue
        # Only compute isShownBy when the document lacks it; passing the
        # call as a dict.get() default would run it for every document.
        if 'isShownBy' not in doc:
            try:
                doc['isShownBy'] = get_isShownBy(doc)
            except Exception as e:
                print("ERROR: Can't get isShownBy for {} : {}".format(doc['_id'], e))
                continue  # next doc
        try:
            url_image = doc['isShownBy']['src']
            dt_start = dt_end = datetime.datetime.now()
            report = md5s3stash(url_image, bucket_base=BUCKET_BASE)
            dt_end = datetime.datetime.now()
            doc['object'] = report.s3_url
            db.save(doc)
            msg = ' '.join((msg, doc['object']))
        except KeyError:
            msg = ' '.join((msg, "ERROR: No isShownBy field"))
示例#6
0
def main(url_solr=URL_SOLR, url_couchdb=None, couchdb_db=None):
    '''Normalize every CouchDB document, save it, and push it to Solr.

    Walks all documents returned by the pager, skipping any whose _id
    contains '_design'. For each remaining document:
      * a non-list 'collection' under 'originalRecord' or
        'sourceResource' is wrapped in a one-element list;
      * 'sourceResource.subject', when present, is turned into a list
        of dicts, wrapping each non-dict entry as {'name': entry};
      * the document is saved back to CouchDB and then mapped and
        pushed to Solr.
    Solr is committed once after the loop.
    '''
    solr_db = solr.Solr(url_solr)
    db = get_couchdb(url=url_couchdb, dbname=couchdb_db)
    v = couchdb_pager(db, include_docs='true')
    # (section holding a 'collection' value, message printed when wrapped)
    collection_sections = (
        ('originalRecord', "orgRec.Collection: {}"),
        ('sourceResource', "srcRes.Collection: {}"),
    )
    # update or create new solr doc for each couchdb doc
    for r in v:
        doc_couch = r.doc
        if '_design' in doc_couch['_id']:
            continue
        for section, message in collection_sections:
            try:
                collection = doc_couch[section]['collection']
            except KeyError:
                continue  # nothing to normalize in this section
            if not isinstance(collection, list):
                doc_couch[section]['collection'] = [collection]
                # report the value that was just wrapped, so the label
                # matches the field being printed
                print(message.format(doc_couch[section]['collection']))
        try:
            subject = doc_couch['sourceResource'].get('subject')
        except KeyError:
            subject = None
        # A missing subject is left alone instead of being stored as
        # [{'name': None}].
        if subject is not None:
            if not isinstance(subject, list):
                subject = [subject]
            doc_couch['sourceResource']['subject'] = [
                sub if isinstance(sub, dict) else {'name': sub}
                for sub in subject
            ]
        db.save(doc_couch)
        try:
            push_doc_to_solr(map_couch_to_solr_doc(doc_couch),
                             solr_db=solr_db)
        except TypeError as e:
            # still best-effort, but no longer silent
            print("ERROR: could not push {} to solr: {}".format(
                doc_couch['_id'], e))
        else:
            print("PUSHED {} to solr".format(doc_couch['_id']))
    solr_db.commit()
示例#7
0
def main(url_solr=URL_SOLR, url_couchdb=None, couchdb_db=None):
    '''Normalize every CouchDB document, save it, and push it to Solr.

    Walks all documents returned by the pager, skipping any whose _id
    contains '_design'. For each remaining document:
      * a non-list 'collection' under 'originalRecord' or
        'sourceResource' is wrapped in a one-element list;
      * 'sourceResource.subject', when present, is turned into a list
        of dicts, wrapping each non-dict entry as {'name': entry};
      * the document is saved back to CouchDB and then mapped and
        pushed to Solr.
    Solr is committed once after the loop.
    '''
    solr_db = solr.Solr(url_solr)
    db = get_couchdb(url=url_couchdb, dbname=couchdb_db)
    v = couchdb_pager(db, include_docs='true')
    # (section holding a 'collection' value, message printed when wrapped)
    collection_sections = (
        ('originalRecord', "orgRec.Collection: {}"),
        ('sourceResource', "srcRes.Collection: {}"),
    )
    # update or create new solr doc for each couchdb doc
    for r in v:
        doc_couch = r.doc
        if '_design' in doc_couch['_id']:
            continue
        for section, message in collection_sections:
            try:
                collection = doc_couch[section]['collection']
            except KeyError:
                continue  # nothing to normalize in this section
            if not isinstance(collection, list):
                doc_couch[section]['collection'] = [collection]
                # report the value that was just wrapped, so the label
                # matches the field being printed
                print(message.format(doc_couch[section]['collection']))
        try:
            subject = doc_couch['sourceResource'].get('subject')
        except KeyError:
            subject = None
        # A missing subject is left alone instead of being stored as
        # [{'name': None}].
        if subject is not None:
            if not isinstance(subject, list):
                subject = [subject]
            doc_couch['sourceResource']['subject'] = [
                sub if isinstance(sub, dict) else {'name': sub}
                for sub in subject
            ]
        db.save(doc_couch)
        try:
            push_doc_to_solr(map_couch_to_solr_doc(doc_couch),
                             solr_db=solr_db)
        except TypeError as e:
            # still best-effort, but no longer silent
            print("ERROR: could not push {} to solr: {}".format(
                doc_couch['_id'], e))
        else:
            print("PUSHED {} to solr".format(doc_couch['_id']))
    solr_db.commit()
示例#8
0
 def __init__(self,
              collection_key=None,
              couchdb_obj=None,
              url_couchdb=None,
              couchdb_name=None,
              couch_view=COUCHDB_VIEW,
              include_docs=True):
     '''Bind a CouchDB handle and open a pager over couch_view.

     couchdb_obj is used as the database handle when given; otherwise
     one is obtained from get_couchdb, which needs both url_couchdb and
     couchdb_name.

     A falsy collection_key is replaced with the string '{}' before it
     is passed to the pager as the view key. include_docs is sent to the
     pager as the string 'true' or 'false'.

     Raises ValueError when no couchdb_obj is given and url_couchdb or
     couchdb_name is missing.
     '''
     if couchdb_obj is not None:
         self._couchdb = couchdb_obj
     else:
         if not (url_couchdb and couchdb_name):
             raise ValueError('Need url and name to couch database')
         self._couchdb = get_couchdb(url=url_couchdb, dbname=couchdb_name)
     self._view = couch_view
     docs_flag = 'true' if include_docs else 'false'
     view_key = collection_key if collection_key else '{}'
     self._view_iter = couchdb_pager(
         self._couchdb,
         self._view,
         key=view_key,
         include_docs=docs_flag)
示例#9
0
 def __init__(self,
              collection_key=None,
              couchdb_obj=None,
              url_couchdb=None,
              couchdb_name=None,
              couch_view=COUCHDB_VIEW,
              include_docs=True
              ):
     '''Set up the CouchDB handle and the pager iterator for couch_view.

     When couchdb_obj is None the handle is created with get_couchdb
     from url_couchdb and couchdb_name, both of which are then required;
     otherwise couchdb_obj itself is stored.

     The pager is keyed on collection_key, or on the string '{}' when
     collection_key is falsy, and receives include_docs as the string
     'true' or 'false'.

     Raises ValueError when a handle must be created but url_couchdb or
     couchdb_name is missing.
     '''
     key_for_view = collection_key or '{}'
     if couchdb_obj is None:
         have_location = url_couchdb and couchdb_name
         if not have_location:
             raise ValueError('Need url and name to couch database')
         handle = get_couchdb(url=url_couchdb, dbname=couchdb_name)
     else:
         handle = couchdb_obj
     self._couchdb = handle
     self._view = couch_view
     if include_docs:
         docs_param = 'true'
     else:
         docs_param = 'false'
     self._view_iter = couchdb_pager(
             self._couchdb, self._view,
             key=key_for_view,
             include_docs=docs_param)
'''one time script to populate redis with harvested image object data'''
from harvester.config import config
from harvester.couchdb_init import get_couchdb
from harvester.couchdb_pager import couchdb_pager
from redis import Redis
import redis_collections

# Connection settings are read from the harvester configuration.
_config = config()

_redis = Redis(host=_config['redis_host'],
               port=_config['redis_port'],
               password=_config['redis_password'],
               socket_connect_timeout=_config['redis_connect_timeout'])

# Redis-backed dict: document id -> [object, object_dimensions]
object_cache = redis_collections.Dict(key='ucldc:harvester:harvested-images',
                                      redis=_redis)

_couchdb = get_couchdb(url=_config['couchdb_url'], dbname='ucldc')
v = couchdb_pager(_couchdb, include_docs='true')
for r in v:
    doc = r.doc
    if 'object' not in doc:
        continue  # nothing harvested for this document
    did = doc['_id']
    # print(...) with a single argument behaves the same on Python 2 and
    # is valid on Python 3, unlike the print statement.
    if 'object_dimensions' not in doc:
        print("NO DIMS for {} -- not caching".format(did))
    else:
        object_cache[did] = [doc['object'], doc['object_dimensions']]
        # read the entry back from the cache so the output shows what
        # was actually stored
        print("OBJECT CACHE : {} === {}".format(did, object_cache[did]))
'''one time script to populate redis with harvested image object data'''
from harvester.config import config
from harvester.couchdb_init import get_couchdb
from harvester.couchdb_pager import couchdb_pager
from redis import Redis
import redis_collections

# Connection settings are read from the harvester configuration.
_config = config()

_redis = Redis(host=_config['redis_host'],
               port=_config['redis_port'],
               password=_config['redis_password'],
               socket_connect_timeout=_config['redis_connect_timeout'])

# Redis-backed dict: document id -> [object, object_dimensions]
object_cache = redis_collections.Dict(key='ucldc:harvester:harvested-images',
                                      redis=_redis)


_couchdb = get_couchdb(url=_config['couchdb_url'], dbname='ucldc')
v = couchdb_pager(_couchdb, include_docs='true')
for r in v:
    doc = r.doc
    if 'object' not in doc:
        continue  # nothing harvested for this document
    did = doc['_id']
    # print(...) with a single argument behaves the same on Python 2 and
    # is valid on Python 3, unlike the print statement.
    if 'object_dimensions' not in doc:
        print("NO DIMS for {} -- not caching".format(did))
    else:
        object_cache[did] = [doc['object'], doc['object_dimensions']]
        # read the entry back from the cache so the output shows what
        # was actually stored
        print("OBJECT CACHE : {} === {}".format(did, object_cache[did]))