Example #1
 def by_collection(self, collection_key=None):
     '''If collection_key is None, try to grab all of the images. (Not
     recommended)
     '''
     if collection_key:
         v = couchdb_pager(
             self._couchdb,
             view_name=self._view,
             startkey='"{0}"'.format(collection_key),
             endkey='"{0}"'.format(collection_key),
             include_docs='true')
     else:
         # use _all_docs view
         v = couchdb_pager(self._couchdb, include_docs='true')
     doc_ids = []
     report_errors = defaultdict(list)
     for r in v:
         dt_start = dt_end = datetime.datetime.now()
         try:
             reports = self.harvest_image_for_doc(r.doc)
         except ImageHarvestError as e:
             report_errors[e.dict_key].append((e.doc_id, str(e)))
         doc_ids.append(r.doc['_id'])
         dt_end = datetime.datetime.now()
         time.sleep((dt_end - dt_start).total_seconds())
     report_list = [
         ' : '.join((key, str(val))) for key, val in report_errors.items()
     ]
     report_msg = '\n'.join(report_list)
     subject = format_results_subject(collection_key,
                                      'Image harvest to CouchDB {env}')
     publish_to_harvesting(subject, ''.join(
         ('Processed {} documents\n'.format(len(doc_ids)), report_msg)))
     return doc_ids, report_errors
Example #2
 def by_collection(self, collection_key=None):
     '''If collection_key is None, try to grab all of the images. (Not
     recommended)
     '''
     if collection_key:
         v = couchdb_pager(self._couchdb,
                           view_name=self._view,
                           startkey='"{0}"'.format(collection_key),
                           endkey='"{0}"'.format(collection_key),
                           include_docs='true')
     else:
         # use _all_docs view
         v = couchdb_pager(self._couchdb, include_docs='true')
     doc_ids = []
     report_errors = defaultdict(list)
     for r in v:
         dt_start = dt_end = datetime.datetime.now()
         try:
             reports = self.harvest_image_for_doc(r.doc)
         except ImageHarvestError as e:
             report_errors[e.dict_key].append((e.doc_id, str(e)))
         doc_ids.append(r.doc['_id'])
         dt_end = datetime.datetime.now()
         time.sleep((dt_end - dt_start).total_seconds())
     report_list = [
         ' : '.join((key, str(val))) for key, val in report_errors.items()
     ]
     report_msg = '\n'.join(report_list)
     subject = format_results_subject(collection_key,
                                      'Image harvest to CouchDB {env}')
     publish_to_harvesting(
         subject, ''.join(
             ('Processed {} documents\n'.format(len(doc_ids)), report_msg)))
     return doc_ids, report_errors
Example #3
 def by_collection(self, collection_key=None):
     """If collection_key is none, trying to grab all of the images. (Not
     recommended)
     """
     if collection_key:
         v = couchdb_pager(
             self._couchdb,
             view_name=self._view,
             startkey='"{0}"'.format(collection_key),
             endkey='"{0}"'.format(collection_key),
             include_docs="true",
         )
     else:
         # use _all_docs view
         v = couchdb_pager(self._couchdb, include_docs="true")
     doc_ids = []
     for r in v:
         dt_start = dt_end = datetime.datetime.now()
         reports = self.harvest_image_for_doc(r.doc)
         doc_ids.append(r.doc["_id"])
         dt_end = datetime.datetime.now()
         time.sleep((dt_end - dt_start).total_seconds())
     publish_to_harvesting(
         "Image harvested {}".format(collection_key), "Processed {} documents".format(len(doc_ids))
     )
     return doc_ids
Example #4
def delete_solr_item_by_id(item_id):
    url_solr = os.environ['URL_SOLR']
    body = 'stream.body=<delete><id>{}</id></delete>'.format(item_id)
    url_delete = '{}/update?{}&commit=true'.format(url_solr, body)
    response = requests.get(url_delete)
    response.raise_for_status()
    subject = format_results_subject(item_id,
                                     'Deleted document from Solr {env} ')
    publish_to_harvesting(subject, 'DELETED {}'.format(item_id))
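The stream.body trick above relies on Solr accepting update commands in the query string, which recent Solr versions disable by default. As a rough sketch only (the helper name is hypothetical and not part of this codebase), the same delete can be expressed through Solr's JSON update API:

import os
import requests

def delete_solr_item_by_id_json(item_id):
    # Hypothetical variant: send the delete as a JSON update body instead of
    # packing it into the query string.
    url_solr = os.environ['URL_SOLR']
    response = requests.post(
        '{}/update'.format(url_solr),
        json={'delete': {'id': item_id}},
        params={'commit': 'true'})
    response.raise_for_status()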
Example #5
def delete_collection(cid):
    print >> sys.stderr, "DELETING COLLECTION: {}".format(cid)
    _couchdb = get_couchdb()
    rows = CouchDBCollectionFilter(collection_key=cid, couchdb_obj=_couchdb)
    ids = [row['id'] for row in rows]
    num_deleted, deleted_docs = delete_id_list(ids, _couchdb=_couchdb)
    publish_to_harvesting(
        'Deleted CouchDB Collection {}'.format(cid),
        'Deleted {} documents from CouchDB collection {}'.format(num_deleted,
                                                                 cid))
    return num_deleted, deleted_docs
Example #6
def delete_solr_collection(collection_key):
    '''Delete a Solr collection for the environment'''
    url_solr = os.environ['URL_SOLR']
    COLLECTION_URL_FORMAT = 'https://registry.cdlib.org/api/v1/collection/{}/'
    collection_url = COLLECTION_URL_FORMAT.format(collection_key)
    query = 'stream.body=<delete><query>collection_url:\"{}\"</query>' \
            '</delete>&commit=true'.format(collection_url)
    url_delete = '{}/update?{}'.format(url_solr, query)
    response = requests.get(url_delete)
    response.raise_for_status()
    publish_to_harvesting('Deleted solr collection {}'.format(collection_key),
                          'DELETED {}'.format(collection_key))
Example #7
def delete_collection(cid):
    print >> sys.stderr, "DELETING COLLECTION: {}".format(cid)
    _couchdb = get_couchdb()
    rows = CouchDBCollectionFilter(collection_key=cid, couchdb_obj=_couchdb)
    ids = [row['id'] for row in rows]
    num_deleted, deleted_docs = delete_id_list(ids, _couchdb=_couchdb)
    subject = format_results_subject(cid,
                                     'Deleted documents from CouchDB {env} ')
    publish_to_harvesting(
        subject, 'Deleted {} documents from CouchDB collection CID: {}'.format(
            num_deleted, cid))
    return num_deleted, deleted_docs
Example #8
def main(url_remote_couchdb, url_api_collection):
    '''Update to the current environment's couchdb a remote couchdb collection
    '''
    collection = Collection(url_api_collection)
    total, updated, created = update_collection_from_remote(
        url_remote_couchdb, url_api_collection)
    msg = 'Synced {} documents to production for CouchDB collection {}'.format(
        total, collection.id)
    msg += '\nUpdated {} documents, created {} documents.'.format(
        updated, created)
    publish_to_harvesting('Synced CouchDB Collection {}'.format(collection.id),
                          msg)
Example #9
def delete_solr_collection(collection_key):
    '''Delete a Solr collection for the environment'''
    url_solr = os.environ['URL_SOLR']
    COLLECTION_URL_FORMAT = 'https://registry.cdlib.org/api/v1/collection/{}/'
    collection_url = COLLECTION_URL_FORMAT.format(collection_key)
    query = 'stream.body=<delete><query>collection_url:\"{}\"</query>' \
            '</delete>&commit=true'.format(collection_url)
    url_delete = '{}/update?{}'.format(url_solr, query)
    response = requests.get(url_delete)
    response.raise_for_status()
    subject = format_results_subject(collection_key,
                                     'Deleted documents from Solr {env} ')
    publish_to_harvesting(subject, 'DELETED {}'.format(collection_key))
Example #10
def update_couch_docs_by_collection(cid, fieldName, newValue):
    print >> sys.stderr, "UPDATING DOCS FOR COLLECTION: {}".format(cid)
    _couchdb = get_couchdb()
    rows = CouchDBCollectionFilter(collection_key=cid, couchdb_obj=_couchdb)
    ids = [row['id'] for row in rows]
    num_updated, updated_docs = update_by_id_list(
        ids, fieldName, newValue, _couchdb=_couchdb)
    subject = format_results_subject(cid,
                                     'Updated documents from CouchDB {env} ')
    publish_to_harvesting(
        subject, 'Updated {} documents from CouchDB collection CID: {}'.format(
            num_updated, cid))
    return num_updated, updated_docs
Example #11
def main(url_remote_couchdb, url_api_collection):
    '''Update to the current environment's couchdb a remote couchdb collection
    '''
    collection = Collection(url_api_collection)
    total, updated, created = update_collection_from_remote(
        url_remote_couchdb, url_api_collection)
    msg = 'Synced {} documents to production for CouchDB collection {}'.format(
        total,
        collection.id)
    msg += '\nUpdated {} documents, created {} documents.'.format(
        updated,
        created)
    publish_to_harvesting(
        'Synced CouchDB Collection {}'.format(collection.id),
        msg)
Example #12
def update_couch_docs_by_collection(cid, fieldName, newValue, substring):
    print >> sys.stderr, "UPDATING DOCS FOR COLLECTION: {}".format(cid)
    _couchdb = get_couchdb()
    rows = CouchDBCollectionFilter(collection_key=cid, couchdb_obj=_couchdb)
    ids = [row['id'] for row in rows]
    num_updated, updated_docs = update_by_id_list(ids,
                                                  fieldName,
                                                  newValue,
                                                  substring,
                                                  _couchdb=_couchdb)
    subject = format_results_subject(cid,
                                     'Updated documents from CouchDB {env} ')
    publish_to_harvesting(
        subject, 'Updated {} documents from CouchDB collection CID: {}'.format(
            num_updated, cid))
    return num_updated, updated_docs
Example #13
 def execute_job(self, job, queue):
     """Spawns a work horse to perform the actual work and passes it a job.
     The worker will wait for the work horse and make sure it executes
     within the given timeout bounds, or will end the work horse with
     SIGALRM.
     """
     worker_name = (self.key.rsplit(':', 1)[1]).rsplit('.', 1)[0]
     subject, msg = create_execute_job_message("Started", worker_name, job)
     logging.info(msg)
     publish_to_harvesting(subject, msg)
     self.set_state('busy')
     self.fork_work_horse(job, queue)
     self.monitor_work_horse(job)
     subject, msg = create_execute_job_message("Completed", worker_name,
                                               job)
     logging.info(msg)
     publish_to_harvesting(subject, msg)
     self.set_state('idle')
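The method above overrides rq's Worker.execute_job so that job start and completion are reported. A minimal sketch of wiring up such a worker (the subclass and queue names here are illustrative, not taken from this codebase):

from redis import Redis
from rq import Queue, Worker

class HarvestWorker(Worker):
    # Hypothetical subclass that would carry the execute_job override shown above.
    pass

redis_conn = Redis(host='localhost', port=6379)
queue = Queue('harvest', connection=redis_conn)
worker = HarvestWorker([queue], connection=redis_conn)
worker.work()  # blocks, processing jobs and announcing start/completion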
Example #14
def sync_couch_collection_to_solr(collection_key):
    # This works from inside an environment with default URLs for couch & solr
    URL_SOLR = os.environ.get('URL_SOLR', None)
    collection_key = str(collection_key)  # Couch needs string keys
    v = CouchDBCollectionFilter(
        couchdb_obj=get_couchdb(), collection_key=collection_key)
    solr_db = Solr(URL_SOLR)
    updated_docs = []
    num_added = 0
    report = defaultdict(int)
    for r in v:
        try:
            fill_in_title(r.doc)
            has_required_fields(r.doc)
        except KeyError as e:
            report[e.dict_key] += 1
            print(e.message, file=sys.stderr)
            continue
        except ValueError as e:
            report[e.dict_key] += 1
            print(e.message, file=sys.stderr)
            continue
        solr_doc = map_couch_to_solr_doc(r.doc)
        # TODO: here is where to check if existing and compare collection vals
        try:
            check_nuxeo_media(solr_doc)
        except ValueError as e:
            print(e.message, file=sys.stderr)
            report[e.dict_key] += 1
            continue
        updated_docs.append(solr_doc)
        num_added += push_doc_to_solr(solr_doc, solr_db=solr_db)
    solr_db.commit()
    publish_to_harvesting(
        'Synced collection {} to solr'.format(collection_key),
        harvesting_report(
            collection_key,
            updated_docs,
            num_added,
            report))
    return updated_docs, report
Example #15
def sync_couch_collection_to_solr(collection_key):
    # This works from inside an environment with default URLs for couch & solr
    delete_solr_collection(collection_key)
    URL_SOLR = os.environ.get('URL_SOLR', None)
    collection_key = str(collection_key)  # Couch needs string keys
    v = CouchDBCollectionFilter(couchdb_obj=get_couchdb(),
                                collection_key=collection_key)
    solr_db = Solr(URL_SOLR)
    updated_docs = []
    num_added = 0
    report = defaultdict(int)
    for r in v:
        try:
            fill_in_title(r.doc)
            has_required_fields(r.doc)
        except KeyError as e:
            report[e.dict_key] += 1
            print(e.message, file=sys.stderr)
            continue
        except ValueError as e:
            report[e.dict_key] += 1
            print(e.message, file=sys.stderr)
            continue
        solr_doc = map_couch_to_solr_doc(r.doc)
        # TODO: here is where to check if existing and compare collection vals
        try:
            check_nuxeo_media(solr_doc)
        except ValueError as e:
            print(e.message, file=sys.stderr)
            report[e.dict_key] += 1
            continue
        updated_docs.append(solr_doc)
        num_added += push_doc_to_solr(solr_doc, solr_db=solr_db)
    solr_db.commit()
    publish_to_harvesting(
        'Synced collection {} to solr'.format(collection_key),
        harvesting_report(collection_key, updated_docs, num_added, report))
    return updated_docs, report
Example #16
        couchdb_obj=get_couchdb(), collection_key=collection_key)
    solr_db = Solr(URL_SOLR)
    results = []
    for r in v:
        try:
            fill_in_title(r.doc)
            has_required_fields(r.doc)
        except KeyError, e:
            print(e.message)
            continue
        solr_doc = map_couch_to_solr_doc(r.doc)
        results.append(solr_doc)
        solr_doc = push_doc_to_solr(solr_doc, solr_db=solr_db)
    solr_db.commit()
    publish_to_harvesting(
        'Synced collection {} to solr'.format(collection_key),
        '{} documents updated'.format(len(results)))
    return results


def main(url_couchdb=None,
         dbname=None,
         url_solr=None,
         all_docs=False,
         since=None):
    '''Use the _changes feed with a "since" parameter to only catch new
    changes to docs. The _changes feed will only have the *last* event on
    a document and does not retain intermediate changes.
    Setting the "since" to 0 will result in getting a _changes record for
    each document, essentially dumping the db to solr
    '''
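The docstring above describes CouchDB's _changes feed. As a small illustration of the idea (the URL and database name are placeholders, and this is not the project's pager), a since-based read looks roughly like this:

import requests

def read_changes(url_couchdb, dbname, since=0):
    # Ask CouchDB for everything that changed after the given sequence number.
    # Each row holds only the *last* event for a document; include_docs=true
    # attaches the current revision of the document to the row.
    resp = requests.get(
        '{}/{}/_changes'.format(url_couchdb, dbname),
        params={'since': since, 'include_docs': 'true'})
    resp.raise_for_status()
    data = resp.json()
    # data['last_seq'] can be persisted and passed back as `since` next time;
    # since=0 replays a change for every document, effectively dumping the db.
    return data['results'], data['last_seq']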
Example #17
def main(user_email,
         url_api_collection,
         log_handler=None,
         mail_handler=None,
         dir_profile='profiles',
         profile_path=None,
         config_file=None,
         redis_host=None,
         redis_port=None,
         redis_pswd=None,
         redis_timeout=600,
         rq_queue=None,
         run_image_harvest=False,
         **kwargs):
    '''Runs a UCLDC ingest process for the given collection'''
    cleanup_work_dir()  # remove files from /tmp
    emails = [user_email]
    if EMAIL_SYS_ADMIN:
        emails.extend([u for u in EMAIL_SYS_ADMIN.split(',')])
    if not mail_handler:
        mail_handler = logbook.MailHandler(EMAIL_RETURN_ADDRESS,
                                           emails,
                                           level='ERROR',
                                           bubble=True)
    mail_handler.push_application()
    if not config_file:
        config_file = os.environ.get('DPLA_CONFIG_FILE', 'akara.ini')
    if not (redis_host and redis_port and redis_pswd):
        config = config_harvest(config_file=config_file)

    try:
        collection = Collection(url_api_collection)
    except Exception as e:
        msg = 'Exception in Collection {}, init {}'.format(
            url_api_collection, str(e))
        logbook.error(msg)
        raise e
    if not log_handler:
        log_handler = logbook.StderrHandler(level='DEBUG')

    log_handler.push_application()
    logger = logbook.Logger('run_ingest')
    ingest_doc_id, num_recs, dir_save, harvester = fetcher.main(
        emails,
        url_api_collection,
        log_handler=log_handler,
        mail_handler=mail_handler,
        **kwargs)
    if 'prod' in os.environ['DATA_BRANCH'].lower():
        if not collection.ready_for_publication:
            raise Exception(''.join(
                ('Collection {} is not ready for publication.',
                 ' Run on stage and QA first, then set',
                 ' ready_for_publication')).format(collection.id))
    logger.info("INGEST DOC ID:{0}".format(ingest_doc_id))
    logger.info('HARVESTED {0} RECORDS'.format(num_recs))
    logger.info('IN DIR:{0}'.format(dir_save))
    resp = enrich_records.main([None, ingest_doc_id])
    if not resp == 0:
        logger.error("Error enriching records {0}".format(resp))
        raise Exception('Failed during enrichment process: {0}'.format(resp))
    logger.info('Enriched records')

    resp = save_records.main([None, ingest_doc_id])
    if not resp >= 0:
        logger.error("Error saving records {0}".format(str(resp)))
        raise Exception("Error saving records {0}".format(str(resp)))
    num_saved = resp
    logger.info("SAVED RECS : {}".format(num_saved))

    resp = remove_deleted_records.main([None, ingest_doc_id])
    if not resp == 0:
        logger.error("Error deleting records {0}".format(resp))
        raise Exception("Error deleting records {0}".format(resp))

    resp = check_ingestion_counts.main([None, ingest_doc_id])
    if not resp == 0:
        logger.error("Error checking counts {0}".format(resp))
        raise Exception("Error checking counts {0}".format(resp))

    resp = dashboard_cleanup.main([None, ingest_doc_id])
    if not resp == 0:
        logger.error("Error cleaning up dashboard {0}".format(resp))
        raise Exception("Error cleaning up dashboard {0}".format(resp))
    subject = format_results_subject(collection.id,
                                     'Harvest to CouchDB {env} ')
    publish_to_harvesting(
        subject, 'Finished metadata harvest for CID: {}\n'
        'Fetched: {}\nSaved: {}'.format(collection.id, num_recs, num_saved))

    log_handler.pop_application()
    mail_handler.pop_application()
Example #18
    resp = remove_deleted_records.main([None, ingest_doc_id])
    if not resp == 0:
        logger.error("Error deleting records {0}".format(resp))
        raise Exception("Error deleting records {0}".format(resp))

    resp = check_ingestion_counts.main([None, ingest_doc_id])
    if not resp == 0:
        logger.error("Error checking counts {0}".format(resp))
        raise Exception("Error checking counts {0}".format(resp))

    resp = dashboard_cleanup.main([None, ingest_doc_id])
    if not resp == 0:
        logger.error("Error cleaning up dashboard {0}".format(resp))
        raise Exception("Error cleaning up dashboard {0}".format(resp))

    publish_to_harvesting('Harvesting completed for {}'.format(collection.id),
                          'Finished harvest for {}'.format(collection.id))
    # the image_harvest should be a separate job, with a long timeout
    if run_image_harvest:
        job = queue_image_harvest(
            config['redis_host'],
            config['redis_port'],
            config['redis_password'],
            config['redis_connect_timeout'],
            config['couchdb_url'],
            collection.id,
            rq_queue,
            object_auth=collection.auth)
        logger.info("Started job for image_harvest:{}".format(job.result))

    log_handler.pop_application()
    mail_handler.pop_application()
Example #19
def main(user_email,
         url_api_collection,
         log_handler=None,
         mail_handler=None,
         dir_profile='profiles',
         profile_path=None,
         config_file=None,
         redis_host=None,
         redis_port=None,
         redis_pswd=None,
         redis_timeout=600,
         rq_queue=None,
         run_image_harvest=False,
         **kwargs):
    '''Runs a UCLDC ingest process for the given collection'''
    cleanup_work_dir()  # remove files from /tmp
    emails = [user_email]
    if EMAIL_SYS_ADMIN:
        emails.extend([u for u in EMAIL_SYS_ADMIN.split(',')])
    if not mail_handler:
        mail_handler = logbook.MailHandler(
            EMAIL_RETURN_ADDRESS, emails, level='ERROR', bubble=True)
    mail_handler.push_application()
    if not config_file:
        config_file = os.environ.get('DPLA_CONFIG_FILE', 'akara.ini')
    if not (redis_host and redis_port and redis_pswd):
        config = config_harvest(config_file=config_file)

    try:
        collection = Collection(url_api_collection)
    except Exception as e:
        msg = 'Exception in Collection {}, init {}'.format(url_api_collection,
                                                           str(e))
        logbook.error(msg)
        raise e
    if not log_handler:
        log_handler = logbook.StderrHandler(level='DEBUG')

    log_handler.push_application()
    logger = logbook.Logger('run_ingest')
    ingest_doc_id, num_recs, dir_save, harvester = fetcher.main(
        emails,
        url_api_collection,
        log_handler=log_handler,
        mail_handler=mail_handler,
        **kwargs)
    if 'prod' in os.environ['DATA_BRANCH'].lower():
        if not collection.ready_for_publication:
            raise Exception(''.join(
                ('Collection {} is not ready for publication.',
                 ' Run on stage and QA first, then set',
                 ' ready_for_publication')).format(collection.id))
    logger.info("INGEST DOC ID:{0}".format(ingest_doc_id))
    logger.info('HARVESTED {0} RECORDS'.format(num_recs))
    logger.info('IN DIR:{0}'.format(dir_save))
    resp = enrich_records.main([None, ingest_doc_id])
    if not resp == 0:
        logger.error("Error enriching records {0}".format(resp))
        raise Exception('Failed during enrichment process: {0}'.format(resp))
    logger.info('Enriched records')

    resp = save_records.main([None, ingest_doc_id])
    if not resp >= 0:
        logger.error("Error saving records {0}".format(str(resp)))
        raise Exception("Error saving records {0}".format(str(resp)))
    num_saved = resp
    logger.info("SAVED RECS : {}".format(num_saved))

    resp = remove_deleted_records.main([None, ingest_doc_id])
    if not resp == 0:
        logger.error("Error deleting records {0}".format(resp))
        raise Exception("Error deleting records {0}".format(resp))

    resp = check_ingestion_counts.main([None, ingest_doc_id])
    if not resp == 0:
        logger.error("Error checking counts {0}".format(resp))
        raise Exception("Error checking counts {0}".format(resp))

    resp = dashboard_cleanup.main([None, ingest_doc_id])
    if not resp == 0:
        logger.error("Error cleaning up dashboard {0}".format(resp))
        raise Exception("Error cleaning up dashboard {0}".format(resp))
    subject = format_results_subject(collection.id,
                                     'Harvest to CouchDB {env} ')
    publish_to_harvesting(subject,
                          'Finished metadata harvest for CID: {}\n'
                          'Fetched: {}\nSaved: {}'.format(
                              collection.id,
                              num_recs,
                              num_saved))

    log_handler.pop_application()
    mail_handler.pop_application()
Example #20
def exception_to_sns(job, *exc_info):
    '''Make an exception handler to report exceptions to SNS msg queue'''
    subject = 'FAILED: job {}'.format(job.description)
    message = 'ERROR: job {} failed\n{}'.format(job.description, exc_info[1])
    logging.error(message)
    publish_to_harvesting(subject, message)
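An exception handler like the one above is typically attached when the worker is constructed, for example through rq's exception_handlers argument. All of these examples ultimately funnel their status text through publish_to_harvesting(subject, message); a minimal stand-in for running the snippets outside the real project, assuming the helper publishes to an AWS SNS topic (the environment variable name below is an assumption, not the project's actual configuration), could be:

import logging
import os

import boto3

def publish_to_harvesting(subject, message):
    # Sketch only: push the report to an SNS topic, or just log it if no
    # topic is configured. The env var name here is hypothetical.
    topic_arn = os.environ.get('HARVESTING_SNS_TOPIC_ARN')
    if not topic_arn:
        logging.info('%s -- %s', subject, message)
        return
    sns = boto3.client('sns')
    sns.publish(
        TopicArn=topic_arn,
        Subject=subject[:100],  # SNS limits subjects to 100 characters
        Message=message)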