Example #1
def update_bulk_download_document(provider, file_path, file_size):
    c = Couch()
    bulk_download_doc_id = c.update_bulk_download_document(
                            provider, file_path, file_size
                            )
    print "Updated bulk_download database document with ID %s" % \
          bulk_download_doc_id
Example #2
def update_bulk_download_document(provider, file_path, file_size):
    c = Couch()
    bulk_download_doc_id = c.update_bulk_download_document(
                            provider, file_path, file_size
                            )
    print "Updated bulk_download database document with ID %s" % \
          bulk_download_doc_id
Example #3
def main(argv):
    parser = define_arguments()
    args = parser.parse_args(argv[1:])

    config_file = ("akara.ini")
    config = ConfigParser.ConfigParser()
    config.readfp(open(config_file))
    uri_base = "http://localhost:" + config.get("Akara", "Port")

    with open(args.profile_path, "r") as f:
        try:
            profile = json.load(f)
        except:
            print "Error, could not load profile in %s" % __name__
            return None
    provider = profile["name"]

    couch = Couch()
    latest_ingestion_doc = couch._get_last_ingestion_doc_for(provider)
    if latest_ingestion_doc and \
       getprop(latest_ingestion_doc, "delete_process/status") != "complete":
        error_msg = "Error, last ingestion did not complete. Review " + \
                    "dashboard document %s for errors." % \
                    latest_ingestion_doc["_id"]
        logger.error(error_msg)
        print error_msg
        return None

    ingestion_document_id = couch._create_ingestion_document(provider,
                                                             uri_base,
                                                             args.profile_path)
    logger.debug("Ingestion document %s created." % ingestion_document_id)

    return ingestion_document_id
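The define_arguments() helper called at the top of these main() functions is never shown in the snippets. A minimal argparse-based sketch, assuming the script only takes the profile path as a positional argument (the argument name and help text here are guesses, not the project's actual definition):

import argparse

def define_arguments():
    """Build the command-line parser used by main() (hypothetical sketch)."""
    parser = argparse.ArgumentParser()
    parser.add_argument("profile_path",
                        help="Path to the provider profile (.pjs) file")
    return parser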
Example #4
def profile_names_for_contributor(contributor):
    """Return a list of profile names that use the given contributor name"""
    profiles = {}
    couch = Couch()
    view = "export_database/profile_and_source_names"
    for row in couch.dpla_view(view, group=True):
        k = row["key"]
        profiles.setdefault(k[0], []).append(k[1])
    return profiles.get(contributor, [])
Example #5
def profile_names_for_contributor(contributor):
    """Return a list of profile names that use the given contributor name"""
    profiles = {}
    couch = Couch()
    view = "export_database/profile_and_source_names"
    for row in couch.dpla_view(view, group=True):
        k = row["key"]
        profiles.setdefault(k[0], []).append(k[1])
    return profiles.get(contributor, [])
Example #6
def item_docs(provider_name=None):
    """Yield all item documents for the given provider, else all providers"""
    couch = Couch()
    if provider_name:
        docs = couch._query_all_dpla_provider_docs(provider_name)
    else:
        docs = couch.all_dpla_docs()
    for doc in docs:
        if doc.get("ingestType") == "item":
            yield doc
Example #7
def item_docs(provider_name=None):
    """Yield all item documents for the given provider, else all providers"""
    couch = Couch()
    if provider_name:
        docs = couch._query_all_dpla_provider_docs(provider_name)
    else:
        docs = couch.all_dpla_docs()
    for doc in docs:
        if doc.get("ingestType") == "item":
            yield doc
Example #8
def main(argv):
    parser = define_arguments()
    args = parser.parse_args(argv[1:])
    couch = Couch()
    database_names = ["dpla", "dashboard", "bulk_download"]
    if args.database_name in database_names:
        couch.sync_views(args.database_name)
    else:
        print >> sys.stderr, "The database_name parameter should be " + \
                             "either %s" % " or ".join(database_names)
Example #9
def main(argv):
    parser = define_arguments()
    args = parser.parse_args(argv[1:])

    couch = Couch()
    if args.database_name in ["dpla", "dashboard"]:
        couch._sync_views(args.database_name)
    else:
        print >> sys.stderr, "The database_name parameter should be " + \
                             "either \"dpla\" or \"dashboard\""
Example #10
def main(argv):
    parser = define_arguments()
    args = parser.parse_args(argv[1:])
    couch = Couch(dpla_db_name=args.database_name,
                  dashboard_db_name='dashboard')
    database_names = ["dpla", "dashboard", "bulk_download", "ucldc"]
    if args.database_name in database_names:
        print "couch.sync_views(" + args.database_name + ") next!"
        couch.sync_views(args.database_name)
    else:
        print >> sys.stderr, "The database_name parameter should be " + \
                             "either \"dpla\" or \"dashboard\" or \"ucldc\"  \
Example #11
def main(argv):
    parser = define_arguments()
    args = parser.parse_args(argv[1:])

    couch = Couch()
    ingestion_doc = couch.dashboard_db[args.ingestion_document_id]
    if getprop(ingestion_doc, "delete_process/status") != "complete":
        print "Error, delete process did not complete"
        return -1

    # Update ingestion document
    kwargs = {
        "check_counts_process/status": "running",
        "check_counts_process/start_time": iso_utc_with_tz()
    }
    try:
        couch.update_ingestion_doc(ingestion_doc, **kwargs)
    except:
        print "Error updating ingestion document " + ingestion_doc["_id"]
        return -1

    # Check each count against the threshold
    alerts = []
    count_type = ("Added", "Changed", "Deleted")
    for ctype in count_type:
        count = int(ingestion_doc["count" + ctype])
        threshold = int(ingestion_doc["thresholds"][ctype.lower()])
        if count > threshold:
            alerts.append("%s items %s exceeds threshold of %s" %
                          (count, ctype.lower(), threshold))

    error_msg = None
    if alerts:
        config_file = "akara.ini"
        config = ConfigParser.ConfigParser()
        config.readfp(open(config_file))
        to = [s.strip() for s in config.get("Alert", "To").split(",")]
        frm = config.get("Alert", "From")

        month = dateparser.parse(ingestion_doc["ingestDate"]).strftime("%B")
        alerts = "\n".join(alerts)
        msg = MIMEText(alerts)
        msg["Subject"] = "Threshold(s) exceeded for %s ingestion of %s" % \
                         (month, ingestion_doc["provider"])
        msg["To"] = ", ".join(to)
        msg["From"] = frm

        try:
            s = smtplib.SMTP("localhost")
            s.sendmail(frm, to, msg.as_string())
            s.quit()
        except Exception, e:
            error_msg = e
Example #12
def main(argv):
    parser = define_arguments()
    args = parser.parse_args(argv[1:])

    couch = Couch()
    ingestion_doc = couch.dashboard_db[args.ingestion_document_id]
    if getprop(ingestion_doc, "delete_process/status") != "complete":
        print "Error, delete process did not complete"
        return -1

    # Update ingestion document
    kwargs = {
        "check_counts_process/status": "running",
        "check_counts_process/start_time": iso_utc_with_tz()
    }
    try:
        couch.update_ingestion_doc(ingestion_doc, **kwargs)
    except:
        print "Error updating ingestion document " + ingestion_doc["_id"]
        return -1

    # Check each count against the threshold
    alerts = []
    count_type = ("Added", "Changed", "Deleted")
    for ctype in count_type:
        count = int(ingestion_doc["count" + ctype])
        threshold = int(ingestion_doc["thresholds"][ctype.lower()])
        if count > threshold:
            alerts.append("%s items %s exceeds threshold of %s" %
                          (count, ctype.lower(), threshold))

    error_msg = None
    if alerts:
        config_file = "akara.ini"
        config = ConfigParser.ConfigParser()                                    
        config.readfp(open(config_file))
        to = [s.strip() for s in config.get("Alert", "To").split(",")]
        frm = config.get("Alert", "From")

        month = dateparser.parse(ingestion_doc["ingestDate"]).strftime("%B")
        alerts = "\n".join(alerts)
        msg = MIMEText(alerts)
        msg["Subject"] = "Threshold(s) exceeded for %s ingestion of %s" % \
                         (month, ingestion_doc["provider"])
        msg["To"] = ", ".join(to)
        msg["From"] = frm

        try:
            s = smtplib.SMTP("localhost")
            s.sendmail(frm, to, msg.as_string())
            s.quit()
        except Exception, e:
            error_msg = e
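The two alert-email examples above use several names that are imported elsewhere in the original modules. Assuming the conventional source for each name, the imports would be roughly:

import smtplib
import ConfigParser
from email.mime.text import MIMEText
# Assumption: dateparser is dateutil's parser module, based on the
# dateparser.parse(...).strftime("%B") call above.
from dateutil import parser as dateparser

getprop() and iso_utc_with_tz() are project-level helpers; sketches of both appear later in this listing.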
Example #13
def main(argv):
    couch = Couch()
    parser = define_arguments()
    args = parser.parse_args(argv[1:])

    with open(args.profile_path, "r") as f:
        profile = json.load(f)

    provider = profile.get("name")
    if confirm_deletion(provider):
        couch._delete_all_provider_documents(provider)
    else:
        return False
Example #14
def main(argv):
    couch = Couch()
    parser = define_arguments()
    args = parser.parse_args(argv[1:])

    with open(args.profile_path, "r") as f:
        profile = json.load(f)

    provider = profile.get("name")
    if confirm_deletion(provider):
        couch._delete_all_provider_documents(provider)
    else:
        return False
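confirm_deletion() is not included in the deletion examples. A plausible sketch, assuming it simply asks the operator for confirmation before anything is removed (the prompt wording is invented):

def confirm_deletion(provider):
    """Ask the operator to confirm deletion of all documents for a provider."""
    prompt = "Delete all documents for provider %s? (y/N): " % provider
    answer = raw_input(prompt)
    return answer.strip().lower() in ("y", "yes")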
Example #15
def download_each_source_data(arguments):
    """Gets a list of all sources in the Couch database and downloads each
       source's data.

       Arguments:
           arguments - dictionary returned by the validate_arguments function

    """
    couch = Couch()
    rows = couch.dpla_view("export_database/all_source_names", group=True)
    for row in rows:
        arguments["source"] = row["key"]
        print arguments["source"]
        download_data(arguments)
Example #16
def download_each_source_data(arguments):
    """Gets a list of all sources in the Couch database and downloads each
       source's data.

       Arguments:
           arguments - dictionary returned by the validate_arguments function

    """
    couch = Couch()
    rows = couch.dpla_view("export_database/all_source_names", group=True)
    for row in rows:
        arguments["source"] = row["key"]
        print arguments["source"]
        download_data(arguments)
Example #17
def get_enrich_dir(ingestion_document_id):
    couch = Couch()
    ingestion_doc = couch.dashboard_db[ingestion_document_id]

    if getprop(ingestion_doc, "enrich_process/status") != "complete":
        raise AssertionError(
            "Cannot save Avro files, enrich process did not complete")

    return getprop(ingestion_doc, "enrich_process/data_dir")
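getprop() is used throughout these examples to read nested dictionary fields via slash-delimited paths such as "enrich_process/status". A minimal sketch consistent with that usage, including the keyErrorAsNone flag seen in Example #21 (this is an assumption about the real helper, which lives elsewhere in the project):

def getprop(obj, path, keyErrorAsNone=False):
    """Return the value at a slash-delimited path inside nested dicts (sketch)."""
    current = obj
    for key in path.split("/"):
        try:
            current = current[key]
        except (KeyError, TypeError):
            if keyErrorAsNone:
                return None
            raise
    return current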
Example #18
def main():
    couch = Couch()

    new_fields = {
        "fetch_process": {"status": "complete"},
        "enrich_process": {"status": "complete"},
        "save_process": {"status": "complete"},
        "delete_process": {"status": "complete"},
    }
    for profile in os.listdir("profiles/"):
        if profile.endswith(".pjs"):
            with open("profiles/" + profile, "r") as f:
                p = json.loads(f.read())

            provider = p["name"]

            for doc in couch._query_all_provider_ingestion_docs(provider):
                doc.update(new_fields)
                couch.dashboard_db.update([doc])
Example #19
def main(argv):
    parser = define_arguments()
    args = parser.parse_args(argv[1:])

    couch = Couch()
    i_doc = couch.dashboard_db[args.ingestion_document_id]
    if i_doc['delete_process']['status'] != 'complete':
        print >> sys.stderr, 'Error: delete process did not complete'
        return 1

    # Update ingestion document to indicate that we're running
    kwargs = {
        'check_counts_process/status': 'running',
        'check_counts_process/start_time': iso_utc_with_tz()
    }
    try:
        couch.update_ingestion_doc(i_doc, **kwargs)
    except:
        tb = traceback.format_exc(5)
        print "Error updating ingestion document %s\n%s" % (i_doc["_id"], tb)
        return 1

    error_msg = None
    try:
        config = ConfigParser.ConfigParser()
        config.readfp(open('akara.ini'))
        to = [s.strip() for s in config.get('Alert', 'To').split(',')]
        frm = config.get('Alert', 'From')
        body = "%s\n\n%s" % (alerts(i_doc), statistics(i_doc))
        msg = MIMEText(body)
        msg['Subject'] = "%s ingest #%s" % (i_doc['provider'],
                                            i_doc['ingestionSequence'])
        msg['To'] = ', '.join(to)
        msg['From'] = frm
        s = smtplib.SMTP("localhost")
        s.sendmail(frm, to, msg.as_string())
        s.quit()
    except Exception, e:
        error_msg = e
        tb = traceback.format_exc(5)
        print >> sys.stderr, "Error sending alert email: %s\n%s" % (error_msg,
                                                                    tb)
Example #20
def main(argv):
    parser = define_arguments()
    args = parser.parse_args(argv[1:])

    config_file = os.environ.get("DPLA_CONFIG_FILE", "akara.ini")
    config = ConfigParser.ConfigParser()
    config.readfp(open(config_file))
    uri_base = "http://localhost:" + config.get("Akara", "Port")

    with open(args.profile_path, "r") as f:
        try:
            profile = json.load(f)
        except:
            print "Error, could not load profile in %s" % __name__
            return None
    provider = profile["name"]
    thresholds = profile["thresholds"]
    fetcher_threads = profile.get("fetcher_threads") or 1

    couch = Couch()
    latest_ingestion_doc = couch._get_last_ingestion_doc_for(provider)
    if (latest_ingestion_doc and
        getprop(latest_ingestion_doc,
                "dashboard_cleanup_process/status") != "complete"):
        error_msg = "Error, last ingestion did not complete. Review " + \
                    "dashboard document %s for errors." % \
                    latest_ingestion_doc["_id"]
        logger.error(error_msg)
        print error_msg
        return None

    ingestion_document_id = couch._create_ingestion_document(provider,
                                                             uri_base,
                                                             args.profile_path,
                                                             thresholds,
                                                             fetcher_threads)
    msg = "Ingestion document %s created." % ingestion_document_id
    logger.debug(msg)
    print msg

    return ingestion_document_id
Example #21
def nara_update_links():
    couch = Couch()
    url = "http://research.archives.gov/description/"
    docs = []
    print >> sys.stderr, "Fetching all documents"
    count = 0
    start = time.time()
    for doc in couch._query_all_dpla_provider_docs("nara"):
        if count == 0:
            view_time = time.time() - start
            start = time.time()
        count += 1
        arc_id_desc = getprop(doc, "originalRecord/arc-id-desc",
                              keyErrorAsNone=True)
        if arc_id_desc:
            doc.update({"isShownAt": url + arc_id_desc})
            docs.append(doc)

        # POST every 1000 documents
        if len(docs) == 1000:
            print >> sys.stderr, "Processed %s documents" % count
            couch._bulk_post_to(couch.dpla_db, docs)
            docs = []

    # Last POST
    if docs:
        print >> sys.stderr, "Processed %s documents" % count
        couch._bulk_post_to(couch.dpla_db, docs)

    process_time = time.time() - start
    print >> sys.stderr, "Done"
    print >> sys.stderr, "View time: %s" % view_time
    print >> sys.stderr, "Process time: %s" % process_time
Example #22
def main(argv):
    parser = define_arguments()
    args = parser.parse_args(argv[1:])

    couch = Couch()
    i_doc = couch.dashboard_db[args.ingestion_document_id]
    if i_doc['delete_process']['status'] != 'complete':
        print >> sys.stderr, 'Error: delete process did not complete'
        return 1

    # Update ingestion document to indicate that we're running
    kwargs = {'check_counts_process/status': 'running',
              'check_counts_process/start_time': iso_utc_with_tz()}
    try:
        couch.update_ingestion_doc(i_doc, **kwargs)
    except:
        tb = traceback.format_exc(5)
        print "Error updating ingestion document %s\n%s" % (i_doc["_id"], tb)
        return 1

    error_msg = None
    try:
        config = ConfigParser.ConfigParser()                                    
        config.readfp(open('akara.ini'))
        to = [s.strip() for s in config.get('Alert', 'To').split(',')]
        frm = config.get('Alert', 'From')
        body = "%s\n\n%s" % (alerts(i_doc), statistics(i_doc))
        msg = MIMEText(body)
        msg['Subject'] = "%s ingest #%s" % (i_doc['provider'],
                                            i_doc['ingestionSequence'])
        msg['To'] = ', '.join(to)
        msg['From'] = frm
        s = smtplib.SMTP("localhost")
        s.sendmail(frm, to, msg.as_string())
        s.quit()
    except Exception, e:
        error_msg = e
        tb = traceback.format_exc(5)
        print >> sys.stderr, "Error sending alert email: %s\n%s" % (error_msg,
                                                                    tb)
Example #23
def main(argv):
    print "WARNING: Bulk data is now exported/maintained using elasticdump."
    print "See https://github.com/dpla/automation/blob/develop/ansible/roles/exporter/files/export-provider.sh"

    parser = define_arguments()
    args = parser.parse_args(argv[1:])

    couch = Couch()
    ingestion_doc = couch.dashboard_db[args.ingestion_document_id]
    if getprop(ingestion_doc,
               "dashboard_cleanup_process/status") != "complete":
        print "Error, dashboard cleanup process did not complete"
        return -1

    # Update ingestion document
    kwargs = {
        "upload_bulk_data_process/status": "running",
        "upload_bulk_data_process/start_time": iso_utc_with_tz(),
        "upload_bulk_data_process/end_time": None,
        "upload_bulk_data_process/error": None,
    }
    try:
        couch.update_ingestion_doc(ingestion_doc, **kwargs)
    except:
        print "Error updating ingestion document " + ingestion_doc["_id"]
        return -1

    # TODO: as in the fetch_records.py script, we need profile in this scope
    #       and the file shouldn't have to be opened again 
    with open(ingestion_doc["profile_path"], "r") as profile:
        contributor = getprop(json.load(profile), "contributor/name")

    resp = export_database.main([None, "source", contributor, "upload"])
    if resp == -1:
        status = "error"
        error_msg = "Error uploading bulk data"
    else:
        status = "complete"
        error_msg = None

    # Update ingestion document
    kwargs = {
        "upload_bulk_data_process/status": status,
        "upload_bulk_data_process/error": error_msg,
        "upload_bulk_data_process/end_time": iso_utc_with_tz()
    }
    try:
        couch.update_ingestion_doc(ingestion_doc, **kwargs)
    except:
        print "Error updating ingestion document " + ingestion_doc["_id"]
        return -1

    return 0 if status == "complete" else -1
Example #24
def main(argv):
    parser = define_arguments()
    args = parser.parse_args(argv[1:])

    couch = Couch()
    ingestion_doc = couch.dashboard_db[args.ingestion_document_id]
    if getprop(ingestion_doc, "save_process/status") != "complete":
        print "Error, save process did not complete"
        return -1

    # Update ingestion document
    kwargs = {
        "delete_process/status": "running",
        "delete_process/start_time": iso_utc_with_tz()
    }
    try:
        couch.update_ingestion_doc(ingestion_doc, **kwargs)
    except:
        print "Error updating ingestion document " + ingestion_doc["_id"]
        return -1

    resp, total_deleted = couch.process_deleted_docs(ingestion_doc)
    if resp == -1:
        status = "error"
        error_msg = "Error deleting documents; only %s deleted" % total_deleted
    else:
        status = "complete"
        error_msg = None

    msg = "Total documents deleted: %s" % total_deleted
    print msg
    logger.info(msg)

    # Update ingestion document
    kwargs = {
        "delete_process/status": status,
        "delete_process/error": error_msg,
        "delete_process/end_time": iso_utc_with_tz()
    }
    try:
        couch.update_ingestion_doc(ingestion_doc, **kwargs)
    except:
        print "Error updating ingestion document " + ingestion_doc["_id"]
        return -1

    return 0 if status == "complete" else -1
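iso_utc_with_tz() is used in the newer snippets where the older ones call datetime.now().isoformat(). Its definition is not shown; a sketch under the assumption that it returns the current UTC time as an ISO 8601 string with an explicit timezone marker:

from datetime import datetime

def iso_utc_with_tz():
    """Current UTC time as an ISO 8601 string with a timezone suffix (sketch)."""
    return datetime.utcnow().isoformat() + "Z"

The real helper may format the offset differently (for example "+00:00"); only the intent, a timezone-aware UTC timestamp, is implied by the calling code.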
Example #25
def main(argv):
    parser = define_arguments()
    args = parser.parse_args(argv[1:])

    couch = Couch()
    ingestion_doc = couch.dashboard_db[args.ingestion_document_id]
    if getprop(ingestion_doc, "save_process/status") != "complete":
        print "Error, save process did not complete"
        return -1

    # Update ingestion document
    kwargs = {
        "delete_process/status": "running",
        "delete_process/start_time": datetime.now().isoformat()
    }
    try:
        couch.update_ingestion_doc(ingestion_doc, **kwargs)
    except:
        print "Error updating ingestion document " + ingestion_doc["_id"]
        return -1

    resp, total_deleted = couch.process_deleted_docs(ingestion_doc)
    if resp == -1:
        status = "error"
        error_msg = "Error deleting documents; only %s deleted" % total_deleted
    else:
        status = "complete"
        error_msg = None

    msg = "Total documents deleted: %s" % total_deleted
    print msg
    logger.info(msg)

    # Update ingestion document
    kwargs = {
        "delete_process/status": status,
        "delete_process/error": error_msg,
        "delete_process/end_time": datetime.now().isoformat()
    }
    try:
        couch.update_ingestion_doc(ingestion_doc, **kwargs)
    except:
        print "Error updating ingestion document " + ingestion_doc["_id"]
        return -1

    return 0 if status == "complete" else -1
Example #26
def main(argv):
    parser = define_arguments()
    args = parser.parse_args(argv[1:])

    couch = Couch()
    ingestion_doc = couch.dashboard_db[args.ingestion_document_id]
    if getprop(ingestion_doc, "check_counts_process/status") != "complete":
        print "Error, checkk counts process did not complete"
        return -1

    # Update ingestion document
    kwargs = {
        "dashboard_cleanup_process/status": "running",
        "dashboard_cleanup_process/start_time": iso_utc_with_tz(),
        "dashboard_cleanup_process/end_time": None,
        "dashboard_cleanup_process/error": None
    }
    try:
        couch.update_ingestion_doc(ingestion_doc, **kwargs)
    except:
        print "Error updating ingestion document " + ingestion_doc["_id"]
        return -1

    resp, total_deleted = couch.dashboard_cleanup(ingestion_doc)
    if resp == -1:
        status = "error"
        error_msg = "Error deleting documents; only %s deleted" % total_deleted
    else:
        status = "complete"
        error_msg = None
    print "Total dashboard documents deleted: %s" % total_deleted

    # Update ingestion document
    kwargs = {
        "dashboard_cleanup_process/status": status,
        "dashboard_cleanup_process/error": error_msg,
        "dashboard_cleanup_process/end_time": iso_utc_with_tz()
    }
    try:
        couch.update_ingestion_doc(ingestion_doc, **kwargs)
    except:
        print "Error updating ingestion document " + ingestion_doc["_id"]
        return -1

    return 0 if status == "complete" else -1
Example #27
sitemap_path = CONFIG.get("Sitemap", "SitemapPath")

# Compress previous directory
for item in os.listdir(sitemap_path):
    item_path = os.path.join(sitemap_path, item)
    if os.path.isdir(item_path):
        with tarfile.open(item_path + ".tar.gz", "w:gz") as tar:
            tar.add(item_path, arcname=os.path.basename(item_path))
        shutil.rmtree(item_path)

# Create new directory
new_dir = os.path.join(sitemap_path, date.today().strftime("%Y%m%d"))
os.mkdir(new_dir)

# Fetch all item URLs
c = Couch()
urls = []
limit = 50000
count = 1
for doc in c._query_all_docs(c.dpla_db):
    if doc.get("ingestType") == "item":
        # Handle older ingestDates, which do not have timezone info.
        lm_dt = dateutil_parse(doc["ingestDate"])
        if lm_dt.utcoffset() is not None:
            lm = lm_dt.isoformat()
        else:
            lm = lm_dt.isoformat() + "Z"
        urls.append({"loc": "http://dp.la/item/" + doc["id"], "lastmod": lm})
    if len(urls) == limit:
        create_sitemap_files(new_dir, urls, count)
        count += 1
Example #28
def main(argv=None, couch=None, provider_name=None):
    # For testing, couch and provider_name will be provided as params
    if couch:
        provider_name = provider_name
    else:
        couch = Couch()
        parser = define_arguments()
        args = parser.parse_args(argv[1:])
        provider_name = args.provider_name

    provider_legacy_docs = couch._query_all_dpla_provider_docs(provider_name)
    ingest_docs = couch._query_all_provider_ingestion_docs(provider_name)

    # Proceed only if there are no ingestion documents for the provider but
    # there are provider_legacy_docs.
    proceed = True
    if len(ingest_docs) > 0:
        num = len(ingest_docs)
        print >> sys.stderr, "Error: %s ingestion document(s) exists" % num
        proceed = False
    try:
        next_item = next(couch._query_all_dpla_provider_docs(provider_name))
    except:
        print >> sys.stderr, "Error: No documents found for %s" % provider_name
        proceed = False

    def _post(dpla_docs, dashboard_docs, ingest_doc):
        couch._bulk_post_to(couch.dpla_db, dpla_docs)
        couch._bulk_post_to(couch.dashboard_db, dashboard_docs)
        couch._update_ingestion_doc_counts(ingest_doc,
                                           countAdded=len(dashboard_docs))

    if proceed:
        ingest_doc_id = couch.create_ingestion_doc_and_backup_db(provider_name)
        ingest_doc = couch.dashboard_db[ingest_doc_id]

        docs = []
        added_docs = []
        print >> sys.stderr, "Fetching all docs..."
        count = 0
        for doc in provider_legacy_docs:
            count += 1
            doc["ingestionSequence"] = 1
            docs.append(doc)

            added_docs.append({"id": doc["_id"],
                               "type": "record",
                               "status": "added",
                               "provider": provider_name,
                               "ingestionSequence": 1})
            # POST every 1000
            if len(docs) == 1000:
                print >> sys.stderr, "Processed %s docs" % count
                _post(docs, added_docs, ingest_doc)
                # Reset
                docs = []
                added_docs = []

        # Last POST
        if docs:
            print >> sys.stderr, "Processed %s docs" % count
            _post(docs, added_docs, ingest_doc)

        print >> sys.stderr, "Complete" 
Example #29
def print_all_sources(arguments):
    """Print all source names"""
    couch = Couch()
    rows = couch.dpla_view("export_database/all_source_names", group=True)
    for row in rows:
        print "%(key)s (count: %(value)d)" % dict(row)
Example #30
def main(argv):
    parser = define_arguments()
    args = parser.parse_args(argv[1:])

    couch = Couch()
    ingestion_doc = couch.dashboard_db[args.ingestion_document_id]

    # Update ingestion document
    fetch_dir = create_fetch_dir(ingestion_doc["provider"])
    kwargs = {
        "fetch_process/status": "running",
        "fetch_process/data_dir": fetch_dir,
        "fetch_process/start_time": datetime.now().isoformat()
    }
    try:
        couch.update_ingestion_doc(ingestion_doc, **kwargs)
    except:
        logger.error("Error updating ingestion doc %s in %s" %
                     (ingestion_doc["_id"], __name__))
        return -1

    error_msg = []
    fetcher = create_fetcher(ingestion_doc["profile_path"],
                             ingestion_doc["uri_base"])

    print "Fetching records for " + fetcher.provider
    total_fetched_records = 0
    for response in fetcher.fetch_all_data():
        if response["error"]:
            error_msg.extend(iterify(response["error"]))
            print response["error"]
        else:
            # Write records to file
            filename = os.path.join(fetch_dir, str(uuid.uuid4()))
            with open(filename, "w") as f:
                f.write(json.dumps(response["data"]))
            print "Records written to " + filename
            total_fetched_records += len(getprop(response, "data/records"))

    logger.info("Total records fetched: %s" % total_fetched_records)

    # Update ingestion document
    try:
        os.rmdir(fetch_dir)
        # Error if fetch_dir was empty
        status = "error"
        error_msg.append("Error, no records fetched")
        logger.error(error_msg)
    except:
        status = "complete"
    kwargs = {
        "fetch_process/status": status,
        "fetch_process/error": error_msg,
        "fetch_process/end_time": datetime.now().isoformat()
    }
    try:
        couch.update_ingestion_doc(ingestion_doc, **kwargs)
    except:
        logger.error("Error updating ingestion doc %s in %s" %
                     (ingestion_doc["_id"], __name__))
        return -1

    return 0 if status == "complete" else -1
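The fetch examples pass response errors through iterify() before extending the error list. Its definition is not shown; presumably it wraps a scalar in a list so that extend() works whether one error or many are returned:

def iterify(value):
    """Return value unchanged if it is already a list, otherwise wrap it (sketch)."""
    if isinstance(value, list):
        return value
    return [value]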
Example #31
def main():
    couch = Couch()
    couch._sync_views()
Example #32
    with open(args.profile_path, "r") as f:
        try:
            profile = json.load(f)
        except Exception, e:
            print "Error reading profile: %s" % e
            return False

    if args.pipeline in profile:
        pipeline = ",".join(profile[args.pipeline])
    else:
        pipeline = args.pipeline
    provider = profile.get(u"name")
    contributor = profile.get(u"contributor", {})

    # Create ingestion document
    couch = Couch()
    ingestion_doc_id = couch.create_ingestion_doc_and_backup_db(provider)

    # Fetch provider documents
    docs = []
    count = 0
    for doc in couch._query_all_dpla_provider_docs(provider):
        docs.append(doc)
        count += 1
        # Enrich in batches of 1000
        if len(docs) == 1000:
            enriched_docs = enrich(docs, args.uri_base, pipeline)
            couch.process_and_post_to_dpla(enriched_docs, ingestion_doc_id)
            print "Enriched %s documents" % count
            docs = []
    # Enrich last batch
Example #33
    with open(args.profile_path, "r") as f:
        try:
            profile = json.load(f)
        except Exception, e:
            print "Error reading profile: %s" % e
            return False

    if args.pipeline in profile:
        pipeline = ",".join(profile[args.pipeline])
    else:
        pipeline = args.pipeline
    provider = profile.get(u"name")
    contributor = profile.get(u"contributor", {})

    # Create ingestion document
    couch = Couch()
    ingestion_doc_id = couch.create_ingestion_doc_and_backup_db(provider)

    # Fetch provider documents
    docs = []
    count = 0
    for doc in couch._query_all_dpla_provider_docs(provider):
        docs.append(doc)
        count += 1
        # Enrich in batches of 1000
        if len(docs) == 1000:
            enriched_docs = enrich(docs, args.uri_base, pipeline)
            couch.process_and_post_to_dpla(enriched_docs, ingestion_doc_id)
            print "Enriched %s documents" % count
            docs = []
    # Enrich last batch
Example #34
def main(argv):
    parser = define_arguments()
    args = parser.parse_args(argv[1:])

    couch = Couch()
    ingestion_doc = couch.dashboard_db[args.ingestion_document_id]
    if getprop(ingestion_doc, "fetch_process/status") != "complete":
        print "Cannot enrich, fetch process did not complete"
        return -1

    # Update ingestion document
    status = "running"
    enrich_dir = create_enrich_dir(ingestion_doc["provider"])
    kwargs = {
        "enrich_process/status": status,
        "enrich_process/data_dir": enrich_dir,
        "enrich_process/start_time": datetime.now().isoformat(),
        "enrich_process/end_time": None,
        "enrich_process/error": None,
        "enrich_process/total_items": None,
        "enrich_process/total_collections": None,
        "enrich_process/missing_id": None,
        "enrich_process/missing_source_resource": None,
    }
    try:
        couch.update_ingestion_doc(ingestion_doc, **kwargs)
    except:
        print "Error updating ingestion document " + ingestion_doc["_id"]
        return -1

    # Set the headers sent with the enrich request
    with open(ingestion_doc["profile_path"], "r") as f:
        profile = json.loads(f.read())
    headers = {
        "Source": ingestion_doc["provider"],
        "Content-Type": "application/json",
        "Pipeline-Item": ",".join(profile["enrichments_item"]),
        "Pipeline-Coll": ",".join(profile["enrichments_coll"]),
    }

    errors = []
    fetch_dir = getprop(ingestion_doc, "fetch_process/data_dir")

    # Counts for logger info
    enriched_items = 0
    enriched_colls = 0
    missing_id = 0
    missing_source_resource = 0

    file_count = 0
    files = os.listdir(fetch_dir)
    for filename in files:
        file_count += 1
        filepath = os.path.join(fetch_dir, filename)
        with open(filepath, "r") as f:
            try:
                data = json.loads(f.read())
            except:
                errors.append("Error loading " + filepath)
                break

        # Enrich
        print "Enriching file %s (%s of %s)" % (filepath, file_count, len(files))
        enrich_path = ingestion_doc["uri_base"] + "/enrich"
        resp, content = H.request(enrich_path, "POST", body=json.dumps(data), headers=headers)
        if not resp["status"].startswith("2"):
            errors.append("Error (status %s) enriching data from %s" % (resp["status"], filepath))
            print "Stopped enrichment process: %s" % errors
            status = "error"
            break

        data = json.loads(content)
        enriched_records = data["enriched_records"]

        # Update counts
        enriched_items += data["enriched_item_count"]
        enriched_colls += data["enriched_coll_count"]
        missing_id += data["missing_id_count"]
        missing_source_resource += data["missing_source_resource_count"]
        errors.extend(data["errors"])

        # Write enriched data to file
        with open(os.path.join(enrich_dir, filename), "w") as f:
            f.write(json.dumps(enriched_records))

    print "Enriched items: %s" % enriched_items
    print "Enriched collections: %s" % enriched_colls
    print "Missing ID: %s" % missing_id
    print "Missing sourceResource: %s" % missing_source_resource

    # Update ingestion document
    if not status == "error":
        status = "complete"
    kwargs = {
        "enrich_process/status": status,
        "enrich_process/error": errors,
        "enrich_process/end_time": datetime.now().isoformat(),
        "enrich_process/total_items": enriched_items,
        "enrich_process/total_collections": enriched_colls,
        "enrich_process/missing_id": missing_id,
        "enrich_process/missing_source_resource": missing_source_resource,
    }
    try:
        couch.update_ingestion_doc(ingestion_doc, **kwargs)
    except:
        print "Error updating ingestion document " + ingestion_doc["_id"]
        return -1

    # Compress fetch directory, then delete
    # make_tarfile(fetch_dir)
    # shutil.rmtree(fetch_dir)

    return 0 if status == "complete" else -1
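In Example #34 (and #38 below) the enrich request is sent with H.request(...). H is not defined in the snippets; given the (resp, content) return pair and the resp["status"] check, it is presumably a module-level httplib2 client, e.g.:

import httplib2

# Assumption: shared HTTP client used by the enrich POST calls above.
H = httplib2.Http()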
Example #35
    with open(args.profile_path, "r") as f:
        try:
            profile = json.load(f)
        except Exception, e:
            print "Error reading profile: %s" % e
            return False

    if args.pipeline in profile:
        pipeline = ",".join(profile[args.pipeline])
    else:
        pipeline = args.pipeline
    provider = profile.get(u"name")
    contributor = profile.get(u"contributor", {})

    # Create ingestion document
    couch = Couch()
    ingestion_doc_id = create_ingestion_document.main([None,
                                                       args.profile_path])
    ingestion_doc = couch.dashboard_db[ingestion_doc_id]

    # Update ingestion document
    kwargs = {
        "poll_storage_process/status": "running",
        "poll_storage_process/start_time": iso_utc_with_tz(),
        "poll_storage_process/end_time": None,
    }
    try:
        couch.update_ingestion_doc(ingestion_doc, **kwargs)
    except:
        print "Error updating ingestion document " + ingestion_doc["_id"]
        return
Example #36
sitemap_path = CONFIG.get("Sitemap", "SitemapPath")

# Compress previous directory
for item in os.listdir(sitemap_path):
    item_path = os.path.join(sitemap_path, item)
    if os.path.isdir(item_path):
        with tarfile.open(item_path + ".tar.gz", "w:gz") as tar:
            tar.add(item_path, arcname=os.path.basename(item_path))
        shutil.rmtree(item_path)

# Create new directory
new_dir = os.path.join(sitemap_path, date.today().strftime("%Y%m%d"))
os.mkdir(new_dir)

# Fetch all item URLs
c = Couch()
urls = []
limit = 50000
count = 1
for doc in c._query_all_docs(c.dpla_db):
    if doc.get("ingestType") == "item":
        # Handle older ingestDates, which do not have timezone info.
        lm_dt = dateutil_parse(doc["ingestDate"])
        if lm_dt.utcoffset() is not None:
            lm = lm_dt.isoformat()
        else:
            lm = lm_dt.isoformat() + "Z"
        urls.append({"loc": "http://dp.la/item/" + doc["id"], "lastmod": lm})
    if len(urls) == limit:
        create_sitemap_files(new_dir, urls, count)
        count += 1
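create_sitemap_files() is not shown in either sitemap snippet (and, as excerpted, both cut off before the urls list is reset or the final partial batch is written). A rough sketch, assuming it writes one standard sitemap XML file per batch of URL dicts into the new directory; the file naming is a guess:

import os

def create_sitemap_files(directory, urls, count):
    """Write one sitemap XML file for a batch of {"loc", "lastmod"} dicts (sketch)."""
    path = os.path.join(directory, "sitemap%s.xml" % count)
    with open(path, "w") as f:
        f.write('<?xml version="1.0" encoding="UTF-8"?>\n')
        f.write('<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">\n')
        for url in urls:
            f.write("  <url><loc>%s</loc><lastmod>%s</lastmod></url>\n" %
                    (url["loc"], url["lastmod"]))
        f.write("</urlset>\n")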
Example #37
def main(argv):
    global threads_working
    parser = define_arguments()
    args = parser.parse_args(argv[1:])
    couch = Couch()
    ingestion_doc = couch.dashboard_db[args.ingestion_document_id]
    fetch_dir = getprop(ingestion_doc, "fetch_process/data_dir")
    enrich_dir = create_enrich_dir(ingestion_doc["provider"])
    if getprop(ingestion_doc, "fetch_process/status") != "complete":
        print >> sys.stderr, "Cannot enrich, fetch process did not complete"
        return 1

    # Update ingestion document
    status = "running"
    kwargs = {
        "enrich_process/status": status,
        "enrich_process/data_dir": enrich_dir,
        "enrich_process/start_time": iso_utc_with_tz(),
        "enrich_process/end_time": None,
        "enrich_process/error": None,
        "enrich_process/total_items": None,
        "enrich_process/total_collections": None,
        "enrich_process/missing_id": None,
        "enrich_process/missing_source_resource": None
    }
    try:
        couch.update_ingestion_doc(ingestion_doc, **kwargs)
    except:
        print >> sys.stderr, "Error updating ingestion document " + \
                ingestion_doc["_id"]
        return 1

    with open(ingestion_doc["profile_path"], "r") as f:
        profile = json.loads(f.read())

    # Counts for logger info
    stats = {'enriched_items': 0,
             'enriched_colls': 0,
             'missing_id': 0,
             'missing_source_resource': 0
            }

    # Initialize queue and threads
    queue, thread_errors = queue_and_errors(ingestion_doc=ingestion_doc,
                                            profile=profile,
                                            stats=stats,
                                            enrich_dir=enrich_dir)
    # Initialize list of input files
    listing = os.listdir(fetch_dir)
    # Initialize counters and statistics
    dashboard_errors = []
    file_count = 0
    status = None
    total_files = len(listing)
    files = iter(listing)

    try:
        # Keep the queue full of filenames
        while True:
            time.sleep(0.25)
            try:
                if print_errors_thrown(thread_errors):
                    dashboard_errors.extend(thread_errors)
                    raise Exception()
                if not queue.full():
                    basename = files.next()
                    filename = os.path.join(fetch_dir, basename)
                    file_count += 1
                    print "Enqueuing: %s (%s of %s)" % \
                            (filename, file_count, total_files)
                    queue.put(filename)
            except StopIteration:
                break
        # Wait for queue to be empty before returning
        while True:
            if queue.empty() and not threads_working:
                if not print_errors_thrown(thread_errors):
                    break
                else:
                    dashboard_errors.extend(thread_errors)
                    raise Exception()
            time.sleep(0.25)
    except KeyboardInterrupt:
        status = "error"
        msg = "\nCaught keyboard interrupt"
        print >> sys.stderr, msg
        dashboard_errors.append(msg)
    except Exception as e:
        if e.message:
            print >> sys.stderr, e.message
        status = "error"
        dashboard_errors.append(e.message)
    finally:
        print "Enriched items: %s" % stats['enriched_items']
        print "Enriched collections: %s" % stats['enriched_colls']
        print "Missing ID: %s" % stats['missing_id']
        print "Missing sourceResource: %s" % stats['missing_source_resource']
        if not status == "error":
            status = "complete"
        # Prepare fields for ingestion document update
        couch_kwargs = {
            "enrich_process/status": status,
            "enrich_process/error": dashboard_errors,
            "enrich_process/end_time": iso_utc_with_tz(),
            "enrich_process/total_items": stats['enriched_items'],
            "enrich_process/total_collections": stats['enriched_colls'],
            "enrich_process/missing_id": stats['missing_id'],
            "enrich_process/missing_source_resource": \
                    stats['missing_source_resource']
        }

    try:
        # Update ingestion document
        couch.update_ingestion_doc(ingestion_doc, **couch_kwargs)
    except:
        print >> sys.stderr, "Error updating ingestion document " + \
                             ingestion_doc["_id"]
        return 1

    return 0 if status == "complete" else 1
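print_errors_thrown() is called from the threaded enrich examples (#37 and #44) and from the threaded fetch in Example #42. Judging by how its return value is used, it prints any errors collected by the worker threads and reports whether there were any. A sketch under that assumption:

import sys

def print_errors_thrown(thread_errors):
    """Print errors collected from worker threads; return True if any exist (sketch)."""
    for err in thread_errors:
        print >> sys.stderr, err
    return bool(thread_errors)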
Example #38
def main(argv):
    parser = define_arguments()
    args = parser.parse_args(argv[1:])

    couch = Couch()
    ingestion_doc = couch.dashboard_db[args.ingestion_document_id]
    if getprop(ingestion_doc, "fetch_process/status") != "complete":
        print "Cannot enrich, fetch process did not complete"
        return -1

    # Update ingestion document
    enrich_dir = create_enrich_dir(ingestion_doc["provider"])
    kwargs = {
        "enrich_process/status": "running",
        "enrich_process/data_dir": enrich_dir,
        "enrich_process/start_time": datetime.now().isoformat()
    }
    try:
        couch.update_ingestion_doc(ingestion_doc, **kwargs)
    except:
        print "Error updating ingestion document " + ingestion_doc["_id"]
        return -1

    # Set the headers sent with the enrich request
    with open(ingestion_doc["profile_path"], "r") as f:
        profile = json.loads(f.read())
    headers = {
        "Source": ingestion_doc["provider"],
        "Collection": "",
        "Content-Type": "application/json",
        "Pipeline-Rec": ",".join(profile["enrichments_rec"]),
        "Pipeline-Coll": ",".join(profile["enrichments_coll"])
    }

    error_msg = None
    fetch_dir = getprop(ingestion_doc, "fetch_process/data_dir")
    
    total_enriched_records = 0
    for filename in os.listdir(fetch_dir):
        filepath = os.path.join(fetch_dir, filename)
        with open(filepath, "r") as f:
            try:
                data = json.loads(f.read())
            except:
                error_msg = "Error loading " + filepath
                break

        # Enrich
        print "Enriching file " + filepath
        enrich_path = ingestion_doc["uri_base"] + "/enrich"
        resp, content = H.request(enrich_path, "POST", body=json.dumps(data),
                                  headers=headers)
        if not resp["status"].startswith("2"):
            error_msg = "Error (status %s) enriching data from %s" % \
                        (resp["status"], filepath)
            print "Stopped enrichment process: " + error_msg
            break

        data = json.loads(content)
        enriched_records = data["enriched_records"]
        total_enriched_records += data["enriched_records_count"]

        # Write enriched data to file
        with open(os.path.join(enrich_dir, filename), "w") as f:
            f.write(json.dumps(enriched_records))

    logger.info("Total records enriched: %s" % total_enriched_records)

    # Update ingestion document
    if error_msg is not None:
        status = "error"
    else:
        status = "complete"
    kwargs = {
        "enrich_process/status": status,
        "enrich_process/error": error_msg,
        "enrich_process/end_time": datetime.now().isoformat()
    }
    try:
        couch.update_ingestion_doc(ingestion_doc, **kwargs)
    except:
        print "Error updating ingestion document " + ingestion_doc["_id"]
        return -1

    # Compress fetch directory, then delete
    make_tarfile(fetch_dir)
    shutil.rmtree(fetch_dir)

    return 0 if status == "complete" else -1
Example #39
def main(argv):
    parser = define_arguments()
    args = parser.parse_args(argv[1:])

    couch = Couch()
    ingestion_doc = couch.dashboard_db[args.ingestion_document_id]

    # Update ingestion document
    fetch_dir = create_fetch_dir(ingestion_doc["provider"])
    kwargs = {
        "fetch_process/status": "running",
        "fetch_process/data_dir": fetch_dir,
        "fetch_process/start_time": datetime.now().isoformat(),
        "fetch_process/end_time": None,
        "fetch_process/error": None,
        "fetch_process/total_items": None,
        "fetch_process/total_collections": None,
    }
    try:
        couch.update_ingestion_doc(ingestion_doc, **kwargs)
    except:
        logger.error("Error updating ingestion doc %s in %s" % (ingestion_doc["_id"], __name__))
        return -1

    error_msg = []
    config_file = "akara.ini"
    fetcher = create_fetcher(ingestion_doc["profile_path"], ingestion_doc["uri_base"], config_file)

    print "Fetching records for " + fetcher.provider
    total_items = 0
    total_collections = 0
    for response in fetcher.fetch_all_data():
        if response["errors"]:
            error_msg.extend(iterify(response["errors"]))
            print response["errors"]
        if response["records"]:
            # Write records to file
            filename = os.path.join(fetch_dir, str(uuid.uuid4()))
            with open(filename, "w") as f:
                f.write(json.dumps(response["records"]))

            items = len([record for record in response["records"] if not record.get("ingestType") == "collection"])
            total_items += items
            total_collections += len(response["records"]) - items

    print "Total items: %s" % total_items
    print "Total collections: %s" % total_collections

    # Update ingestion document
    try:
        os.rmdir(fetch_dir)
        # Error if fetch_dir was empty
        status = "error"
        error_msg.append("Error, no records fetched")
        logger.error(error_msg)
    except:
        status = "complete"
    kwargs = {
        "fetch_process/status": status,
        "fetch_process/error": error_msg,
        "fetch_process/end_time": datetime.now().isoformat(),
        "fetch_process/total_items": total_items,
        "fetch_process/total_collections": total_collections,
    }
    try:
        couch.update_ingestion_doc(ingestion_doc, **kwargs)
    except:
        logger.error("Error updating ingestion doc %s in %s" % (ingestion_doc["_id"], __name__))
        return -1

    return 0 if status == "complete" else -1
Example #40
def main(argv=None, couch=None, provider_name=None):
    # For testing, couch and provider_name will be provided as params
    if couch:
        provider_name = provider_name
    else:
        couch = Couch()
        parser = define_arguments()
        args = parser.parse_args(argv[1:])
        provider_name = args.provider_name

    provider_legacy_docs = couch._query_all_dpla_provider_docs(provider_name)
    ingest_docs = couch._query_all_provider_ingestion_docs(provider_name)

    # Proceed only if there are no ingestion documents for the provider but
    # there are provider_legacy_docs.
    proceed = True
    if len(ingest_docs) > 0:
        num = len(ingest_docs)
        print >> sys.stderr, "Error: %s ingestion document(s) exists" % num
        proceed = False
    try:
        next_item = next(couch._query_all_dpla_provider_docs(provider_name))
    except:
        print >> sys.stderr, "Error: No documents found for %s" % provider_name
        proceed = False

    def _post(dpla_docs, dashboard_docs, ingest_doc_id):
        couch._bulk_post_to(couch.dpla_db, dpla_docs)
        couch._bulk_post_to(couch.dashboard_db, dashboard_docs)
        couch._update_ingestion_doc_counts(ingest_doc_id,
                                           countAdded=len(dashboard_docs))

    if proceed:
        ingest_doc_id = couch.create_ingestion_doc_and_backup_db(provider_name)

        docs = []
        added_docs = []
        print >> sys.stderr, "Fetching all docs..."
        count = 0
        for doc in provider_legacy_docs:
            count += 1
            doc["ingestionSequence"] = 1
            docs.append(doc)

            added_docs.append({
                "id": doc["_id"],
                "type": "record",
                "status": "added",
                "provider": provider_name,
                "ingestionSequence": 1
            })
            # POST every 1000
            if len(docs) == 1000:
                print >> sys.stderr, "Processed %s docs" % count
                _post(docs, added_docs, ingest_doc_id)
                # Reset
                docs = []
                added_docs = []

        # Last POST
        if docs:
            print >> sys.stderr, "Processed %s docs" % count
            _post(docs, added_docs, ingest_doc_id)

        print >> sys.stderr, "Complete"
Example #41
    with open(args.profile_path, "r") as f:
        try:
            profile = json.load(f)
        except Exception, e:
            print "Error reading profile: %s" % e
            return False

    if args.pipeline in profile:
        pipeline = ",".join(profile[args.pipeline])
    else:
        pipeline = args.pipeline
    provider = profile.get(u"name")
    contributor = profile.get(u"contributor", {})

    # Create ingestion document
    couch = Couch()
    ingestion_doc_id = create_ingestion_document.main([None,
                                                       args.profile_path])
    ingestion_doc = couch.dashboard_db[ingestion_doc_id]
    ingestion_doc["poll_storage_process"] = {"status": "running"}
    couch.dashboard_db.update([ingestion_doc])

    # Fetch provider documents
    docs = []
    count = 0
    for doc in couch._query_all_dpla_provider_docs(provider):
        docs.append(doc)
        count += 1
        # Enrich in batches of 1000
        if len(docs) == 1000:
            enriched_docs = enrich(docs, args.uri_base, pipeline)
Example #42
def main(argv):
    parser = define_arguments()
    args = parser.parse_args(argv[1:])

    couch = Couch()
    ingestion_doc = couch.dashboard_db[args.ingestion_document_id]

    # TODO:  profile should be passed to create_fetcher, below.
    #        We need profile in this scope, and the file shouldn't have to
    #        be opened again by create_fetcher.
    with open(ingestion_doc["profile_path"], "r") as f:
        profile = json.load(f)

    # Update ingestion document
    fetch_dir = create_fetch_dir(ingestion_doc["provider"])
    kwargs = {
        "fetch_process/status": "running",
        "fetch_process/data_dir": fetch_dir,
        "fetch_process/start_time": iso_utc_with_tz(),
        "fetch_process/end_time": None,
        "fetch_process/error": None,
        "fetch_process/total_items": None,
        "fetch_process/total_collections": None
    }
    try:
        couch.update_ingestion_doc(ingestion_doc, **kwargs)
    except:
        logger.error("Error updating ingestion doc %s in %s" %
                     (ingestion_doc["_id"], __name__))
        return -1

    error_msg = []
    config_file = os.environ.get("DPLA_CONFIG_FILE", "akara.ini")

    fetcher = create_fetcher(ingestion_doc["profile_path"],
                             ingestion_doc["uri_base"],
                             config_file)

    print "Fetching records for %s" % ingestion_doc["provider"]
    stats = {
        "total_items": 0,
        "total_collections": 0
    }
    try:
        threads = int(profile.get("fetcher_threads")) or 1
        print "Threads: %d" % threads
    except:
        print >> sys.stderr, "Can not determine fetcher threads, so using 1"
        threads = 1
    sets = None
    sets_supported = (profile.get("sets") != "NotSupported")
    if threads > 1 and sets_supported and hasattr(fetcher, "fetch_sets"):
        sets_err, s = fetcher.fetch_sets()
        if s:
            sets = iter(s)
        else:
            print >> sys.stderr, "Could not get sets: ", sets_err
            return -1
        queue, th_errors, d_errors = queue_and_errors(threads,
                                                      ingestion_doc,
                                                      config_file,
                                                      fetch_dir,
                                                      stats)
        status = None
        try:
            while True:
                time.sleep(0.1)
                try:
                    if print_errors_thrown(th_errors):
                        d_errors.extend(th_errors)
                        raise Exception()
                    if not queue.full():
                        queue.put(sets.next())
                except StopIteration:
                    break
            # Wait for queue to be empty before returning
            while True:
                if queue.empty() and not threads_working:
                    if not print_errors_thrown(th_errors):
                        break
                    else:
                        d_errors.extend(th_errors)
                        raise Exception()
                time.sleep(0.1)
        except KeyboardInterrupt:
            status = "error"
            msg = "\nCaught keyboard interrupt"
            print >> sys.stderr, msg
            d_errors.append(msg)
        except Exception as e:
            if e.message:
                print >> sys.stderr, e.message
            status = "error"
            d_errors.append(e.message)
        finally:
            if not status == "error":
                status = "complete"
    else:  # not threads
        rv = fetch_all_for_set(None, fetcher, fetch_dir)
        stats["total_items"] += rv["total_items"]
        stats["total_collections"] += rv["total_collections"]
        error_msg += rv["errors"]

    print "Total items: %s" % stats["total_items"]
    print "Total collections: %s" % stats["total_collections"]


    # Update ingestion document
    try:
        os.rmdir(fetch_dir)
        # Error if fetch_dir was empty
        status = "error"
        error_msg.append("Error, no records fetched")
        logger.error(error_msg)
    except:
        status = "complete"
    kwargs = {
        "fetch_process/status": status,
        "fetch_process/error": error_msg,
        "fetch_process/end_time": iso_utc_with_tz(),
        "fetch_process/total_items": stats["total_items"],
        "fetch_process/total_collections": stats["total_collections"]
    }
    try:
        couch.update_ingestion_doc(ingestion_doc, **kwargs)
    except:
        logger.error("Error updating ingestion doc %s in %s" %
                     (ingestion_doc["_id"], __name__))
        return -1

    return 0 if status == "complete" else -1
Example #43
    with open(args.profile_path, "r") as f:
        try:
            profile = json.load(f)
        except Exception, e:
            print "Error reading profile: %s" % e
            return False

    if args.pipeline in profile:
        pipeline = ",".join(profile[args.pipeline])
    else:
        pipeline = args.pipeline
    provider = profile.get(u"name")
    contributor = profile.get(u"contributor", {})

    # Create ingestion document
    couch = Couch()
    ingestion_doc_id = create_ingestion_document.main([None,
                                                       args.profile_path])
    ingestion_doc = couch.dashboard_db[ingestion_doc_id]

    # Update ingestion document
    kwargs = {
        "poll_storage_process/status": "running",
        "poll_storage_process/start_time": datetime.now().isoformat(),
        "poll_storage_process/end_time": None,
    }
    try:
        couch.update_ingestion_doc(ingestion_doc, **kwargs)
    except:
        print "Error updating ingestion document " + ingestion_doc["_id"]
        return
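
# The kwargs above address fields with slash-separated paths such as
# "poll_storage_process/status". A minimal sketch of how such a path can be applied
# to a nested document, assuming keys are split on "/" and intermediate dicts are
# created as needed; this setprop is illustrative, not necessarily how
# update_ingestion_doc is implemented.
def setprop(doc, path, value):
    """Set a nested value in doc, e.g. "a/b" -> doc["a"]["b"]."""
    keys = path.split("/")
    target = doc
    for key in keys[:-1]:
        target = target.setdefault(key, {})
    target[keys[-1]] = value

doc = {}
setprop(doc, "poll_storage_process/status", "running")
# doc == {"poll_storage_process": {"status": "running"}}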
Example #44
def main(argv):
    global threads_working
    parser = define_arguments()
    args = parser.parse_args(argv[1:])
    couch = Couch()
    ingestion_doc = couch.dashboard_db[args.ingestion_document_id]
    fetch_dir = getprop(ingestion_doc, "fetch_process/data_dir")
    enrich_dir = create_enrich_dir(ingestion_doc["provider"])
    if getprop(ingestion_doc, "fetch_process/status") != "complete":
        print >> sys.stderr, "Cannot enrich, fetch process did not complete"
        return 1

    # Update ingestion document
    status = "running"
    kwargs = {
        "enrich_process/status": status,
        "enrich_process/data_dir": enrich_dir,
        "enrich_process/start_time": iso_utc_with_tz(),
        "enrich_process/end_time": None,
        "enrich_process/error": None,
        "enrich_process/total_items": None,
        "enrich_process/total_collections": None,
        "enrich_process/missing_id": None,
        "enrich_process/missing_source_resource": None
    }
    try:
        couch.update_ingestion_doc(ingestion_doc, **kwargs)
    except:
        print >> sys.stderr, "Error updating ingestion document " + \
                ingestion_doc["_id"]
        return 1

    with open(ingestion_doc["profile_path"], "r") as f:
        profile = json.loads(f.read())

    # Counts for logger info
    stats = {
        'enriched_items': 0,
        'enriched_colls': 0,
        'missing_id': 0,
        'missing_source_resource': 0
    }

    # Initialize queue and threads
    queue, thread_errors = queue_and_errors(ingestion_doc=ingestion_doc,
                                            profile=profile,
                                            stats=stats,
                                            enrich_dir=enrich_dir)
    # Initialize list of input files
    listing = os.listdir(fetch_dir)
    # Initialize counters and statistics
    dashboard_errors = []
    file_count = 0
    status = None
    total_files = len(listing)
    files = iter(listing)

    try:
        # Keep the queue full of filenames
        while True:
            time.sleep(0.25)
            try:
                if print_errors_thrown(thread_errors):
                    dashboard_errors.extend(thread_errors)
                    raise Exception()
                if not queue.full():
                    basename = files.next()
                    filename = os.path.join(fetch_dir, basename)
                    file_count += 1
                    print "Enqueuing: %s (%s of %s)" % \
                            (filename, file_count, total_files)
                    queue.put(filename)
            except StopIteration:
                break
        # Wait for queue to be empty before returning
        while True:
            if queue.empty() and not threads_working:
                if not print_errors_thrown(thread_errors):
                    break
                else:
                    dashboard_errors.extend(thread_errors)
                    raise Exception()
            time.sleep(0.25)
    except KeyboardInterrupt:
        status = "error"
        msg = "\nCaught keyboard interrupt"
        print >> sys.stderr, msg
        dashboard_errors.append(msg)
    except Exception as e:
        if e.message:
            print >> sys.stderr, e.message
        status = "error"
        dashboard_errors.append(e.message)
    finally:
        print "Enriched items: %s" % stats['enriched_items']
        print "Enriched collections: %s" % stats['enriched_colls']
        print "Missing ID: %s" % stats['missing_id']
        print "Missing sourceResource: %s" % stats['missing_source_resource']
        if status != "error":
            status = "complete"
        # Prepare fields for ingestion document update
        couch_kwargs = {
            "enrich_process/status": status,
            "enrich_process/error": dashboard_errors,
            "enrich_process/end_time": iso_utc_with_tz(),
            "enrich_process/total_items": stats['enriched_items'],
            "enrich_process/total_collections": stats['enriched_colls'],
            "enrich_process/missing_id": stats['missing_id'],
            "enrich_process/missing_source_resource": \
                    stats['missing_source_resource']
        }

    try:
        # Update ingestion document
        couch.update_ingestion_doc(ingestion_doc, **couch_kwargs)
    except:
        print >> sys.stderr, "Error updating ingestion document " + \
                             ingestion_doc["_id"]
        return 1

    return 0 if status == "complete" else 1
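
# print_errors_thrown, referenced throughout the example above, is not shown. A
# minimal sketch under the assumption that it reports any errors collected by the
# worker threads and tells the caller whether to abort; the real helper may differ.
import sys

def print_errors_thrown(thread_errors):
    """Print collected thread errors to stderr; return True if there were any."""
    if not thread_errors:
        return False
    for error in thread_errors:
        sys.stderr.write("%s\n" % error)
    return True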
Example #45
def print_all_sources(arguments):
    """Print all source names"""
    couch = Couch()
    rows = couch.dpla_view("export_database/all_source_names", group=True)
    for row in rows:
        print "%(key)s (count: %(value)d)" % dict(row)
def main(argv):
    parser = define_arguments()
    args = parser.parse_args(argv[1:])

    batch_size = 500

    couch = Couch()
    ingestion_doc = couch.dashboard_db[args.ingestion_document_id]
    if getprop(ingestion_doc, "enrich_process/status") != "complete":
        print "Cannot save, enrich process did not complete"
        return -1

    # Update ingestion document
    kwargs = {
        "save_process/status": "running",
        "save_process/start_time": iso_utc_with_tz(),
        "save_process/end_time": None,
        "save_process/error": None,
        "save_process/total_saved": None
    }
    try:
        couch.update_ingestion_doc(ingestion_doc, **kwargs)
    except:
        print "Error updating ingestion document " + ingestion_doc["_id"]
        return -1

    # Back up provider data
    if args.backup:
        resp = couch._back_up_data(ingestion_doc)

        if resp == -1:
            # Fatal error, do not continue with save process
            kwargs = {
                "save_process/status": "error",
                "save_process/end_time": iso_utc_with_tz(),
                "save_process/error": "Error backing up DPLA records"
            }
            couch.update_ingestion_doc(ingestion_doc, **kwargs)
            return resp

    error_msg = None
    enrich_dir = getprop(ingestion_doc, "enrich_process/data_dir")
    total_items = 0
    total_collections = 0
    sync_point = 5000
    docs = {}
    for file in os.listdir(enrich_dir):
        filename = os.path.join(enrich_dir, file)
        with open(filename, "r") as f:
            try:
                file_docs = json.loads(f.read())
            except:
                error_msg = "Error loading " + filename
                break

        # Save when docs is about to exceed the batch size
        print >> sys.stderr, "Read file %s" % filename
        if docs and len(docs) + len(file_docs) > batch_size:
            resp, error_msg = couch.process_and_post_to_dpla(docs,
                                                             ingestion_doc)
            if resp == -1:
                docs = None
                break

            items = len([doc for doc in docs.values() if
                         doc.get("ingestType") == "item"])
            total_items += items
            total_collections += len(docs) - items
            print "Saved %s documents" % (total_items + total_collections)

            #if total_items > sync_point:
            #    print "Syncing views"
            #    couch.sync_views(couch.dpla_db.name)
            #    sync_point = total_items + 10000

            # Set docs for the next iteration
            docs = file_docs
        else:
            docs.update(file_docs)

    # Last save
    if docs:
        resp, error_msg = couch.process_and_post_to_dpla(docs,
                                                         ingestion_doc)
        if resp != -1:
            items = len([doc for doc in docs.values() if
                         doc.get("ingestType") == "item"])
            total_items += items
            total_collections += len(docs) - items
            print "Saved %s documents" % (total_items + total_collections)
            #print "Syncing views"
            #couch.sync_views(couch.dpla_db.name)

    print "Total items: %s" % total_items
    print "Total collections: %s" % total_collections

    if error_msg:
        status = "error"
    else:
        status = "complete"
    kwargs = {
        "save_process/status": status,
        "save_process/error": error_msg,
        "save_process/end_time": iso_utc_with_tz(),
        "save_process/total_items": total_items,
        "save_process/total_collections": total_collections
    }
    try:
        couch.update_ingestion_doc(ingestion_doc, **kwargs)
    except:
        print "Error updating ingestion document " + ingestion_doc["_id"]
        return -1

    # Compress enrich dir, then delete
    make_tarfile(enrich_dir)
    shutil.rmtree(enrich_dir)

    return total_items if status == "complete" else -1
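
# The save loop above accumulates documents from each enriched file and posts a
# batch whenever the next file would push it past batch_size. A compact sketch of
# just that batching decision; save_in_batches and post_batch are illustrative
# stand-ins (post_batch plays the role of couch.process_and_post_to_dpla).
def save_in_batches(file_doc_sets, batch_size, post_batch):
    """Post accumulated docs via post_batch(docs); yield the size of each batch."""
    docs = {}
    for file_docs in file_doc_sets:
        if docs and len(docs) + len(file_docs) > batch_size:
            post_batch(docs)
            yield len(docs)
            docs = dict(file_docs)
        else:
            docs.update(file_docs)
    if docs:
        post_batch(docs)
        yield len(docs)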
Example #47
def main(argv):
    parser = define_arguments()
    args = parser.parse_args(argv[1:])

    couch = Couch()
    ingestion_doc = couch.dashboard_db[args.ingestion_document_id]

    # TODO:  profile should be passed to create_fetcher, below.
    #        We need profile in this scope, and the file shouldn't have to
    #        be opened again by create_fetcher.
    with open(ingestion_doc["profile_path"], "r") as f:
        profile = json.load(f)

    # Update ingestion document
    fetch_dir = create_fetch_dir(ingestion_doc["provider"])
    kwargs = {
        "fetch_process/status": "running",
        "fetch_process/data_dir": fetch_dir,
        "fetch_process/start_time": iso_utc_with_tz(),
        "fetch_process/end_time": None,
        "fetch_process/error": None,
        "fetch_process/total_items": None,
        "fetch_process/total_collections": None
    }
    try:
        couch.update_ingestion_doc(ingestion_doc, **kwargs)
    except:
        logger.error("Error updating ingestion doc %s in %s" %
                     (ingestion_doc["_id"], __name__))
        return -1

    error_msg = []
    config_file = "akara.ini"

    fetcher = create_fetcher(ingestion_doc["profile_path"],
                             ingestion_doc["uri_base"],
                             config_file)

    print "Fetching records for %s" % ingestion_doc["provider"]
    stats = {
        "total_items": 0,
        "total_collections": 0
    }
    try:
        threads = int(profile.get("fetcher_threads")) or 1
        print "Threads: %d" % threads
    except:
        print >> sys.stderr, "Can not determine fetcher threads, so using 1"
        threads = 1
    sets = None
    sets_supported = (profile.get("sets") != "NotSupported")
    if threads > 1 and sets_supported and hasattr(fetcher, "fetch_sets"):
        sets_err, s = fetcher.fetch_sets()
        if s:
            sets = iter(s)
        else:
            print >> sys.stderr, "Could not get sets: ", sets_err
            return -1
        queue, th_errors, d_errors = queue_and_errors(threads,
                                                      ingestion_doc,
                                                      config_file,
                                                      fetch_dir,
                                                      stats)
        status = None
        try:
            while True:
                time.sleep(0.1)
                try:
                    if print_errors_thrown(th_errors):
                        d_errors.extend(th_errors)
                        raise Exception()
                    if not queue.full():
                        queue.put(sets.next())
                except StopIteration:
                    break
            # Wait for queue to be empty before returning
            while True:
                if queue.empty() and not threads_working:
                    if not print_errors_thrown(th_errors):
                        break
                    else:
                        d_errors.extend(th_errors)
                        raise Exception()
                time.sleep(0.1)
        except KeyboardInterrupt:
            status = "error"
            msg = "\nCaught keyboard interrupt"
            print >> sys.stderr, msg
            d_errors.append(msg)
        except Exception as e:
            if e.message:
                print >> sys.stderr, e.message
            status = "error"
            d_errors.append(e.message)
        finally:
            if status != "error":
                status = "complete"
    else:  # single thread, sets not supported, or fetcher has no fetch_sets
        rv = fetch_all_for_set(None, fetcher, fetch_dir)
        stats["total_items"] += rv["total_items"]
        stats["total_collections"] += rv["total_collections"]
        error_msg += rv["errors"]

    print "Total items: %s" % stats["total_items"]
    print "Total collections: %s" % stats["total_collections"]


    # Update ingestion document
    try:
        # rmdir succeeds only if fetch_dir is empty, i.e. nothing was fetched
        os.rmdir(fetch_dir)
        status = "error"
        error_msg.append("Error, no records fetched")
        logger.error(error_msg)
    except OSError:
        # fetch_dir has content, so records were fetched
        status = "complete"
    kwargs = {
        "fetch_process/status": status,
        "fetch_process/error": error_msg,
        "fetch_process/end_time": iso_utc_with_tz(),
        "fetch_process/total_items": stats["total_items"],
        "fetch_process/total_collections": stats["total_collections"]
    }
    try:
        couch.update_ingestion_doc(ingestion_doc, **kwargs)
    except:
        logger.error("Error updating ingestion doc %s in %s" %
                     (ingestion_doc["_id"], __name__))
        return -1

    return 0 if status == "complete" else -1
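
# The try/except around os.rmdir above doubles as an "is the directory empty?"
# test: rmdir only succeeds on an empty directory, so success means nothing was
# fetched. An equivalent, more explicit sketch of that decision (fetch_status_for
# is an illustrative name, not part of the project):
import os

def fetch_status_for(fetch_dir):
    """Return ("error", msg) if nothing was fetched, else ("complete", None)."""
    if not os.listdir(fetch_dir):
        os.rmdir(fetch_dir)
        return "error", "Error, no records fetched"
    return "complete", None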
Example #48
def main(argv):
    parser = define_arguments()
    args = parser.parse_args(argv[1:])

    config = ConfigParser.ConfigParser()
    config.readfp(open("akara.ini"))
    batch_size = int(config.get("CouchDb", "BatchSize"))

    couch = Couch()
    ingestion_doc = couch.dashboard_db[args.ingestion_document_id]
    if getprop(ingestion_doc, "enrich_process/status") != "complete":
        print "Cannot save, enrich process did not complete"
        return -1

    # Update ingestion document
    kwargs = {
        "save_process/status": "running",
        "save_process/start_time": datetime.now().isoformat(),
        "save_process/end_time": None,
        "save_process/error": None,
        "save_process/total_saved": None
    }
    try:
        couch.update_ingestion_doc(ingestion_doc, **kwargs)
    except:
        print "Error updating ingestion document " + ingestion_doc["_id"]
        return -1

    # Back up provider data
    if args.backup:
        resp = couch._back_up_data(ingestion_doc)

        if resp == -1:
            # Fatal error, do not continue with save process
            kwargs = {
                "save_process/status": "error",
                "save_process/end_time": datetime.now().isoformat(),
                "save_process/error": "Error backing up DPLA records"
            }
            couch.update_ingestion_doc(ingestion_doc, **kwargs)
            return resp

    error_msg = None
    enrich_dir = getprop(ingestion_doc, "enrich_process/data_dir")
    total_items = 0
    total_collections = 0
    docs = {}
    for file in os.listdir(enrich_dir):
        filename = os.path.join(enrich_dir, file)
        with open(filename, "r") as f:
            try:
                file_docs = json.loads(f.read())
            except:
                error_msg = "Error loading " + filename
                break

        # Save when docs is about to exceed the batch size
        print >> sys.stderr, "Read file %s" % filename
        if docs and len(docs) + len(file_docs) > batch_size:
            resp, error_msg = couch.process_and_post_to_dpla(docs,
                                                             ingestion_doc)
            if resp == -1:
                docs = None
                break

            items = len([doc for doc in docs.values() if
                         doc.get("ingestType") == "item"])
            total_items += items
            total_collections += len(docs) - items
            print "Saved %s documents" % (total_items + total_collections)

            # Set docs for the next iteration
            docs = file_docs
        else:
            docs.update(file_docs)

    # Last save
    if docs:
        resp, error_msg = couch.process_and_post_to_dpla(docs,
                                                         ingestion_doc)
        if resp != -1:
            items = len([doc for doc in docs.values() if
                         doc.get("ingestType") == "item"])
            total_items += items
            total_collections += len(docs) - items
            print "Saved %s documents" % (total_items + total_collections)

    print "Total items: %s" % total_items
    print "Total collections: %s" % total_collections

    if error_msg:
        status = "error"
    else:
        status = "complete"
    kwargs = {
        "save_process/status": status,
        "save_process/error": error_msg,
        "save_process/end_time": datetime.now().isoformat(),
        "save_process/total_items": total_items,
        "save_process/total_collections": total_collections
    }
    try:
        couch.update_ingestion_doc(ingestion_doc, **kwargs)
    except:
        print "Error updating ingestion document " + ingestion_doc["_id"]
        return -1

    # Compress enrich dir, then delete
    make_tarfile(enrich_dir)
    shutil.rmtree(enrich_dir)

    return 0 if status == "complete" else -1
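
# make_tarfile is called above to archive the enrichment directory before it is
# deleted. A minimal sketch using the standard tarfile module, assuming a gzipped
# archive written alongside the directory; the project's helper may name or
# compress the archive differently.
import os
import tarfile

def make_tarfile(source_dir):
    """Write <source_dir>.tar.gz containing the directory's contents."""
    with tarfile.open(source_dir + ".tar.gz", "w:gz") as tar:
        tar.add(source_dir, arcname=os.path.basename(source_dir))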
def main():
    couch = Couch()
    couch._sync_views()
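
# Couch()._sync_views is not shown. Syncing CouchDB views generally means querying
# every view of every design document so the server (re)builds its indexes. A rough
# sketch with the couchdb-python package, assuming a local server and a database
# named "dpla"; the project's Couch class may organize this differently.
import couchdb

def sync_views_sketch(server_url="http://localhost:5984/", db_name="dpla"):
    db = couchdb.Server(server_url)[db_name]
    design_rows = db.view("_all_docs", startkey="_design/", endkey="_design0",
                          include_docs=True)
    for row in design_rows:
        ddoc_name = row.id.replace("_design/", "")
        for view_name in (row.doc.get("views") or {}):
            # Fetching a single row is enough to trigger an index build.
            list(db.view("%s/%s" % (ddoc_name, view_name), limit=1))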