def update_bulk_download_document(provider, file_path, file_size):
    c = Couch()
    bulk_download_doc_id = c.update_bulk_download_document(provider,
                                                           file_path,
                                                           file_size)
    print "Updated bulk_download database document with ID %s" % \
          bulk_download_doc_id
def main(argv):
    parser = define_arguments()
    args = parser.parse_args(argv[1:])

    config_file = "akara.ini"
    config = ConfigParser.ConfigParser()
    config.readfp(open(config_file))
    uri_base = "http://localhost:" + config.get("Akara", "Port")

    with open(args.profile_path, "r") as f:
        try:
            profile = json.load(f)
        except:
            print "Error, could not load profile in %s" % __name__
            return None
    provider = profile["name"]

    couch = Couch()
    latest_ingestion_doc = couch._get_last_ingestion_doc_for(provider)
    if latest_ingestion_doc and \
       getprop(latest_ingestion_doc, "delete_process/status") != "complete":
        error_msg = "Error, last ingestion did not complete. Review " + \
                    "dashboard document %s for errors." % \
                    latest_ingestion_doc["_id"]
        logger.error(error_msg)
        print error_msg
        return None

    ingestion_document_id = couch._create_ingestion_document(provider,
                                                             uri_base,
                                                             args.profile_path)
    logger.debug("Ingestion document %s created." % ingestion_document_id)

    return ingestion_document_id
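# define_arguments() is referenced throughout these scripts but is not part
# of this listing. A hypothetical sketch of what it might return for the
# script above (an assumption, not the repository's actual implementation):
import argparse

def define_arguments():
    """Build the command-line parser consumed by main() above."""
    parser = argparse.ArgumentParser(
        description="Create an ingestion document for a provider")
    parser.add_argument("profile_path",
                        help="Path to the provider's .pjs profile file")
    return parser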
def profile_names_for_contributor(contributor):
    """Return a list of profile names that use the given contributor name"""
    profiles = {}
    couch = Couch()
    view = "export_database/profile_and_source_names"
    for row in couch.dpla_view(view, group=True):
        k = row["key"]
        profiles.setdefault(k[0], []).append(k[1])
    return profiles.get(contributor, [])
def item_docs(provider_name=None):
    """Yield all item documents for the given provider, else all providers"""
    couch = Couch()
    if provider_name:
        docs = couch._query_all_dpla_provider_docs(provider_name)
    else:
        docs = couch.all_dpla_docs()
    for doc in docs:
        if doc.get("ingestType") == "item":
            yield doc
def main(argv):
    parser = define_arguments()
    args = parser.parse_args(argv[1:])
    couch = Couch()
    database_names = ["dpla", "dashboard", "bulk_download"]
    if args.database_name in database_names:
        couch.sync_views(args.database_name)
    else:
        print >> sys.stderr, "The database_name parameter should be " + \
                             "either %s" % " or ".join(database_names)
def main(argv):
    parser = define_arguments()
    args = parser.parse_args(argv[1:])
    couch = Couch()
    if args.database_name in ["dpla", "dashboard"]:
        couch._sync_views(args.database_name)
    else:
        print >> sys.stderr, "The database_name parameter should be " + \
                             "either \"dpla\" or \"dashboard\""
def main(argv):
    parser = define_arguments()
    args = parser.parse_args(argv[1:])
    couch = Couch(dpla_db_name=args.database_name,
                  dashboard_db_name='dashboard')
    database_names = ["dpla", "dashboard", "bulk_download", "ucldc"]
    if args.database_name in database_names:
        print "couch.sync_views(" + args.database_name + ") next!"
        couch.sync_views(args.database_name)
    else:
        print >> sys.stderr, "The database_name parameter should be " + \
                             "either \"dpla\", \"dashboard\", " + \
                             "\"bulk_download\", or \"ucldc\""
def main(argv):
    parser = define_arguments()
    args = parser.parse_args(argv[1:])
    couch = Couch()
    ingestion_doc = couch.dashboard_db[args.ingestion_document_id]
    if getprop(ingestion_doc, "delete_process/status") != "complete":
        print "Error, delete process did not complete"
        return -1

    # Update ingestion document
    kwargs = {
        "check_counts_process/status": "running",
        "check_counts_process/start_time": iso_utc_with_tz()
    }
    try:
        couch.update_ingestion_doc(ingestion_doc, **kwargs)
    except:
        print "Error updating ingestion document " + ingestion_doc["_id"]
        return -1

    # Check each count against the threshold
    alerts = []
    count_type = ("Added", "Changed", "Deleted")
    for ctype in count_type:
        count = int(ingestion_doc["count" + ctype])
        threshold = int(ingestion_doc["thresholds"][ctype.lower()])
        if count > threshold:
            alerts.append("%s items %s exceeds threshold of %s" %
                          (count, ctype.lower(), threshold))

    error_msg = None
    if alerts:
        config_file = "akara.ini"
        config = ConfigParser.ConfigParser()
        config.readfp(open(config_file))
        to = [s.strip() for s in config.get("Alert", "To").split(",")]
        frm = config.get("Alert", "From")

        month = dateparser.parse(ingestion_doc["ingestDate"]).strftime("%B")
        alerts = "\n".join(alerts)
        msg = MIMEText(alerts)
        msg["Subject"] = "Threshold(s) exceeded for %s ingestion of %s" % \
                         (month, ingestion_doc["provider"])
        msg["To"] = ", ".join(to)
        msg["From"] = frm

        try:
            s = smtplib.SMTP("localhost")
            s.sendmail(frm, to, msg.as_string())
            s.quit()
        except Exception, e:
            error_msg = e
def main(argv):
    couch = Couch()
    parser = define_arguments()
    args = parser.parse_args(argv[1:])
    with open(args.profile_path, "r") as f:
        profile = json.load(f)
    provider = profile.get("name")

    if confirm_deletion(provider):
        couch._delete_all_provider_documents(provider)
    else:
        return False
def download_each_source_data(arguments):
    """Gets a list of all sources in the Couch database and downloads each
    source's data.

    Arguments:
        arguments - dictionary returned by the validate_arguments function
    """
    couch = Couch()
    rows = couch.dpla_view("export_database/all_source_names", group=True)
    for row in rows:
        arguments["source"] = row["key"]
        print arguments["source"]
        download_data(arguments)
def get_enrich_dir(ingestion_document_id):
    couch = Couch()
    ingestion_doc = couch.dashboard_db[ingestion_document_id]
    if getprop(ingestion_doc, "enrich_process/status") != "complete":
        raise AssertionError(
            "Cannot save Avro files, enrich process did not complete")
    return getprop(ingestion_doc, "enrich_process/data_dir")
def main():
    couch = Couch()
    new_fields = {
        "fetch_process": {"status": "complete"},
        "enrich_process": {"status": "complete"},
        "save_process": {"status": "complete"},
        "delete_process": {"status": "complete"},
    }
    for profile in os.listdir("profiles/"):
        if profile.endswith(".pjs"):
            with open("profiles/" + profile, "r") as f:
                p = json.loads(f.read())
                provider = p["name"]
            for doc in couch._query_all_provider_ingestion_docs(provider):
                doc.update(new_fields)
                couch.dashboard_db.update([doc])
def main(argv):
    parser = define_arguments()
    args = parser.parse_args(argv[1:])
    couch = Couch()
    i_doc = couch.dashboard_db[args.ingestion_document_id]
    if i_doc['delete_process']['status'] != 'complete':
        print >> sys.stderr, 'Error: delete process did not complete'
        return 1

    # Update ingestion document to indicate that we're running
    kwargs = {
        'check_counts_process/status': 'running',
        'check_counts_process/start_time': iso_utc_with_tz()
    }
    try:
        couch.update_ingestion_doc(i_doc, **kwargs)
    except:
        tb = traceback.format_exc(5)
        print "Error updating ingestion document %s\n%s" % (i_doc["_id"], tb)
        return 1

    error_msg = None
    try:
        config = ConfigParser.ConfigParser()
        config.readfp(open('akara.ini'))
        to = [s.strip() for s in config.get('Alert', 'To').split(',')]
        frm = config.get('Alert', 'From')
        body = "%s\n\n%s" % (alerts(i_doc), statistics(i_doc))
        msg = MIMEText(body)
        msg['Subject'] = "%s ingest #%s" % (i_doc['provider'],
                                            i_doc['ingestionSequence'])
        msg['To'] = ', '.join(to)
        msg['From'] = frm
        s = smtplib.SMTP("localhost")
        s.sendmail(frm, to, msg.as_string())
        s.quit()
    except Exception, e:
        error_msg = e
        tb = traceback.format_exc(5)
        print >> sys.stderr, "Error sending alert email: %s\n%s" % \
                             (error_msg, tb)
def main(argv):
    parser = define_arguments()
    args = parser.parse_args(argv[1:])

    config_file = os.environ.get("DPLA_CONFIG_FILE", "akara.ini")
    config = ConfigParser.ConfigParser()
    config.readfp(open(config_file))
    uri_base = "http://localhost:" + config.get("Akara", "Port")

    with open(args.profile_path, "r") as f:
        try:
            profile = json.load(f)
        except:
            print "Error, could not load profile in %s" % __name__
            return None
    provider = profile["name"]
    thresholds = profile["thresholds"]
    fetcher_threads = profile.get("fetcher_threads") or 1

    couch = Couch()
    latest_ingestion_doc = couch._get_last_ingestion_doc_for(provider)
    if (latest_ingestion_doc and
            getprop(latest_ingestion_doc,
                    "dashboard_cleanup_process/status") != "complete"):
        error_msg = "Error, last ingestion did not complete. Review " + \
                    "dashboard document %s for errors." % \
                    latest_ingestion_doc["_id"]
        logger.error(error_msg)
        print error_msg
        return None

    ingestion_document_id = couch._create_ingestion_document(provider,
                                                             uri_base,
                                                             args.profile_path,
                                                             thresholds,
                                                             fetcher_threads)
    msg = "Ingestion document %s created." % ingestion_document_id
    logger.debug(msg)
    print msg

    return ingestion_document_id
def nara_update_links():
    couch = Couch()
    url = "http://research.archives.gov/description/"
    docs = []
    print >> sys.stderr, "Fetching all documents"
    count = 0
    start = time.time()
    for doc in couch._query_all_dpla_provider_docs("nara"):
        if count == 0:
            view_time = time.time() - start
            start = time.time()
        count += 1
        arc_id_desc = getprop(doc, "originalRecord/arc-id-desc",
                              keyErrorAsNone=True)
        if arc_id_desc:
            doc.update({"isShownAt": url + arc_id_desc})
            docs.append(doc)

        # POST every 1000 documents
        if len(docs) == 1000:
            print >> sys.stderr, "Processed %s documents" % count
            couch._bulk_post_to(couch.dpla_db, docs)
            docs = []

    # Last POST
    if docs:
        print >> sys.stderr, "Processed %s documents" % count
        couch._bulk_post_to(couch.dpla_db, docs)

    process_time = time.time() - start
    print >> sys.stderr, "Done"
    print >> sys.stderr, "View time: %s" % view_time
    print >> sys.stderr, "Process time: %s" % process_time
def main(argv):
    parser = define_arguments()
    args = parser.parse_args(argv[1:])
    couch = Couch()
    i_doc = couch.dashboard_db[args.ingestion_document_id]
    if i_doc['delete_process']['status'] != 'complete':
        print >> sys.stderr, 'Error: delete process did not complete'
        return 1

    # Update ingestion document to indicate that we're running
    kwargs = {'check_counts_process/status': 'running',
              'check_counts_process/start_time': iso_utc_with_tz()}
    try:
        couch.update_ingestion_doc(i_doc, **kwargs)
    except:
        tb = traceback.format_exc(5)
        print "Error updating ingestion document %s\n%s" % (i_doc["_id"], tb)
        return 1

    error_msg = None
    try:
        config = ConfigParser.ConfigParser()
        config.readfp(open('akara.ini'))
        to = [s.strip() for s in config.get('Alert', 'To').split(',')]
        frm = config.get('Alert', 'From')
        body = "%s\n\n%s" % (alerts(i_doc), statistics(i_doc))
        msg = MIMEText(body)
        msg['Subject'] = "%s ingest #%s" % (i_doc['provider'],
                                            i_doc['ingestionSequence'])
        msg['To'] = ', '.join(to)
        msg['From'] = frm
        s = smtplib.SMTP("localhost")
        s.sendmail(frm, to, msg.as_string())
        s.quit()
    except Exception, e:
        error_msg = e
        tb = traceback.format_exc(5)
        print >> sys.stderr, "Error sending alert email: %s\n%s" % \
                             (error_msg, tb)
def main(argv):
    print "WARNING: Bulk data is now exported/maintained using elasticdump."
    print "See https://github.com/dpla/automation/blob/develop/ansible/roles/exporter/files/export-provider.sh"

    parser = define_arguments()
    args = parser.parse_args(argv[1:])
    couch = Couch()
    ingestion_doc = couch.dashboard_db[args.ingestion_document_id]
    if getprop(ingestion_doc,
               "dashboard_cleanup_process/status") != "complete":
        print "Error, dashboard cleanup process did not complete"
        return -1

    # Update ingestion document
    kwargs = {
        "upload_bulk_data_process/status": "running",
        "upload_bulk_data_process/start_time": iso_utc_with_tz(),
        "upload_bulk_data_process/end_time": None,
        "upload_bulk_data_process/error": None,
    }
    try:
        couch.update_ingestion_doc(ingestion_doc, **kwargs)
    except:
        print "Error updating ingestion document " + ingestion_doc["_id"]
        return -1

    # TODO: as in the fetch_records.py script, we need profile in this scope
    #       and the file shouldn't have to be opened again
    with open(ingestion_doc["profile_path"], "r") as profile:
        contributor = getprop(json.load(profile), "contributor/name")

    resp = export_database.main([None, "source", contributor, "upload"])
    if resp == -1:
        status = "error"
        error_msg = "Error uploading bulk data"
    else:
        status = "complete"
        error_msg = None

    # Update ingestion document
    kwargs = {
        "upload_bulk_data_process/status": status,
        "upload_bulk_data_process/error": error_msg,
        "upload_bulk_data_process/end_time": iso_utc_with_tz()
    }
    try:
        couch.update_ingestion_doc(ingestion_doc, **kwargs)
    except:
        print "Error updating ingestion document " + ingestion_doc["_id"]
        return -1

    return 0 if status == "complete" else -1
def main(argv):
    parser = define_arguments()
    args = parser.parse_args(argv[1:])
    couch = Couch()
    ingestion_doc = couch.dashboard_db[args.ingestion_document_id]
    if getprop(ingestion_doc, "save_process/status") != "complete":
        print "Error, save process did not complete"
        return -1

    # Update ingestion document
    kwargs = {
        "delete_process/status": "running",
        "delete_process/start_time": iso_utc_with_tz()
    }
    try:
        couch.update_ingestion_doc(ingestion_doc, **kwargs)
    except:
        print "Error updating ingestion document " + ingestion_doc["_id"]
        return -1

    resp, total_deleted = couch.process_deleted_docs(ingestion_doc)
    if resp == -1:
        status = "error"
        error_msg = "Error deleting documents; only %s deleted" % \
                    total_deleted
    else:
        status = "complete"
        error_msg = None
    msg = "Total documents deleted: %s" % total_deleted
    print msg
    logger.info(msg)

    # Update ingestion document
    kwargs = {
        "delete_process/status": status,
        "delete_process/error": error_msg,
        "delete_process/end_time": iso_utc_with_tz()
    }
    try:
        couch.update_ingestion_doc(ingestion_doc, **kwargs)
    except:
        print "Error updating ingestion document " + ingestion_doc["_id"]
        return -1

    return 0 if status == "complete" else -1
def main(argv):
    parser = define_arguments()
    args = parser.parse_args(argv[1:])
    couch = Couch()
    ingestion_doc = couch.dashboard_db[args.ingestion_document_id]
    if getprop(ingestion_doc, "save_process/status") != "complete":
        print "Error, save process did not complete"
        return -1

    # Update ingestion document
    kwargs = {
        "delete_process/status": "running",
        "delete_process/start_time": datetime.now().isoformat()
    }
    try:
        couch.update_ingestion_doc(ingestion_doc, **kwargs)
    except:
        print "Error updating ingestion document " + ingestion_doc["_id"]
        return -1

    resp, total_deleted = couch.process_deleted_docs(ingestion_doc)
    if resp == -1:
        status = "error"
        error_msg = "Error deleting documents; only %s deleted" % \
                    total_deleted
    else:
        status = "complete"
        error_msg = None
    msg = "Total documents deleted: %s" % total_deleted
    print msg
    logger.info(msg)

    # Update ingestion document
    kwargs = {
        "delete_process/status": status,
        "delete_process/error": error_msg,
        "delete_process/end_time": datetime.now().isoformat()
    }
    try:
        couch.update_ingestion_doc(ingestion_doc, **kwargs)
    except:
        print "Error updating ingestion document " + ingestion_doc["_id"]
        return -1

    return 0 if status == "complete" else -1
def main(argv):
    parser = define_arguments()
    args = parser.parse_args(argv[1:])
    couch = Couch()
    ingestion_doc = couch.dashboard_db[args.ingestion_document_id]
    if getprop(ingestion_doc, "check_counts_process/status") != "complete":
        print "Error, check counts process did not complete"
        return -1

    # Update ingestion document
    kwargs = {
        "dashboard_cleanup_process/status": "running",
        "dashboard_cleanup_process/start_time": iso_utc_with_tz(),
        "dashboard_cleanup_process/end_time": None,
        "dashboard_cleanup_process/error": None
    }
    try:
        couch.update_ingestion_doc(ingestion_doc, **kwargs)
    except:
        print "Error updating ingestion document " + ingestion_doc["_id"]
        return -1

    resp, total_deleted = couch.dashboard_cleanup(ingestion_doc)
    if resp == -1:
        status = "error"
        error_msg = "Error deleting documents; only %s deleted" % \
                    total_deleted
    else:
        status = "complete"
        error_msg = None
    print "Total dashboard documents deleted: %s" % total_deleted

    # Update ingestion document
    kwargs = {
        "dashboard_cleanup_process/status": status,
        "dashboard_cleanup_process/error": error_msg,
        "dashboard_cleanup_process/end_time": iso_utc_with_tz()
    }
    try:
        couch.update_ingestion_doc(ingestion_doc, **kwargs)
    except:
        print "Error updating ingestion document " + ingestion_doc["_id"]
        return -1

    return 0 if status == "complete" else -1
sitemap_path = CONFIG.get("Sitemap", "SitemapPath")

# Compress previous directory
for item in os.listdir(sitemap_path):
    item_path = os.path.join(sitemap_path, item)
    if os.path.isdir(item_path):
        with tarfile.open(item_path + ".tar.gz", "w:gz") as tar:
            tar.add(item_path, arcname=os.path.basename(item_path))
        shutil.rmtree(item_path)

# Create new directory
new_dir = os.path.join(sitemap_path, date.today().strftime("%Y%m%d"))
os.mkdir(new_dir)

# Fetch all item URLs
c = Couch()
urls = []
limit = 50000
count = 1
for doc in c._query_all_docs(c.dpla_db):
    if doc.get("ingestType") == "item":
        # Handle older ingestDates, which do not have timezone info.
        lm_dt = dateutil_parse(doc["ingestDate"])
        if lm_dt.utcoffset() is not None:
            lm = lm_dt.isoformat()
        else:
            lm = lm_dt.isoformat() + "Z"
        urls.append({"loc": "http://dp.la/item/" + doc["id"],
                     "lastmod": lm})
        if len(urls) == limit:
            create_sitemap_files(new_dir, urls, count)
            count += 1
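# create_sitemap_files() is called above but not defined in this listing.
# A hypothetical sketch of writing one sitemap file per batch of URLs
# (the file name and XML layout are assumptions, not the project's actual
# implementation):
def create_sitemap_files(directory, urls, count):
    path = os.path.join(directory, "all_item_urls_%s.xml" % count)
    with open(path, "w") as f:
        f.write('<?xml version="1.0" encoding="UTF-8"?>\n')
        f.write('<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">\n')
        for url in urls:
            f.write("  <url><loc>%s</loc><lastmod>%s</lastmod></url>\n"
                    % (url["loc"], url["lastmod"]))
        f.write("</urlset>\n")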
def main(argv=None, couch=None, provider_name=None):
    # For testing, couch and provider_name will be provided as params
    if couch:
        provider_name = provider_name
    else:
        couch = Couch()
        parser = define_arguments()
        args = parser.parse_args(argv[1:])
        provider_name = args.provider_name

    provider_legacy_docs = couch._query_all_dpla_provider_docs(provider_name)
    ingest_docs = couch._query_all_provider_ingestion_docs(provider_name)

    # Proceed only if there are no ingestion documents for the provider but
    # there are provider_legacy_docs.
    proceed = True
    if len(ingest_docs) > 0:
        num = len(ingest_docs)
        print >> sys.stderr, "Error: %s ingestion document(s) exists" % num
        proceed = False
    try:
        next_item = next(couch._query_all_dpla_provider_docs(provider_name))
    except:
        print >> sys.stderr, "Error: No documents found for %s" % \
                             provider_name
        proceed = False

    def _post(dpla_docs, dashboard_docs, ingest_doc):
        couch._bulk_post_to(couch.dpla_db, dpla_docs)
        couch._bulk_post_to(couch.dashboard_db, dashboard_docs)
        couch._update_ingestion_doc_counts(ingest_doc,
                                           countAdded=len(dashboard_docs))

    if proceed:
        ingest_doc_id = couch.create_ingestion_doc_and_backup_db(provider_name)
        ingest_doc = couch.dashboard_db[ingest_doc_id]
        docs = []
        added_docs = []
        print >> sys.stderr, "Fetching all docs..."
        count = 0
        for doc in provider_legacy_docs:
            count += 1
            doc["ingestionSequence"] = 1
            docs.append(doc)
            added_docs.append({"id": doc["_id"],
                               "type": "record",
                               "status": "added",
                               "provider": provider_name,
                               "ingestionSequence": 1})
            # POST every 1000
            if len(docs) == 1000:
                print >> sys.stderr, "Processed %s docs" % count
                _post(docs, added_docs, ingest_doc)
                # Reset
                docs = []
                added_docs = []

        # Last POST
        if docs:
            print >> sys.stderr, "Processed %s docs" % count
            _post(docs, added_docs, ingest_doc)

    print >> sys.stderr, "Complete"
def print_all_sources(arguments):
    """Print all source names"""
    couch = Couch()
    rows = couch.dpla_view("export_database/all_source_names", group=True)
    for row in rows:
        print "%(key)s (count: %(value)d)" % dict(row)
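# The "export_database/all_source_names" view queried above lives in a
# CouchDB design document that is not part of this listing. A hypothetical
# sketch of such a map/reduce pair (the emitted field is an assumption, not
# the actual design document); with group=True, the reduce returns one row
# per source name with its count, which is what print_all_sources() formats:
all_source_names_view = {
    "map": """
        function(doc) {
            if (doc.ingestType == 'item' && doc.dataProvider) {
                emit(doc.dataProvider, 1);
            }
        }
    """,
    "reduce": "_count"
}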
def main(argv):
    parser = define_arguments()
    args = parser.parse_args(argv[1:])
    couch = Couch()
    ingestion_doc = couch.dashboard_db[args.ingestion_document_id]

    # Update ingestion document
    fetch_dir = create_fetch_dir(ingestion_doc["provider"])
    kwargs = {
        "fetch_process/status": "running",
        "fetch_process/data_dir": fetch_dir,
        "fetch_process/start_time": datetime.now().isoformat()
    }
    try:
        couch.update_ingestion_doc(ingestion_doc, **kwargs)
    except:
        logger.error("Error updating ingestion doc %s in %s" %
                     (ingestion_doc["_id"], __name__))
        return -1

    error_msg = []
    fetcher = create_fetcher(ingestion_doc["profile_path"],
                             ingestion_doc["uri_base"])

    print "Fetching records for " + fetcher.provider
    total_fetched_records = 0
    for response in fetcher.fetch_all_data():
        if response["error"]:
            error_msg.extend(iterify(response["error"]))
            print response["error"]
        else:
            # Write records to file
            filename = os.path.join(fetch_dir, str(uuid.uuid4()))
            with open(filename, "w") as f:
                f.write(json.dumps(response["data"]))
            print "Records written to " + filename
            total_fetched_records += len(getprop(response, "data/records"))

    logger.info("Total records fetched: %s" % total_fetched_records)

    # Update ingestion document
    try:
        os.rmdir(fetch_dir)  # Error if fetch_dir was empty
        status = "error"
        error_msg.append("Error, no records fetched")
        logger.error(error_msg)
    except:
        status = "complete"
    kwargs = {
        "fetch_process/status": status,
        "fetch_process/error": error_msg,
        "fetch_process/end_time": datetime.now().isoformat()
    }
    try:
        couch.update_ingestion_doc(ingestion_doc, **kwargs)
    except:
        logger.error("Error updating ingestion doc %s in %s" %
                     (ingestion_doc["_id"], __name__))
        return -1

    return 0 if status == "complete" else -1
def main():
    couch = Couch()
    couch._sync_views()
with open(args.profile_path, "r") as f:
    try:
        profile = json.load(f)
    except Exception, e:
        print "Error reading profile: %s" % e
        return False

if args.pipeline in profile:
    pipeline = ",".join(profile[args.pipeline])
else:
    pipeline = args.pipeline
provider = profile.get(u"name")
contributor = profile.get(u"contributor", {})

# Create ingestion document
couch = Couch()
ingestion_doc_id = couch.create_ingestion_doc_and_backup_db(provider)

# Fetch provider documents
docs = []
count = 0
for doc in couch._query_all_dpla_provider_docs(provider):
    docs.append(doc)
    count += 1
    # Enrich in batches of 1000
    if len(docs) == 1000:
        enriched_docs = enrich(docs, args.uri_base, pipeline)
        couch.process_and_post_to_dpla(enriched_docs, ingestion_doc_id)
        print "Enriched %s documents" % count
        docs = []

# Enrich last batch
def main(argv):
    parser = define_arguments()
    args = parser.parse_args(argv[1:])
    couch = Couch()
    ingestion_doc = couch.dashboard_db[args.ingestion_document_id]
    if getprop(ingestion_doc, "fetch_process/status") != "complete":
        print "Cannot enrich, fetch process did not complete"
        return -1

    # Update ingestion document
    status = "running"
    enrich_dir = create_enrich_dir(ingestion_doc["provider"])
    kwargs = {
        "enrich_process/status": status,
        "enrich_process/data_dir": enrich_dir,
        "enrich_process/start_time": datetime.now().isoformat(),
        "enrich_process/end_time": None,
        "enrich_process/error": None,
        "enrich_process/total_items": None,
        "enrich_process/total_collections": None,
        "enrich_process/missing_id": None,
        "enrich_process/missing_source_resource": None,
    }
    try:
        couch.update_ingestion_doc(ingestion_doc, **kwargs)
    except:
        print "Error updating ingestion document " + ingestion_doc["_id"]
        return -1

    # Set the headers sent with the enrich request
    with open(ingestion_doc["profile_path"], "r") as f:
        profile = json.loads(f.read())
    headers = {
        "Source": ingestion_doc["provider"],
        "Content-Type": "application/json",
        "Pipeline-Item": ",".join(profile["enrichments_item"]),
        "Pipeline-Coll": ",".join(profile["enrichments_coll"]),
    }

    errors = []
    fetch_dir = getprop(ingestion_doc, "fetch_process/data_dir")

    # Counts for logger info
    enriched_items = 0
    enriched_colls = 0
    missing_id = 0
    missing_source_resource = 0

    file_count = 0
    files = os.listdir(fetch_dir)
    for filename in files:
        file_count += 1
        filepath = os.path.join(fetch_dir, filename)
        with open(filepath, "r") as f:
            try:
                data = json.loads(f.read())
            except:
                errors.append("Error loading " + filepath)
                break

        # Enrich
        print "Enriching file %s (%s of %s)" % (filepath, file_count,
                                                len(files))
        enrich_path = ingestion_doc["uri_base"] + "/enrich"
        resp, content = H.request(enrich_path, "POST", body=json.dumps(data),
                                  headers=headers)
        if not resp["status"].startswith("2"):
            errors.append("Error (status %s) enriching data from %s" %
                          (resp["status"], filepath))
            print "Stopped enrichment process: %s" % errors
            status = "error"
            break

        data = json.loads(content)
        enriched_records = data["enriched_records"]

        # Update counts
        enriched_items += data["enriched_item_count"]
        enriched_colls += data["enriched_coll_count"]
        missing_id += data["missing_id_count"]
        missing_source_resource += data["missing_source_resource_count"]
        errors.extend(data["errors"])

        # Write enriched data to file
        with open(os.path.join(enrich_dir, filename), "w") as f:
            f.write(json.dumps(enriched_records))

    print "Enriched items: %s" % enriched_items
    print "Enriched collections: %s" % enriched_colls
    print "Missing ID: %s" % missing_id
    print "Missing sourceResource: %s" % missing_source_resource

    # Update ingestion document
    if not status == "error":
        status = "complete"
    kwargs = {
        "enrich_process/status": status,
        "enrich_process/error": errors,
        "enrich_process/end_time": datetime.now().isoformat(),
        "enrich_process/total_items": enriched_items,
        "enrich_process/total_collections": enriched_colls,
        "enrich_process/missing_id": missing_id,
        "enrich_process/missing_source_resource": missing_source_resource,
    }
    try:
        couch.update_ingestion_doc(ingestion_doc, **kwargs)
    except:
        print "Error updating ingestion document " + ingestion_doc["_id"]
        return -1

    # Compress fetch directory, then delete
    # make_tarfile(fetch_dir)
    # shutil.rmtree(fetch_dir)

    return 0 if status == "complete" else -1
with open(args.profile_path, "r") as f:
    try:
        profile = json.load(f)
    except Exception, e:
        print "Error reading profile: %s" % e
        return False

if args.pipeline in profile:
    pipeline = ",".join(profile[args.pipeline])
else:
    pipeline = args.pipeline
provider = profile.get(u"name")
contributor = profile.get(u"contributor", {})

# Create ingestion document
couch = Couch()
ingestion_doc_id = create_ingestion_document.main([None, args.profile_path])
ingestion_doc = couch.dashboard_db[ingestion_doc_id]

# Update ingestion document
kwargs = {
    "poll_storage_process/status": "running",
    "poll_storage_process/start_time": iso_utc_with_tz(),
    "poll_storage_process/end_time": None,
}
try:
    couch.update_ingestion_doc(ingestion_doc, **kwargs)
except:
    print "Error updating ingestion document " + ingestion_doc["_id"]
    return
def main(argv):
    global threads_working
    parser = define_arguments()
    args = parser.parse_args(argv[1:])
    couch = Couch()
    ingestion_doc = couch.dashboard_db[args.ingestion_document_id]
    fetch_dir = getprop(ingestion_doc, "fetch_process/data_dir")
    enrich_dir = create_enrich_dir(ingestion_doc["provider"])
    if getprop(ingestion_doc, "fetch_process/status") != "complete":
        print >> sys.stderr, "Cannot enrich, fetch process did not complete"
        return 1

    # Update ingestion document
    status = "running"
    kwargs = {
        "enrich_process/status": status,
        "enrich_process/data_dir": enrich_dir,
        "enrich_process/start_time": iso_utc_with_tz(),
        "enrich_process/end_time": None,
        "enrich_process/error": None,
        "enrich_process/total_items": None,
        "enrich_process/total_collections": None,
        "enrich_process/missing_id": None,
        "enrich_process/missing_source_resource": None
    }
    try:
        couch.update_ingestion_doc(ingestion_doc, **kwargs)
    except:
        print >> sys.stderr, "Error updating ingestion document " + \
                             ingestion_doc["_id"]
        return 1

    with open(ingestion_doc["profile_path"], "r") as f:
        profile = json.loads(f.read())

    # Counts for logger info
    stats = {'enriched_items': 0,
             'enriched_colls': 0,
             'missing_id': 0,
             'missing_source_resource': 0
             }
    # Initialize queue and threads
    queue, thread_errors = queue_and_errors(ingestion_doc=ingestion_doc,
                                            profile=profile,
                                            stats=stats,
                                            enrich_dir=enrich_dir)
    # Initialize list of input files
    listing = os.listdir(fetch_dir)
    # Initialize counters and statistics
    dashboard_errors = []
    file_count = 0
    status = None
    total_files = len(listing)
    files = iter(listing)

    try:
        # Keep the queue full of filenames
        while True:
            time.sleep(0.25)
            try:
                if print_errors_thrown(thread_errors):
                    dashboard_errors.extend(thread_errors)
                    raise Exception()
                if not queue.full():
                    basename = files.next()
                    filename = os.path.join(fetch_dir, basename)
                    file_count += 1
                    print "Enqueuing: %s (%s of %s)" % \
                          (filename, file_count, total_files)
                    queue.put(filename)
            except StopIteration:
                break
        # Wait for queue to be empty before returning
        while True:
            if queue.empty() and not threads_working:
                if not print_errors_thrown(thread_errors):
                    break
                else:
                    dashboard_errors.extend(thread_errors)
                    raise Exception()
            time.sleep(0.25)
    except KeyboardInterrupt:
        status = "error"
        msg = "\nCaught keyboard interrupt"
        print >> sys.stderr, msg
        dashboard_errors.append(msg)
    except Exception as e:
        if e.message:
            print >> sys.stderr, e.message
        status = "error"
        dashboard_errors.append(e.message)
    finally:
        print "Enriched items: %s" % stats['enriched_items']
        print "Enriched collections: %s" % stats['enriched_colls']
        print "Missing ID: %s" % stats['missing_id']
        print "Missing sourceResource: %s" % stats['missing_source_resource']
        if not status == "error":
            status = "complete"

    # Prepare fields for ingestion document update
    couch_kwargs = {
        "enrich_process/status": status,
        "enrich_process/error": dashboard_errors,
        "enrich_process/end_time": iso_utc_with_tz(),
        "enrich_process/total_items": stats['enriched_items'],
        "enrich_process/total_collections": stats['enriched_colls'],
        "enrich_process/missing_id": stats['missing_id'],
        "enrich_process/missing_source_resource":
            stats['missing_source_resource']
    }
    try:
        # Update ingestion document
        couch.update_ingestion_doc(ingestion_doc, **couch_kwargs)
    except:
        print >> sys.stderr, "Error updating ingestion document " + \
                             ingestion_doc["_id"]
        return 1

    return 0 if status == "complete" else 1
def main(argv):
    parser = define_arguments()
    args = parser.parse_args(argv[1:])
    couch = Couch()
    ingestion_doc = couch.dashboard_db[args.ingestion_document_id]
    if getprop(ingestion_doc, "fetch_process/status") != "complete":
        print "Cannot enrich, fetch process did not complete"
        return -1

    # Update ingestion document
    enrich_dir = create_enrich_dir(ingestion_doc["provider"])
    kwargs = {
        "enrich_process/status": "running",
        "enrich_process/data_dir": enrich_dir,
        "enrich_process/start_time": datetime.now().isoformat()
    }
    try:
        couch.update_ingestion_doc(ingestion_doc, **kwargs)
    except:
        print "Error updating ingestion document " + ingestion_doc["_id"]
        return -1

    # Set the headers sent with the enrich request
    with open(ingestion_doc["profile_path"], "r") as f:
        profile = json.loads(f.read())
    headers = {
        "Source": ingestion_doc["provider"],
        "Collection": "",
        "Content-Type": "application/json",
        "Pipeline-Rec": ",".join(profile["enrichments_rec"]),
        "Pipeline-Coll": ",".join(profile["enrichments_coll"])
    }

    error_msg = None
    fetch_dir = getprop(ingestion_doc, "fetch_process/data_dir")
    total_enriched_records = 0
    for filename in os.listdir(fetch_dir):
        filepath = os.path.join(fetch_dir, filename)
        with open(filepath, "r") as f:
            try:
                data = json.loads(f.read())
            except:
                error_msg = "Error loading " + filepath
                break

        # Enrich
        print "Enriching file " + filepath
        enrich_path = ingestion_doc["uri_base"] + "/enrich"
        resp, content = H.request(enrich_path, "POST", body=json.dumps(data),
                                  headers=headers)
        if not resp["status"].startswith("2"):
            error_msg = "Error (status %s) enriching data from %s" % \
                        (resp["status"], filepath)
            print "Stopped enrichment process: " + error_msg
            break

        data = json.loads(content)
        enriched_records = data["enriched_records"]
        total_enriched_records += data["enriched_records_count"]

        # Write enriched data to file
        with open(os.path.join(enrich_dir, filename), "w") as f:
            f.write(json.dumps(enriched_records))

    logger.info("Total records enriched: %s" % total_enriched_records)

    # Update ingestion document
    if error_msg is not None:
        status = "error"
    else:
        status = "complete"
    kwargs = {
        "enrich_process/status": status,
        "enrich_process/error": error_msg,
        "enrich_process/end_time": datetime.now().isoformat()
    }
    try:
        couch.update_ingestion_doc(ingestion_doc, **kwargs)
    except:
        print "Error updating ingestion document " + ingestion_doc["_id"]
        return -1

    # Compress fetch directory, then delete
    make_tarfile(fetch_dir)
    shutil.rmtree(fetch_dir)

    return 0 if status == "complete" else -1
def main(argv):
    parser = define_arguments()
    args = parser.parse_args(argv[1:])
    couch = Couch()
    ingestion_doc = couch.dashboard_db[args.ingestion_document_id]

    # Update ingestion document
    fetch_dir = create_fetch_dir(ingestion_doc["provider"])
    kwargs = {
        "fetch_process/status": "running",
        "fetch_process/data_dir": fetch_dir,
        "fetch_process/start_time": datetime.now().isoformat(),
        "fetch_process/end_time": None,
        "fetch_process/error": None,
        "fetch_process/total_items": None,
        "fetch_process/total_collections": None,
    }
    try:
        couch.update_ingestion_doc(ingestion_doc, **kwargs)
    except:
        logger.error("Error updating ingestion doc %s in %s" %
                     (ingestion_doc["_id"], __name__))
        return -1

    error_msg = []
    config_file = "akara.ini"
    fetcher = create_fetcher(ingestion_doc["profile_path"],
                             ingestion_doc["uri_base"],
                             config_file)

    print "Fetching records for " + fetcher.provider
    total_items = 0
    total_collections = 0
    for response in fetcher.fetch_all_data():
        if response["errors"]:
            error_msg.extend(iterify(response["errors"]))
            print response["errors"]
        if response["records"]:
            # Write records to file
            filename = os.path.join(fetch_dir, str(uuid.uuid4()))
            with open(filename, "w") as f:
                f.write(json.dumps(response["records"]))
            items = len([record for record in response["records"]
                         if not record.get("ingestType") == "collection"])
            total_items += items
            total_collections += len(response["records"]) - items

    print "Total items: %s" % total_items
    print "Total collections: %s" % total_collections

    # Update ingestion document
    try:
        os.rmdir(fetch_dir)  # Error if fetch_dir was empty
        status = "error"
        error_msg.append("Error, no records fetched")
        logger.error(error_msg)
    except:
        status = "complete"
    kwargs = {
        "fetch_process/status": status,
        "fetch_process/error": error_msg,
        "fetch_process/end_time": datetime.now().isoformat(),
        "fetch_process/total_items": total_items,
        "fetch_process/total_collections": total_collections,
    }
    try:
        couch.update_ingestion_doc(ingestion_doc, **kwargs)
    except:
        logger.error("Error updating ingestion doc %s in %s" %
                     (ingestion_doc["_id"], __name__))
        return -1

    return 0 if status == "complete" else -1
def main(argv=None, couch=None, provider_name=None):
    # For testing, couch and provider_name will be provided as params
    if couch:
        provider_name = provider_name
    else:
        couch = Couch()
        parser = define_arguments()
        args = parser.parse_args(argv[1:])
        provider_name = args.provider_name

    provider_legacy_docs = couch._query_all_dpla_provider_docs(provider_name)
    ingest_docs = couch._query_all_provider_ingestion_docs(provider_name)

    # Proceed only if there are no ingestion documents for the provider but
    # there are provider_legacy_docs.
    proceed = True
    if len(ingest_docs) > 0:
        num = len(ingest_docs)
        print >> sys.stderr, "Error: %s ingestion document(s) exists" % num
        proceed = False
    try:
        next_item = next(couch._query_all_dpla_provider_docs(provider_name))
    except:
        print >> sys.stderr, "Error: No documents found for %s" % \
                             provider_name
        proceed = False

    def _post(dpla_docs, dashboard_docs, ingest_doc_id):
        couch._bulk_post_to(couch.dpla_db, dpla_docs)
        couch._bulk_post_to(couch.dashboard_db, dashboard_docs)
        couch._update_ingestion_doc_counts(ingest_doc_id,
                                           countAdded=len(dashboard_docs))

    if proceed:
        ingest_doc_id = couch.create_ingestion_doc_and_backup_db(provider_name)
        docs = []
        added_docs = []
        print >> sys.stderr, "Fetching all docs..."
        count = 0
        for doc in provider_legacy_docs:
            count += 1
            doc["ingestionSequence"] = 1
            docs.append(doc)
            added_docs.append({
                "id": doc["_id"],
                "type": "record",
                "status": "added",
                "provider": provider_name,
                "ingestionSequence": 1
            })
            # POST every 1000
            if len(docs) == 1000:
                print >> sys.stderr, "Processed %s docs" % count
                _post(docs, added_docs, ingest_doc_id)
                # Reset
                docs = []
                added_docs = []

        # Last POST
        if docs:
            print >> sys.stderr, "Processed %s docs" % count
            _post(docs, added_docs, ingest_doc_id)

    print >> sys.stderr, "Complete"
with open(args.profile_path, "r") as f:
    try:
        profile = json.load(f)
    except Exception, e:
        print "Error reading profile: %s" % e
        return False

if args.pipeline in profile:
    pipeline = ",".join(profile[args.pipeline])
else:
    pipeline = args.pipeline
provider = profile.get(u"name")
contributor = profile.get(u"contributor", {})

# Create ingestion document
couch = Couch()
ingestion_doc_id = create_ingestion_document.main([None, args.profile_path])
ingestion_doc = couch.dashboard_db[ingestion_doc_id]
ingestion_doc["poll_storage_process"] = {"status": "running"}
couch.dashboard_db.update([ingestion_doc])

# Fetch provider documents
docs = []
count = 0
for doc in couch._query_all_dpla_provider_docs(provider):
    docs.append(doc)
    count += 1
    # Enrich in batches of 1000
    if len(docs) == 1000:
        enriched_docs = enrich(docs, args.uri_base, pipeline)
def main(argv):
    parser = define_arguments()
    args = parser.parse_args(argv[1:])
    couch = Couch()
    ingestion_doc = couch.dashboard_db[args.ingestion_document_id]

    # TODO: profile should be passed to create_fetcher, below.
    #       We need profile in this scope, and the file shouldn't have to
    #       be opened again by create_fetcher.
    with open(ingestion_doc["profile_path"], "r") as f:
        profile = json.load(f)

    # Update ingestion document
    fetch_dir = create_fetch_dir(ingestion_doc["provider"])
    kwargs = {
        "fetch_process/status": "running",
        "fetch_process/data_dir": fetch_dir,
        "fetch_process/start_time": iso_utc_with_tz(),
        "fetch_process/end_time": None,
        "fetch_process/error": None,
        "fetch_process/total_items": None,
        "fetch_process/total_collections": None
    }
    try:
        couch.update_ingestion_doc(ingestion_doc, **kwargs)
    except:
        logger.error("Error updating ingestion doc %s in %s" %
                     (ingestion_doc["_id"], __name__))
        return -1

    error_msg = []
    config_file = os.environ.get("DPLA_CONFIG_FILE", "akara.ini")
    fetcher = create_fetcher(ingestion_doc["profile_path"],
                             ingestion_doc["uri_base"],
                             config_file)

    print "Fetching records for %s" % ingestion_doc["provider"]
    stats = {
        "total_items": 0,
        "total_collections": 0
    }
    try:
        threads = int(profile.get("fetcher_threads")) or 1
        print "Threads: %d" % threads
    except:
        print >> sys.stderr, "Can not determine fetcher threads, so using 1"
        threads = 1
    sets = None
    sets_supported = (profile.get("sets") != "NotSupported")
    if threads > 1 and sets_supported and hasattr(fetcher, "fetch_sets"):
        sets_err, s = fetcher.fetch_sets()
        if s:
            sets = iter(s)
        else:
            print >> sys.stderr, "Could not get sets: ", sets_err
            return -1
        queue, th_errors, d_errors = queue_and_errors(threads,
                                                      ingestion_doc,
                                                      config_file,
                                                      fetch_dir,
                                                      stats)
        status = None
        try:
            while True:
                time.sleep(0.1)
                try:
                    if print_errors_thrown(th_errors):
                        d_errors.extend(th_errors)
                        raise Exception()
                    if not queue.full():
                        queue.put(sets.next())
                except StopIteration:
                    break
            # Wait for queue to be empty before returning
            while True:
                if queue.empty() and not threads_working:
                    if not print_errors_thrown(th_errors):
                        break
                    else:
                        d_errors.extend(th_errors)
                        raise Exception()
                time.sleep(0.1)
        except KeyboardInterrupt:
            status = "error"
            msg = "\nCaught keyboard interrupt"
            print >> sys.stderr, msg
            d_errors.append(msg)
        except Exception as e:
            if e.message:
                print >> sys.stderr, e.message
            status = "error"
            d_errors.append(e.message)
        finally:
            if not status == "error":
                status = "complete"
    else:  # not threads
        rv = fetch_all_for_set(None, fetcher, fetch_dir)
        stats["total_items"] += rv["total_items"]
        stats["total_collections"] += rv["total_collections"]
        error_msg += rv["errors"]

    print "Total items: %s" % stats["total_items"]
    print "Total collections: %s" % stats["total_collections"]

    # Update ingestion document
    try:
        os.rmdir(fetch_dir)  # Error if fetch_dir was empty
        status = "error"
        error_msg.append("Error, no records fetched")
        logger.error(error_msg)
    except:
        status = "complete"
    kwargs = {
        "fetch_process/status": status,
        "fetch_process/error": error_msg,
        "fetch_process/end_time": iso_utc_with_tz(),
        "fetch_process/total_items": stats["total_items"],
        "fetch_process/total_collections": stats["total_collections"]
    }
    try:
        couch.update_ingestion_doc(ingestion_doc, **kwargs)
    except:
        logger.error("Error updating ingestion doc %s in %s" %
                     (ingestion_doc["_id"], __name__))
        return -1

    return 0 if status == "complete" else -1
with open(args.profile_path, "r") as f:
    try:
        profile = json.load(f)
    except Exception, e:
        print "Error reading profile: %s" % e
        return False

if args.pipeline in profile:
    pipeline = ",".join(profile[args.pipeline])
else:
    pipeline = args.pipeline
provider = profile.get(u"name")
contributor = profile.get(u"contributor", {})

# Create ingestion document
couch = Couch()
ingestion_doc_id = create_ingestion_document.main([None, args.profile_path])
ingestion_doc = couch.dashboard_db[ingestion_doc_id]

# Update ingestion document
kwargs = {
    "poll_storage_process/status": "running",
    "poll_storage_process/start_time": datetime.now().isoformat(),
    "poll_storage_process/end_time": None,
}
try:
    couch.update_ingestion_doc(ingestion_doc, **kwargs)
except:
    print "Error updating ingestion document " + ingestion_doc["_id"]
    return
def main(argv):
    global threads_working
    parser = define_arguments()
    args = parser.parse_args(argv[1:])
    couch = Couch()
    ingestion_doc = couch.dashboard_db[args.ingestion_document_id]
    fetch_dir = getprop(ingestion_doc, "fetch_process/data_dir")
    enrich_dir = create_enrich_dir(ingestion_doc["provider"])
    if getprop(ingestion_doc, "fetch_process/status") != "complete":
        print >> sys.stderr, "Cannot enrich, fetch process did not complete"
        return 1

    # Update ingestion document
    status = "running"
    kwargs = {
        "enrich_process/status": status,
        "enrich_process/data_dir": enrich_dir,
        "enrich_process/start_time": iso_utc_with_tz(),
        "enrich_process/end_time": None,
        "enrich_process/error": None,
        "enrich_process/total_items": None,
        "enrich_process/total_collections": None,
        "enrich_process/missing_id": None,
        "enrich_process/missing_source_resource": None
    }
    try:
        couch.update_ingestion_doc(ingestion_doc, **kwargs)
    except:
        print >> sys.stderr, "Error updating ingestion document " + \
                             ingestion_doc["_id"]
        return 1

    with open(ingestion_doc["profile_path"], "r") as f:
        profile = json.loads(f.read())

    # Counts for logger info
    stats = {
        'enriched_items': 0,
        'enriched_colls': 0,
        'missing_id': 0,
        'missing_source_resource': 0
    }
    # Initialize queue and threads
    queue, thread_errors = queue_and_errors(ingestion_doc=ingestion_doc,
                                            profile=profile,
                                            stats=stats,
                                            enrich_dir=enrich_dir)
    # Initialize list of input files
    listing = os.listdir(fetch_dir)
    # Initialize counters and statistics
    dashboard_errors = []
    file_count = 0
    status = None
    total_files = len(listing)
    files = iter(listing)

    try:
        # Keep the queue full of filenames
        while True:
            time.sleep(0.25)
            try:
                if print_errors_thrown(thread_errors):
                    dashboard_errors.extend(thread_errors)
                    raise Exception()
                if not queue.full():
                    basename = files.next()
                    filename = os.path.join(fetch_dir, basename)
                    file_count += 1
                    print "Enqueuing: %s (%s of %s)" % \
                          (filename, file_count, total_files)
                    queue.put(filename)
            except StopIteration:
                break
        # Wait for queue to be empty before returning
        while True:
            if queue.empty() and not threads_working:
                if not print_errors_thrown(thread_errors):
                    break
                else:
                    dashboard_errors.extend(thread_errors)
                    raise Exception()
            time.sleep(0.25)
    except KeyboardInterrupt:
        status = "error"
        msg = "\nCaught keyboard interrupt"
        print >> sys.stderr, msg
        dashboard_errors.append(msg)
    except Exception as e:
        if e.message:
            print >> sys.stderr, e.message
        status = "error"
        dashboard_errors.append(e.message)
    finally:
        print "Enriched items: %s" % stats['enriched_items']
        print "Enriched collections: %s" % stats['enriched_colls']
        print "Missing ID: %s" % stats['missing_id']
        print "Missing sourceResource: %s" % stats['missing_source_resource']
        if not status == "error":
            status = "complete"

    # Prepare fields for ingestion document update
    couch_kwargs = {
        "enrich_process/status": status,
        "enrich_process/error": dashboard_errors,
        "enrich_process/end_time": iso_utc_with_tz(),
        "enrich_process/total_items": stats['enriched_items'],
        "enrich_process/total_collections": stats['enriched_colls'],
        "enrich_process/missing_id": stats['missing_id'],
        "enrich_process/missing_source_resource":
            stats['missing_source_resource']
    }
    try:
        # Update ingestion document
        couch.update_ingestion_doc(ingestion_doc, **couch_kwargs)
    except:
        print >> sys.stderr, "Error updating ingestion document " + \
                             ingestion_doc["_id"]
        return 1

    return 0 if status == "complete" else 1
def main(argv):
    parser = define_arguments()
    args = parser.parse_args(argv[1:])
    batch_size = 500
    couch = Couch()
    ingestion_doc = couch.dashboard_db[args.ingestion_document_id]
    if getprop(ingestion_doc, "enrich_process/status") != "complete":
        print "Cannot save, enrich process did not complete"
        return -1

    # Update ingestion document
    kwargs = {
        "save_process/status": "running",
        "save_process/start_time": iso_utc_with_tz(),
        "save_process/end_time": None,
        "save_process/error": None,
        "save_process/total_saved": None
    }
    try:
        couch.update_ingestion_doc(ingestion_doc, **kwargs)
    except:
        print "Error updating ingestion document " + ingestion_doc["_id"]
        return -1

    # Back up provider data
    if args.backup:
        resp = couch._back_up_data(ingestion_doc)
        if resp == -1:
            # Fatal error, do not continue with save process
            kwargs = {
                "save_process/status": "error",
                "save_process/end_time": iso_utc_with_tz(),
                "save_process/error": "Error backing up DPLA records"
            }
            couch.update_ingestion_doc(ingestion_doc, **kwargs)
            return resp

    error_msg = None
    enrich_dir = getprop(ingestion_doc, "enrich_process/data_dir")
    total_items = 0
    total_collections = 0
    sync_point = 5000
    docs = {}
    for file in os.listdir(enrich_dir):
        filename = os.path.join(enrich_dir, file)
        with open(filename, "r") as f:
            try:
                file_docs = json.loads(f.read())
            except:
                error_msg = "Error loading " + filename
                break

        # Save when docs is about to exceed the batch size
        print >> sys.stderr, "Read file %s" % filename
        if docs and len(docs) + len(file_docs) > batch_size:
            resp, error_msg = couch.process_and_post_to_dpla(docs,
                                                             ingestion_doc)
            if resp == -1:
                docs = None
                break
            items = len([doc for doc in docs.values()
                         if doc.get("ingestType") == "item"])
            total_items += items
            total_collections += len(docs) - items
            print "Saved %s documents" % (total_items + total_collections)
            #if total_items > sync_point:
            #    print "Syncing views"
            #    couch.sync_views(couch.dpla_db.name)
            #    sync_point = total_items + 10000

            # Set docs for the next iteration
            docs = file_docs
        else:
            docs.update(file_docs)

    # Last save
    if docs:
        resp, error_msg = couch.process_and_post_to_dpla(docs, ingestion_doc)
        if resp != -1:
            items = len([doc for doc in docs.values()
                         if doc.get("ingestType") == "item"])
            total_items += items
            total_collections += len(docs) - items
            print "Saved %s documents" % (total_items + total_collections)
            #print "Syncing views"
            #couch.sync_views(couch.dpla_db.name)

    print "Total items: %s" % total_items
    print "Total collections: %s" % total_collections

    if error_msg:
        status = "error"
    else:
        status = "complete"
    kwargs = {
        "save_process/status": status,
        "save_process/error": error_msg,
        "save_process/end_time": iso_utc_with_tz(),
        "save_process/total_items": total_items,
        "save_process/total_collections": total_collections
    }
    try:
        couch.update_ingestion_doc(ingestion_doc, **kwargs)
    except:
        print "Error updating ingestion document " + ingestion_doc["_id"]
        return -1

    # Compress enrich dir, then delete
    make_tarfile(enrich_dir)
    shutil.rmtree(enrich_dir)

    return total_items if status == "complete" else -1
def main(argv):
    parser = define_arguments()
    args = parser.parse_args(argv[1:])
    couch = Couch()
    ingestion_doc = couch.dashboard_db[args.ingestion_document_id]

    # TODO: profile should be passed to create_fetcher, below.
    #       We need profile in this scope, and the file shouldn't have to
    #       be opened again by create_fetcher.
    with open(ingestion_doc["profile_path"], "r") as f:
        profile = json.load(f)

    # Update ingestion document
    fetch_dir = create_fetch_dir(ingestion_doc["provider"])
    kwargs = {
        "fetch_process/status": "running",
        "fetch_process/data_dir": fetch_dir,
        "fetch_process/start_time": iso_utc_with_tz(),
        "fetch_process/end_time": None,
        "fetch_process/error": None,
        "fetch_process/total_items": None,
        "fetch_process/total_collections": None
    }
    try:
        couch.update_ingestion_doc(ingestion_doc, **kwargs)
    except:
        logger.error("Error updating ingestion doc %s in %s" %
                     (ingestion_doc["_id"], __name__))
        return -1

    error_msg = []
    config_file = "akara.ini"
    fetcher = create_fetcher(ingestion_doc["profile_path"],
                             ingestion_doc["uri_base"],
                             config_file)

    print "Fetching records for %s" % ingestion_doc["provider"]
    stats = {
        "total_items": 0,
        "total_collections": 0
    }
    try:
        threads = int(profile.get("fetcher_threads")) or 1
        print "Threads: %d" % threads
    except:
        print >> sys.stderr, "Can not determine fetcher threads, so using 1"
        threads = 1
    sets = None
    sets_supported = (profile.get("sets") != "NotSupported")
    if threads > 1 and sets_supported and hasattr(fetcher, "fetch_sets"):
        sets_err, s = fetcher.fetch_sets()
        if s:
            sets = iter(s)
        else:
            print >> sys.stderr, "Could not get sets: ", sets_err
            return -1
        queue, th_errors, d_errors = queue_and_errors(threads,
                                                      ingestion_doc,
                                                      config_file,
                                                      fetch_dir,
                                                      stats)
        status = None
        try:
            while True:
                time.sleep(0.1)
                try:
                    if print_errors_thrown(th_errors):
                        d_errors.extend(th_errors)
                        raise Exception()
                    if not queue.full():
                        queue.put(sets.next())
                except StopIteration:
                    break
            # Wait for queue to be empty before returning
            while True:
                if queue.empty() and not threads_working:
                    if not print_errors_thrown(th_errors):
                        break
                    else:
                        d_errors.extend(th_errors)
                        raise Exception()
                time.sleep(0.1)
        except KeyboardInterrupt:
            status = "error"
            msg = "\nCaught keyboard interrupt"
            print >> sys.stderr, msg
            d_errors.append(msg)
        except Exception as e:
            if e.message:
                print >> sys.stderr, e.message
            status = "error"
            d_errors.append(e.message)
        finally:
            if not status == "error":
                status = "complete"
    else:  # not threads
        rv = fetch_all_for_set(None, fetcher, fetch_dir)
        stats["total_items"] += rv["total_items"]
        stats["total_collections"] += rv["total_collections"]
        error_msg += rv["errors"]

    print "Total items: %s" % stats["total_items"]
    print "Total collections: %s" % stats["total_collections"]

    # Update ingestion document
    try:
        os.rmdir(fetch_dir)  # Error if fetch_dir was empty
        status = "error"
        error_msg.append("Error, no records fetched")
        logger.error(error_msg)
    except:
        status = "complete"
    kwargs = {
        "fetch_process/status": status,
        "fetch_process/error": error_msg,
        "fetch_process/end_time": iso_utc_with_tz(),
        "fetch_process/total_items": stats["total_items"],
        "fetch_process/total_collections": stats["total_collections"]
    }
    try:
        couch.update_ingestion_doc(ingestion_doc, **kwargs)
    except:
        logger.error("Error updating ingestion doc %s in %s" %
                     (ingestion_doc["_id"], __name__))
        return -1

    return 0 if status == "complete" else -1
def main(argv):
    parser = define_arguments()
    args = parser.parse_args(argv[1:])

    config = ConfigParser.ConfigParser()
    config.readfp(open("akara.ini"))
    batch_size = int(config.get("CouchDb", "BatchSize"))

    couch = Couch()
    ingestion_doc = couch.dashboard_db[args.ingestion_document_id]
    if getprop(ingestion_doc, "enrich_process/status") != "complete":
        print "Cannot save, enrich process did not complete"
        return -1

    # Update ingestion document
    kwargs = {
        "save_process/status": "running",
        "save_process/start_time": datetime.now().isoformat(),
        "save_process/end_time": None,
        "save_process/error": None,
        "save_process/total_saved": None
    }
    try:
        couch.update_ingestion_doc(ingestion_doc, **kwargs)
    except:
        print "Error updating ingestion document " + ingestion_doc["_id"]
        return -1

    # Back up provider data
    if args.backup:
        resp = couch._back_up_data(ingestion_doc)
        if resp == -1:
            # Fatal error, do not continue with save process
            kwargs = {
                "save_process/status": "error",
                "save_process/end_time": datetime.now().isoformat(),
                "save_process/error": "Error backing up DPLA records"
            }
            couch.update_ingestion_doc(ingestion_doc, **kwargs)
            return resp

    error_msg = None
    enrich_dir = getprop(ingestion_doc, "enrich_process/data_dir")
    total_items = 0
    total_collections = 0
    docs = {}
    for file in os.listdir(enrich_dir):
        filename = os.path.join(enrich_dir, file)
        with open(filename, "r") as f:
            try:
                file_docs = json.loads(f.read())
            except:
                error_msg = "Error loading " + filename
                break

        # Save when docs is about to exceed the batch size
        print >> sys.stderr, "Read file %s" % filename
        if docs and len(docs) + len(file_docs) > batch_size:
            resp, error_msg = couch.process_and_post_to_dpla(docs,
                                                             ingestion_doc)
            if resp == -1:
                docs = None
                break
            items = len([doc for doc in docs.values()
                         if doc.get("ingestType") == "item"])
            total_items += items
            total_collections += len(docs) - items
            print "Saved %s documents" % (total_items + total_collections)

            # Set docs for the next iteration
            docs = file_docs
        else:
            docs.update(file_docs)

    # Last save
    if docs:
        resp, error_msg = couch.process_and_post_to_dpla(docs, ingestion_doc)
        if resp != -1:
            items = len([doc for doc in docs.values()
                         if doc.get("ingestType") == "item"])
            total_items += items
            total_collections += len(docs) - items
            print "Saved %s documents" % (total_items + total_collections)

    print "Total items: %s" % total_items
    print "Total collections: %s" % total_collections

    if error_msg:
        status = "error"
    else:
        status = "complete"
    kwargs = {
        "save_process/status": status,
        "save_process/error": error_msg,
        "save_process/end_time": datetime.now().isoformat(),
        "save_process/total_items": total_items,
        "save_process/total_collections": total_collections
    }
    try:
        couch.update_ingestion_doc(ingestion_doc, **kwargs)
    except:
        print "Error updating ingestion document " + ingestion_doc["_id"]
        return -1

    # Compress enrich dir, then delete
    make_tarfile(enrich_dir)
    shutil.rmtree(enrich_dir)

    return 0 if status == "complete" else -1
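# getprop() is used throughout these scripts to read nested fields with a
# slash-delimited path (e.g. "fetch_process/status"). A minimal sketch of
# such a helper, assuming that behavior (the repository's own version may
# differ, for example in how missing keys are reported):
def getprop(obj, path, keyErrorAsNone=False):
    """Return obj["a"]["b"] for path "a/b", or None if missing and allowed."""
    current = obj
    for part in path.split("/"):
        try:
            current = current[part]
        except (KeyError, TypeError):
            if keyErrorAsNone:
                return None
            raise KeyError("Path not found: %s" % path)
    return current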