def test_absolute_url_fetcher_uva1():
    """The Virginia profile builds a UVAFetcher whose first batch is clean."""
    fetcher = create_fetcher("profiles/virginia.pjs", uri_base)
    assert fetcher.__class__.__name__ == "UVAFetcher"
    for resp in fetcher.fetch_all_data():
        # Only the first response is checked to keep the test fast.
        assert resp.get("error") is None
        assert getprop(resp, "data/records") is not None
        break
def test_absolute_url_fetcher_uva1():
    """The Virginia profile builds a UVAFetcher; first batch has records, no errors."""
    path = "profiles/virginia.pjs"
    uva = create_fetcher(path, uri_base, config_file)
    assert uva.__class__.__name__ == "UVAFetcher"
    for resp in uva.fetch_all_data():
        # Only the first response is checked to keep the test fast.
        assert not resp["errors"]
        assert resp["records"]
        break
def test_file_fetcher_smithsonian():
    """EDANFetcher reads local Smithsonian test files without errors."""
    path = "profiles/smithsonian.pjs"
    fetcher = create_fetcher(path, uri_base)
    assert fetcher.__class__.__name__ == "EDANFetcher"
    # Point the fetcher at test data on disk instead of the live endpoint.
    fetcher.endpoint_url = "file:/%s/test/test_data/smithsonian/" % os.getcwd()
    for resp in fetcher.fetch_all_data():
        assert resp.get("error") == []
        assert getprop(resp, "data/records") is not None
        break
def test_absolute_url_fetcher_ia():
    """The IA profile builds an IAFetcher; a small first page is error-free."""
    fetcher = create_fetcher("profiles/ia.pjs", uri_base)
    assert fetcher.__class__.__name__ == "IAFetcher"
    # Request only ten rows per page so the test stays quick.
    fetcher.endpoint_url_params["rows"] = 10
    for resp in fetcher.fetch_all_data():
        assert resp.get("error") == []
        assert getprop(resp, "data/records") is not None
        break
def test_file_fetcher_smithsonian():
    """EDANFetcher reads local Smithsonian test files; records, no errors."""
    smithsonian = create_fetcher("profiles/smithsonian.pjs", uri_base,
                                 config_file)
    assert smithsonian.__class__.__name__ == "EDANFetcher"
    # Swap the live endpoint for files shipped with the test suite.
    smithsonian.endpoint_url = \
        "file:/%s/test/test_data/smithsonian/" % os.getcwd()
    for resp in smithsonian.fetch_all_data():
        assert resp["errors"] == []
        assert resp["records"]
        break
def test_absolute_url_fetcher_ia():
    """IAFetcher fetches a small IA batch with records and no errors."""
    ia = create_fetcher("profiles/ia.pjs", uri_base, config_file)
    assert ia.__class__.__name__ == "IAFetcher"
    ia.endpoint_url_params["rows"] = 10  # small page keeps the test fast
    for resp in ia.fetch_all_data():
        assert not resp["errors"]
        assert resp["records"]
        break
def test_oai_fetcher_invalid_subresource():
    """A bogus subresource yields only error responses and no records."""
    fetcher = create_fetcher("profiles/clemson.pjs", uri_base)
    assert fetcher.__class__.__name__ == "OAIVerbsFetcher"
    fetcher.subresources = ["banana"]
    for resp in fetcher.fetch_all_data():
        assert resp.get("error") is not None
        assert getprop(resp, "data/records") is None
    # The fetcher should end up with no usable subresources at all.
    assert fetcher.subresources.keys() == []
def test_oai_fetcher_invalid_set():
    """A bogus set name yields error responses and leaves no collections."""
    fetcher = create_fetcher("profiles/clemson.pjs", uri_base, config_file)
    assert fetcher.__class__.__name__ == "OAIVerbsFetcher"
    fetcher.sets = ["banana"]
    for resp in fetcher.fetch_all_data():
        assert resp["errors"]
        assert not resp["records"]
    # No collection should have been registered for the invalid set.
    assert fetcher.collections.keys() == []
def test_oai_fetcher_all_subresources():
    """Fetching with no filter covers every known SCDL subresource."""
    fetcher = create_fetcher("profiles/clemson.pjs", uri_base)
    assert fetcher.__class__.__name__ == "OAIVerbsFetcher"
    for resp in fetcher.fetch_all_data():
        assert resp.get("error") is None
        assert getprop(resp, "data/records") is not None
    missing = [sr for sr in scdl_all_subresources
               if sr not in fetcher.subresources]
    assert missing == []
def test_oai_fetcher_all_sets():
    """Fetching with no filter covers every known SCDL set."""
    fetcher = create_fetcher("profiles/clemson.pjs", uri_base, config_file)
    assert fetcher.__class__.__name__ == "OAIVerbsFetcher"
    for resp in fetcher.fetch_all_data():
        assert not resp["errors"]
        assert resp["records"]
    missing = [name for name in scdl_all_sets
               if name not in fetcher.collections]
    assert missing == []
def test_oai_fetcher_with_blacklist():
    """Blacklisted subresources are skipped; every other one is fetched."""
    fetcher = create_fetcher("profiles/clemson.pjs", uri_base)
    assert fetcher.__class__.__name__ == "OAIVerbsFetcher"
    fetcher.blacklist = scdl_blacklist
    # Drain the generator; only the post-run state matters here.
    for _ in fetcher.fetch_all_data():
        pass
    expected = list(set(scdl_all_subresources) - set(scdl_blacklist))
    missing = [sr for sr in expected if sr not in fetcher.subresources]
    assert missing == []
def test_oai_fetcher_with_blacklist():
    """Blacklisted sets are skipped; every other set becomes a collection."""
    fetcher = create_fetcher("profiles/clemson.pjs", uri_base, config_file)
    assert fetcher.__class__.__name__ == "OAIVerbsFetcher"
    fetcher.blacklist = scdl_blacklist
    # Drain the generator; only the post-run state matters here.
    for _ in fetcher.fetch_all_data():
        pass
    expected = list(set(scdl_all_sets) - set(scdl_blacklist))
    missing = [name for name in expected if name not in fetcher.collections]
    assert missing == []
def test_all_oai_verb_fetchers():
    """Every oai_verbs profile builds an OAIVerbsFetcher that returns data."""
    for name in os.listdir("profiles"):
        if not name.endswith(".pjs"):
            continue
        path = "profiles/" + name
        with open(path, "r") as pjs:
            prof = json.loads(pjs.read())
        if prof.get("type") != "oai_verbs":
            continue
        fetcher = create_fetcher(path, uri_base)
        assert fetcher.__class__.__name__ == "OAIVerbsFetcher"
        # Digital Commonwealth sets 217, 218 are giving errors
        if prof.get("name") == "digital-commonwealth":
            fetcher.blacklist.extend(["217", "218"])
        for resp in fetcher.fetch_all_data():
            # Only the first response per provider, to keep runtime sane.
            assert resp.get("error") is None
            assert getprop(resp, "data/records") is not None
            break
def test_all_oai_verb_fetchers():
    """Every oai_verbs profile builds an OAIVerbsFetcher that returns records."""
    for name in os.listdir("profiles"):
        if not name.endswith(".pjs"):
            continue
        # David Rumsey ListSets is returning 500 on hz4 and Travis
        if name == "david_rumsey.pjs":
            continue
        path = "profiles/" + name
        with open(path, "r") as pjs:
            prof = json.loads(pjs.read())
        if prof.get("type") != "oai_verbs":
            continue
        fetcher = create_fetcher(path, uri_base, config_file)
        assert fetcher.__class__.__name__ == "OAIVerbsFetcher"
        # Digital Commonwealth sets 217, 218 are giving errors
        if prof.get("name") == "digital-commonwealth":
            fetcher.blacklist.extend(["217", "218"])
        for resp in fetcher.fetch_all_data():
            # Only the first response per provider, to keep runtime sane.
            assert not resp["errors"]
            assert resp["records"]
            break
def main(argv):
    """Fetch all records for one ingestion document and write them to disk.

    Marks the ingestion doc "running", streams every response from the
    provider's fetcher into uuid-named JSON files under a fresh fetch
    directory, then marks the doc "complete" or "error".

    Returns 0 on success, -1 on any failure (fetch errors, empty fetch,
    or a failed CouchDB update).
    """
    parser = define_arguments()
    args = parser.parse_args(argv[1:])
    couch = Couch()
    ingestion_doc = couch.dashboard_db[args.ingestion_document_id]

    # Update ingestion document: reset all fetch_process fields for this run.
    fetch_dir = create_fetch_dir(ingestion_doc["provider"])
    kwargs = {
        "fetch_process/status": "running",
        "fetch_process/data_dir": fetch_dir,
        "fetch_process/start_time": datetime.now().isoformat(),
        "fetch_process/end_time": None,
        "fetch_process/error": None,
        "fetch_process/total_items": None,
        "fetch_process/total_collections": None,
    }
    try:
        couch.update_ingestion_doc(ingestion_doc, **kwargs)
    except:
        # NOTE(review): bare except — intentionally catches any update
        # failure so the script can bail out with -1; consider narrowing.
        logger.error("Error updating ingestion doc %s in %s" %
                     (ingestion_doc["_id"], __name__))
        return -1

    error_msg = []
    config_file = "akara.ini"
    fetcher = create_fetcher(ingestion_doc["profile_path"],
                             ingestion_doc["uri_base"],
                             config_file)

    print "Fetching records for " + fetcher.provider
    total_items = 0
    total_collections = 0
    for response in fetcher.fetch_all_data():
        # Errors and records are independent: a response may carry both.
        if response["errors"]:
            error_msg.extend(iterify(response["errors"]))
            print response["errors"]

        if response["records"]:
            # Write records to file; each batch gets a unique uuid filename.
            filename = os.path.join(fetch_dir, str(uuid.uuid4()))
            with open(filename, "w") as f:
                f.write(json.dumps(response["records"]))

            # Records with ingestType "collection" are counted separately
            # from ordinary items.
            items = len([record for record in response["records"]
                         if not record.get("ingestType") == "collection"])
            total_items += items
            total_collections += len(response["records"]) - items

    print "Total items: %s" % total_items
    print "Total collections: %s" % total_collections

    # Update ingestion document with the outcome. os.rmdir succeeds only
    # on an empty directory, so it doubles as a "nothing was fetched" test.
    try:
        os.rmdir(fetch_dir)
        # Error if fetch_dir was empty
        status = "error"
        error_msg.append("Error, no records fetched")
        logger.error(error_msg)
    except:
        # rmdir failed => fetch_dir has files => records were fetched.
        status = "complete"
    kwargs = {
        "fetch_process/status": status,
        "fetch_process/error": error_msg,
        "fetch_process/end_time": datetime.now().isoformat(),
        "fetch_process/total_items": total_items,
        "fetch_process/total_collections": total_collections,
    }
    try:
        couch.update_ingestion_doc(ingestion_doc, **kwargs)
    except:
        logger.error("Error updating ingestion doc %s in %s" %
                     (ingestion_doc["_id"], __name__))
        return -1

    return 0 if status == "complete" else -1
def main(argv):
    """Fetch all records for one ingestion document and write them to disk.

    Marks the ingestion doc "running", writes each successful fetcher
    response's full payload to a uuid-named JSON file under a fresh fetch
    directory, then marks the doc "complete" or "error".

    Returns 0 on success, -1 on any failure (fetch errors, empty fetch,
    or a failed CouchDB update).
    """
    parser = define_arguments()
    args = parser.parse_args(argv[1:])
    couch = Couch()
    ingestion_doc = couch.dashboard_db[args.ingestion_document_id]

    # Update ingestion document: mark this fetch run as started.
    fetch_dir = create_fetch_dir(ingestion_doc["provider"])
    kwargs = {
        "fetch_process/status": "running",
        "fetch_process/data_dir": fetch_dir,
        "fetch_process/start_time": datetime.now().isoformat()
    }
    try:
        couch.update_ingestion_doc(ingestion_doc, **kwargs)
    except:
        # NOTE(review): bare except — intentionally catches any update
        # failure so the script can bail out with -1; consider narrowing.
        logger.error("Error updating ingestion doc %s in %s" %
                     (ingestion_doc["_id"], __name__))
        return -1

    error_msg = []
    fetcher = create_fetcher(ingestion_doc["profile_path"],
                             ingestion_doc["uri_base"])

    print "Fetching records for " + fetcher.provider
    total_fetched_records = 0
    for response in fetcher.fetch_all_data():
        # A response either reports an error or carries data, never both
        # here — the error branch skips writing entirely.
        if response["error"]:
            error_msg.extend(iterify(response["error"]))
            print response["error"]
        else:
            # Write records to file; each batch gets a unique uuid filename.
            filename = os.path.join(fetch_dir, str(uuid.uuid4()))
            with open(filename, "w") as f:
                f.write(json.dumps(response["data"]))
            print "Records written to " + filename
            total_fetched_records += len(getprop(response, "data/records"))
    logger.info("Total records fetched: %s" % total_fetched_records)

    # Update ingestion document with the outcome. os.rmdir succeeds only
    # on an empty directory, so it doubles as a "nothing was fetched" test.
    try:
        os.rmdir(fetch_dir)
        # Error if fetch_dir was empty
        status = "error"
        error_msg.append("Error, no records fetched")
        logger.error(error_msg)
    except:
        # rmdir failed => fetch_dir has files => records were fetched.
        status = "complete"
    kwargs = {
        "fetch_process/status": status,
        "fetch_process/error": error_msg,
        "fetch_process/end_time": datetime.now().isoformat()
    }
    try:
        couch.update_ingestion_doc(ingestion_doc, **kwargs)
    except:
        logger.error("Error updating ingestion doc %s in %s" %
                     (ingestion_doc["_id"], __name__))
        return -1

    return 0 if status == "complete" else -1