def test_absolute_url_fetcher_uva1():
    """The Virginia profile builds a UVAFetcher whose first batch is clean."""
    fetcher = create_fetcher("profiles/virginia.pjs", uri_base)
    assert fetcher.__class__.__name__ == "UVAFetcher"
    for resp in fetcher.fetch_all_data():
        # Only the first response is checked to keep the test fast.
        assert resp.get("error") is None
        assert getprop(resp, "data/records") is not None
        break
def test_absolute_url_fetcher_uva1():
    """The Virginia profile builds a UVAFetcher; first batch has records, no errors."""
    path = "profiles/virginia.pjs"
    uva = create_fetcher(path, uri_base, config_file)
    assert uva.__class__.__name__ == "UVAFetcher"
    for resp in uva.fetch_all_data():
        # Only the first response is checked to keep the test fast.
        assert not resp["errors"]
        assert resp["records"]
        break
def test_file_fetcher_smithsonian():
    """EDANFetcher reads local Smithsonian test files without errors."""
    path = "profiles/smithsonian.pjs"
    fetcher = create_fetcher(path, uri_base)
    assert fetcher.__class__.__name__ == "EDANFetcher"
    # Point the fetcher at test data on disk instead of the live endpoint.
    fetcher.endpoint_url = "file:/%s/test/test_data/smithsonian/" % os.getcwd()
    for resp in fetcher.fetch_all_data():
        assert resp.get("error") == []
        assert getprop(resp, "data/records") is not None
        break
def test_absolute_url_fetcher_ia():
    """The IA profile builds an IAFetcher; a small first page is error-free."""
    fetcher = create_fetcher("profiles/ia.pjs", uri_base)
    assert fetcher.__class__.__name__ == "IAFetcher"
    # Request only ten rows per page so the test stays quick.
    fetcher.endpoint_url_params["rows"] = 10
    for resp in fetcher.fetch_all_data():
        assert resp.get("error") == []
        assert getprop(resp, "data/records") is not None
        break
def test_file_fetcher_smithsonian():
    """EDANFetcher reads local Smithsonian test files; records, no errors."""
    smithsonian = create_fetcher("profiles/smithsonian.pjs", uri_base,
                                 config_file)
    assert smithsonian.__class__.__name__ == "EDANFetcher"
    # Swap the live endpoint for files shipped with the test suite.
    smithsonian.endpoint_url = \
        "file:/%s/test/test_data/smithsonian/" % os.getcwd()
    for resp in smithsonian.fetch_all_data():
        assert resp["errors"] == []
        assert resp["records"]
        break
def test_absolute_url_fetcher_ia():
    """IAFetcher fetches a small IA batch with records and no errors."""
    ia = create_fetcher("profiles/ia.pjs", uri_base, config_file)
    assert ia.__class__.__name__ == "IAFetcher"
    ia.endpoint_url_params["rows"] = 10  # small page keeps the test fast
    for resp in ia.fetch_all_data():
        assert not resp["errors"]
        assert resp["records"]
        break
def test_oai_fetcher_invalid_subresource():
    """A bogus subresource yields only error responses and no records."""
    fetcher = create_fetcher("profiles/clemson.pjs", uri_base)
    assert fetcher.__class__.__name__ == "OAIVerbsFetcher"
    fetcher.subresources = ["banana"]
    for resp in fetcher.fetch_all_data():
        assert resp.get("error") is not None
        assert getprop(resp, "data/records") is None
    # The fetcher should end up with no usable subresources at all.
    assert fetcher.subresources.keys() == []
def test_oai_fetcher_invalid_set():
    """A bogus set name yields error responses and leaves no collections."""
    fetcher = create_fetcher("profiles/clemson.pjs", uri_base, config_file)
    assert fetcher.__class__.__name__ == "OAIVerbsFetcher"
    fetcher.sets = ["banana"]
    for resp in fetcher.fetch_all_data():
        assert resp["errors"]
        assert not resp["records"]
    # No collection should have been registered for the invalid set.
    assert fetcher.collections.keys() == []
def test_oai_fetcher_all_subresources():
    """Fetching with no filter covers every known SCDL subresource."""
    fetcher = create_fetcher("profiles/clemson.pjs", uri_base)
    assert fetcher.__class__.__name__ == "OAIVerbsFetcher"
    for resp in fetcher.fetch_all_data():
        assert resp.get("error") is None
        assert getprop(resp, "data/records") is not None
    missing = [sr for sr in scdl_all_subresources
               if sr not in fetcher.subresources]
    assert missing == []
def test_oai_fetcher_all_sets():
    """Fetching with no filter covers every known SCDL set."""
    fetcher = create_fetcher("profiles/clemson.pjs", uri_base, config_file)
    assert fetcher.__class__.__name__ == "OAIVerbsFetcher"
    for resp in fetcher.fetch_all_data():
        assert not resp["errors"]
        assert resp["records"]
    missing = [name for name in scdl_all_sets
               if name not in fetcher.collections]
    assert missing == []
def test_oai_fetcher_with_blacklist():
    """Blacklisted subresources are skipped; every other one is fetched."""
    fetcher = create_fetcher("profiles/clemson.pjs", uri_base)
    assert fetcher.__class__.__name__ == "OAIVerbsFetcher"
    fetcher.blacklist = scdl_blacklist
    # Drain the generator; only the post-run state matters here.
    for _ in fetcher.fetch_all_data():
        pass
    expected = list(set(scdl_all_subresources) - set(scdl_blacklist))
    missing = [sr for sr in expected if sr not in fetcher.subresources]
    assert missing == []
def test_oai_fetcher_with_blacklist():
    """Blacklisted sets are skipped; every other set becomes a collection."""
    fetcher = create_fetcher("profiles/clemson.pjs", uri_base, config_file)
    assert fetcher.__class__.__name__ == "OAIVerbsFetcher"
    fetcher.blacklist = scdl_blacklist
    # Drain the generator; only the post-run state matters here.
    for _ in fetcher.fetch_all_data():
        pass
    expected = list(set(scdl_all_sets) - set(scdl_blacklist))
    missing = [name for name in expected if name not in fetcher.collections]
    assert missing == []
def test_all_oai_verb_fetchers():
    """Every oai_verbs profile builds an OAIVerbsFetcher that returns data."""
    for name in os.listdir("profiles"):
        if not name.endswith(".pjs"):
            continue
        path = "profiles/" + name
        with open(path, "r") as pjs:
            prof = json.loads(pjs.read())
        if prof.get("type") != "oai_verbs":
            continue
        fetcher = create_fetcher(path, uri_base)
        assert fetcher.__class__.__name__ == "OAIVerbsFetcher"
        # Digital Commonwealth sets 217, 218 are giving errors
        if prof.get("name") == "digital-commonwealth":
            fetcher.blacklist.extend(["217", "218"])
        for resp in fetcher.fetch_all_data():
            # Only the first response per provider, to keep runtime sane.
            assert resp.get("error") is None
            assert getprop(resp, "data/records") is not None
            break
def test_all_oai_verb_fetchers():
    """Every oai_verbs profile builds an OAIVerbsFetcher that returns records."""
    for name in os.listdir("profiles"):
        if not name.endswith(".pjs"):
            continue
        # David Rumsey ListSets is returning 500 on hz4 and Travis
        if name == "david_rumsey.pjs":
            continue
        path = "profiles/" + name
        with open(path, "r") as pjs:
            prof = json.loads(pjs.read())
        if prof.get("type") != "oai_verbs":
            continue
        fetcher = create_fetcher(path, uri_base, config_file)
        assert fetcher.__class__.__name__ == "OAIVerbsFetcher"
        # Digital Commonwealth sets 217, 218 are giving errors
        if prof.get("name") == "digital-commonwealth":
            fetcher.blacklist.extend(["217", "218"])
        for resp in fetcher.fetch_all_data():
            # Only the first response per provider, to keep runtime sane.
            assert not resp["errors"]
            assert resp["records"]
            break
def main(argv):
    """Fetch all records for one ingestion document and write them to disk.

    Marks the ingestion doc "running", streams every response from the
    provider's fetcher into uuid-named JSON files under a fresh fetch
    directory, then marks the doc "complete" or "error".

    Returns 0 on success, -1 on any failure (fetch errors, empty fetch,
    or a failed CouchDB update).
    """
    parser = define_arguments()
    args = parser.parse_args(argv[1:])
    couch = Couch()
    ingestion_doc = couch.dashboard_db[args.ingestion_document_id]

    # Update ingestion document: reset all fetch_process fields for this run.
    fetch_dir = create_fetch_dir(ingestion_doc["provider"])
    kwargs = {
        "fetch_process/status": "running",
        "fetch_process/data_dir": fetch_dir,
        "fetch_process/start_time": datetime.now().isoformat(),
        "fetch_process/end_time": None,
        "fetch_process/error": None,
        "fetch_process/total_items": None,
        "fetch_process/total_collections": None,
    }
    try:
        couch.update_ingestion_doc(ingestion_doc, **kwargs)
    except:
        # NOTE(review): bare except — intentionally catches any update
        # failure so the script can bail out with -1; consider narrowing.
        logger.error("Error updating ingestion doc %s in %s" %
                     (ingestion_doc["_id"], __name__))
        return -1

    error_msg = []
    config_file = "akara.ini"
    fetcher = create_fetcher(ingestion_doc["profile_path"],
                             ingestion_doc["uri_base"],
                             config_file)

    print "Fetching records for " + fetcher.provider
    total_items = 0
    total_collections = 0
    for response in fetcher.fetch_all_data():
        # Errors and records are independent: a response may carry both.
        if response["errors"]:
            error_msg.extend(iterify(response["errors"]))
            print response["errors"]

        if response["records"]:
            # Write records to file; each batch gets a unique uuid filename.
            filename = os.path.join(fetch_dir, str(uuid.uuid4()))
            with open(filename, "w") as f:
                f.write(json.dumps(response["records"]))

            # Records with ingestType "collection" are counted separately
            # from ordinary items.
            items = len([record for record in response["records"]
                         if not record.get("ingestType") == "collection"])
            total_items += items
            total_collections += len(response["records"]) - items

    print "Total items: %s" % total_items
    print "Total collections: %s" % total_collections

    # Update ingestion document with the outcome. os.rmdir succeeds only
    # on an empty directory, so it doubles as a "nothing was fetched" test.
    try:
        os.rmdir(fetch_dir)
        # Error if fetch_dir was empty
        status = "error"
        error_msg.append("Error, no records fetched")
        logger.error(error_msg)
    except:
        # rmdir failed => fetch_dir has files => records were fetched.
        status = "complete"
    kwargs = {
        "fetch_process/status": status,
        "fetch_process/error": error_msg,
        "fetch_process/end_time": datetime.now().isoformat(),
        "fetch_process/total_items": total_items,
        "fetch_process/total_collections": total_collections,
    }
    try:
        couch.update_ingestion_doc(ingestion_doc, **kwargs)
    except:
        logger.error("Error updating ingestion doc %s in %s" %
                     (ingestion_doc["_id"], __name__))
        return -1

    return 0 if status == "complete" else -1
def main(argv):
    """Fetch all records for one ingestion document and write them to disk.

    Marks the ingestion doc "running", writes each successful fetcher
    response's full payload to a uuid-named JSON file under a fresh fetch
    directory, then marks the doc "complete" or "error".

    Returns 0 on success, -1 on any failure (fetch errors, empty fetch,
    or a failed CouchDB update).
    """
    parser = define_arguments()
    args = parser.parse_args(argv[1:])
    couch = Couch()
    ingestion_doc = couch.dashboard_db[args.ingestion_document_id]

    # Update ingestion document: mark this fetch run as started.
    fetch_dir = create_fetch_dir(ingestion_doc["provider"])
    kwargs = {
        "fetch_process/status": "running",
        "fetch_process/data_dir": fetch_dir,
        "fetch_process/start_time": datetime.now().isoformat()
    }
    try:
        couch.update_ingestion_doc(ingestion_doc, **kwargs)
    except:
        # NOTE(review): bare except — intentionally catches any update
        # failure so the script can bail out with -1; consider narrowing.
        logger.error("Error updating ingestion doc %s in %s" %
                     (ingestion_doc["_id"], __name__))
        return -1

    error_msg = []
    fetcher = create_fetcher(ingestion_doc["profile_path"],
                             ingestion_doc["uri_base"])

    print "Fetching records for " + fetcher.provider
    total_fetched_records = 0
    for response in fetcher.fetch_all_data():
        # A response either reports an error or carries data, never both
        # here — the error branch skips writing entirely.
        if response["error"]:
            error_msg.extend(iterify(response["error"]))
            print response["error"]
        else:
            # Write records to file; each batch gets a unique uuid filename.
            filename = os.path.join(fetch_dir, str(uuid.uuid4()))
            with open(filename, "w") as f:
                f.write(json.dumps(response["data"]))
            print "Records written to " + filename
            total_fetched_records += len(getprop(response, "data/records"))
    logger.info("Total records fetched: %s" % total_fetched_records)

    # Update ingestion document with the outcome. os.rmdir succeeds only
    # on an empty directory, so it doubles as a "nothing was fetched" test.
    try:
        os.rmdir(fetch_dir)
        # Error if fetch_dir was empty
        status = "error"
        error_msg.append("Error, no records fetched")
        logger.error(error_msg)
    except:
        # rmdir failed => fetch_dir has files => records were fetched.
        status = "complete"
    kwargs = {
        "fetch_process/status": status,
        "fetch_process/error": error_msg,
        "fetch_process/end_time": datetime.now().isoformat()
    }
    try:
        couch.update_ingestion_doc(ingestion_doc, **kwargs)
    except:
        logger.error("Error updating ingestion doc %s in %s" %
                     (ingestion_doc["_id"], __name__))
        return -1

    return 0 if status == "complete" else -1