Пример #1
0
def test_list_collections():
    """Check that the two collection listings are mutually consistent.

    The listing requested with ``True`` must contain every collection of the
    listing requested with ``False``, and a collection's ``archived`` value
    must be true exactly when it is missing from the ``False`` listing.
    """
    client = common.login_with_test_user(common.client())

    active = client.list_collections(False)
    assert type(active) is list
    assert len(active) > 0

    everything = client.list_collections(True)
    assert type(everything) is list
    assert len(everything) >= len(active)

    # every collection, from either listing, carries the expected properties
    required_props = ("_id", "annotators", "viewers", "configuration",
                      "metadata")
    for entry in (everything + active):
        for prop in required_props:
            assert prop in entry

    # the full listing is a superset of the unarchived listing
    for entry in active:
        assert any(entry["_id"] == other["_id"] for other in everything)

    # whatever appears only in the full listing must be flagged as archived
    for entry in everything:
        listed_as_active = any(
            entry["_id"] == other["_id"] for other in active)
        assert entry["archived"] == (not listed_as_active)
Пример #2
0
def test_add_and_delete_documents():
    """Add two documents to "Trial Collection" and delete both in one call.

    Verifies that the delete response is a dict reporting success, that both
    new document IDs are listed as deleted, and that no annotations are
    reported as deleted.
    """
    client = common.login_with_test_user(common.client())

    collection_id = common.get_collection_id(client, "Trial Collection")
    assert collection_id is not None

    added_doc_id = client.add_document(
        collection_id=collection_id,
        text="This is a test document to be deleted.")
    assert added_doc_id is not None

    added_doc_id_2 = client.add_document(
        collection_id=collection_id,
        text="This is a test document to be deleted.")
    assert added_doc_id_2 is not None

    delete_resp = client.delete_documents([added_doc_id, added_doc_id_2])
    assert delete_resp is not None
    assert isinstance(delete_resp, dict)

    assert delete_resp["success"]

    changed_objs = delete_resp["changed_objs"]
    # the freshly added documents were never annotated
    assert len(changed_objs["annotations"]["deleted"]) == 0
    assert added_doc_id in changed_objs["documents"]["deleted"]
    assert added_doc_id_2 in changed_objs["documents"]["deleted"]
    
Пример #3
0
def test_collection_user_permissions():
    """Check the test user's permissions on two known collections.

    Each collection is looked up by title, and the dict form of its
    permissions object is compared against the expected flags.
    """
    client = common.login_with_test_user(common.client())

    # (collection title, expected permissions), checked in this order
    expected_permissions = (
        ("NER Test Collection", {
            "view": True,
            "annotate": True,
            "add_documents": True,
            "add_images": True,
            "modify_users": False,
            "modify_labels": False,
            "modify_document_metadata": True,
            "download_data": False,
            "archive": False,
        }),
        ("Trial Collection", {
            "view": True,
            "annotate": True,
            "add_documents": True,
            "add_images": True,
            "modify_users": True,
            "modify_labels": True,
            "modify_document_metadata": True,
            "download_data": True,
            "archive": True,
        }),
    )

    for title, expected in expected_permissions:
        collection_id = _get_collection_id(title, client)
        assert collection_id is not None
        permissions = client.get_collection_permissions(collection_id)
        assert permissions.to_dict() == expected
Пример #4
0
def test_get_classifier_status():
    """Validate the classifier status of every listed collection.

    For each collection, its classifier is fetched and that classifier's
    status is handed to ``_check_pipeline_status`` together with its ID.
    """
    client = common.login_with_test_user(common.client())

    for entry in client.list_collections():
        classifier_id = client.get_collection_classifier(entry["_id"])["_id"]
        _check_pipeline_status(client.get_classifier_status(classifier_id),
                               classifier_id)
Пример #5
0
def test_delete_document_not_allowed():
    """Deleting the document ID "nonexistent" must raise an HTTP exception.

    "Trial Collection" is looked up first only to confirm it exists; the
    delete call itself is expected to raise ``PineClientHttpException``.
    """
    client = common.login_with_test_user(common.client())

    collection_id = common.get_collection_id(client, "Trial Collection")
    assert collection_id is not None

    with pytest.raises(pine.client.exceptions.PineClientHttpException):
        client.delete_document("nonexistent")
Пример #6
0
def test_get_collection_classifier():
    """Check that each listed collection resolves to its own classifier.

    The classifier must be a plain dict whose ``collection_id`` matches the
    collection and whose ``pipeline_id`` is set.
    """
    client = common.login_with_test_user(common.client())

    for entry in client.list_collections():
        entry_id = entry["_id"]
        found = client.get_collection_classifier(entry_id)
        assert type(found) is dict
        assert found["collection_id"] == entry_id
        assert found["pipeline_id"] is not None
Пример #7
0
def test_get_pipelines():
    """Compare the pipelines returned by the client with the test data.

    The sets of ``_id`` values must match, and every key/value pair of each
    test-data pipeline must be present in the corresponding returned one.
    """
    client = common.login_with_test_user(common.client())

    reference = common.test_pipeline_data()
    reference_ids = {entry["_id"] for entry in reference}

    fetched = client.get_pipelines()
    fetched_ids = {entry["_id"] for entry in fetched}
    assert reference_ids == fetched_ids

    for actual in fetched:
        # a match always exists: the ID sets were just asserted to be equal
        wanted = next(
            entry for entry in reference if entry["_id"] == actual["_id"])
        for key, value in wanted.items():
            assert value == actual[key]
Пример #8
0
def test_collection_creation_and_get_and_archive():
    """Create a collection, read it back, and toggle its archived state.

    The collection is built with the current user as sole viewer and
    annotator and the "spacy" pipeline as classifier. It is always archived
    again at the end, whether or not the assertions pass.
    """
    client = common.login_with_test_user(common.client())

    spacy_pipelines = [
        p for p in client.get_pipelines() if p["name"].lower() == "spacy"
    ]
    pipeline_id = spacy_pipelines[0]["_id"]
    assert pipeline_id is not None
    my_id = client.get_my_user_id()
    assert my_id is not None

    builder = (
        client.collection_builder()
        .viewer(my_id)
        .annotator(my_id)
        .label("label")
        .title("Collection to Test Creation")
        .description("This is a collection for pytest to test creation.")
        .classifier(pipeline_id, train_every=100)
    )
    collection_id = client.create_collection(builder)
    assert type(collection_id) is str

    try:
        created = client.get_collection(collection_id)
        assert type(created) is dict

        assert created["_id"] == collection_id
        assert created["creator_id"] == my_id
        assert created["annotators"] == [my_id]
        assert created["viewers"] == [my_id]
        assert created["archived"] == False
        assert created["labels"] == ["label"]
        assert created["metadata"] == {
            "title": "Collection to Test Creation",
            "description": "This is a collection for pytest to test creation."
        }

        # archive, then unarchive; each call returns the updated collection
        for archived_flag in (True, False):
            updated = client.archive_collection(collection_id, archived_flag)
            assert type(updated) is dict
            assert updated["_id"] == collection_id
            assert updated["archived"] == archived_flag
    finally:
        # leave the collection archived regardless of the outcome
        client.archive_collection(collection_id, True)
Пример #9
0
def test_sync_train():
    """Run a synchronous training job and check the shape of its response.

    Uses the classifier of "Small Collection OpenNLP" and verifies that the
    job response is a dict containing the expected keys with the expected
    value types.
    """
    client = common.login_with_test_user(common.client())

    target = common.get_collection(client, "Small Collection OpenNLP")
    assert target is not None
    target_id = target["_id"]
    assert target_id is not None
    classifier_id = client.get_collection_classifier(target_id)["_id"]
    assert classifier_id is not None

    job_data = client.classifier_train(classifier_id, do_async=False)
    _assert_job_response(job_data, True)

    response = job_data["job_response"]
    assert response is not None
    assert isinstance(response, dict)

    # key in the job response -> type its value must have
    expected_fields = (
        ("average_metrics", dict),
        ("updated_objects", dict),
        ("fit", dict),
        ("model_filename", str),
    )
    for field, field_type in expected_fields:
        assert field in response and isinstance(response[field], field_type)
Пример #10
0
def _test_train_and_predict(collection_title):
    """Train a collection's classifier, then predict by ID and by text.

    Training is started asynchronously and waited for. The first document of
    the collection is then predicted twice: synchronously by document ID and
    asynchronously by its raw text. Both predictions must be equal and
    well-formed.

    Args:
        collection_title: Title passed to ``common.get_collection`` to find
            the collection under test.

    Returns:
        The prediction obtained for the first document via its ID: a dict
        with a ``"doc"`` list of labels and a ``"ner"`` list of
        ``[start, end, label]`` entries.
    """
    client = common.login_with_test_user(common.client())

    collection = common.get_collection(client, collection_title)
    assert collection is not None
    collection_id = collection["_id"]
    assert collection_id is not None
    labels = collection["labels"]
    assert labels is not None and len(labels) > 0
    classifier_id = client.get_collection_classifier(collection_id)["_id"]
    assert classifier_id is not None
    first_document = client.get_collection_documents(collection_id,
                                                     truncate=False)[0]
    document_id = first_document["_id"]
    document_text = first_document["text"]
    assert document_text.startswith("Thousands of demonstrators have ")

    # train async
    train_job_data = client.classifier_train(classifier_id, do_async=True)
    train_job_id = _assert_job_response(train_job_data, False)
    common.wait_for_job_to_finish(client,
                                  classifier_id,
                                  train_job_id,
                                  max_wait_seconds=120)
    status = client.get_classifier_status(classifier_id)
    _check_pipeline_status(status, classifier_id)
    assert status["job_response"]["has_trained"]
    train_job_results = client.get_classifier_job_results(
        classifier_id, train_job_id)
    assert train_job_results is not None and isinstance(
        train_job_results, dict)

    # predict from ID sync
    prediction_job_data = client.classifier_predict(classifier_id,
                                                    [document_id], [],
                                                    do_async=False)
    # the sync response is validated here; its job ID is not needed afterwards
    _assert_job_response(prediction_job_data, True)
    docs_by_id = prediction_job_data["job_response"]["documents_by_id"]
    texts = prediction_job_data["job_response"]["texts"]
    assert docs_by_id.keys() == {document_id}
    prediction_from_id = docs_by_id[document_id]
    assert len(texts) == 0

    # predict from text async
    prediction_job_data = client.classifier_predict(classifier_id, [],
                                                    [document_text],
                                                    do_async=True)
    prediction_job_id = _assert_job_response(prediction_job_data, False)
    common.wait_for_job_to_finish(client,
                                  classifier_id,
                                  prediction_job_id,
                                  max_wait_seconds=120)
    prediction_job_data = client.get_classifier_job_results(
        classifier_id, prediction_job_id)
    assert prediction_job_data is not None and isinstance(
        prediction_job_data, dict)
    docs_by_id = prediction_job_data["documents_by_id"]
    texts = prediction_job_data["texts"]
    assert len(docs_by_id) == 0
    assert len(texts) == 1
    prediction_from_text = texts[0]

    # should be the same
    assert prediction_from_id == prediction_from_text

    # make sure they're in the right format
    assert isinstance(prediction_from_id, dict)
    assert "doc" in prediction_from_id and "ner" in prediction_from_id
    assert isinstance(prediction_from_id["doc"], list)
    for pred in prediction_from_id["doc"]:
        assert isinstance(pred, str)
        assert pred in labels
    assert isinstance(prediction_from_id["ner"], list)
    for pred in prediction_from_id["ner"]:
        # each NER entry is [start, end, label]
        assert isinstance(pred, list) and isinstance(
            pred[0], int) and isinstance(pred[1], int) and isinstance(
                pred[2], str)
        assert pred[0] >= 0 and pred[1] > pred[0]
        assert pred[2] in labels

    return prediction_from_id
Пример #11
0
def test_get_pipeline_status():
    """Validate the status of every pipeline listed in the test data.

    Each status is passed to ``_check_pipeline_status`` with ``None`` as the
    second argument.
    """
    client = common.login_with_test_user(common.client())

    for entry in common.test_pipeline_data():
        _check_pipeline_status(client.get_pipeline_status(entry["_id"]), None)
Пример #12
0
def test_get_and_advance_next_documents(tmp_path):
    """Exercise the next-document / advance-document cycle of a classifier.

    A collection is created from a five-row CSV, five more documents are
    added through the client, and the classifier's next-document queue is
    drained. Every document must be handed out exactly once.

    Args:
        tmp_path: pytest-provided temporary directory used for the CSV file.
    """
    client = common.login_with_test_user(common.client())
    pipeline_id = [
        p for p in client.get_pipelines() if p["name"].lower() == "spacy"
    ][0]["_id"]
    assert pipeline_id is not None
    my_id = client.get_my_user_id()
    assert my_id is not None

    # write documents CSV
    documents_file = tmp_path / "documents.csv"
    with open(documents_file, "w", newline="") as csvfile:
        writer = csv.writer(csvfile)
        for i in range(5):
            writer.writerow(["This is document number {}.".format(i)])

    collection = client.collection_builder() \
        .viewer(my_id) \
        .annotator(my_id) \
        .label("label") \
        .title("Collection to Test Next Document") \
        .description("This is a collection for pytest to test the next/advance documents feature.") \
        .classifier(pipeline_id, train_every=100, overlap=1) \
        .document_csv_file(documents_file, has_header=False, text_column=0)
    collection_id = client.create_collection(collection)
    assert collection_id is not None

    # Everything that touches the new collection runs inside the try block so
    # that a failing call or assertion cannot skip the cleanup below.
    try:
        document_ids = [
            d["_id"]
            for d in client.get_collection_documents(collection_id, True, 1)
        ]
        assert len(document_ids) == 5

        classifier = client.get_collection_classifier(collection_id)
        assert classifier is not None
        classifier_id = classifier["_id"]
        assert classifier_id is not None

        # add more documents
        document_ids += [
            client.add_document(collection_id=collection_id,
                                overlap=1,
                                text="This is document number {}.".format(i))
            for i in range(5, 10)
        ]
        assert len(document_ids) == 10
        for document_id in document_ids:
            assert type(document_id) is str

        next_ids = []
        next_id = client.get_next_document(classifier_id)
        while next_id is not None:
            assert type(next_id) is str
            assert next_id not in next_ids  # no duplicates
            next_ids.append(next_id)
            assert len(next_ids) <= len(
                document_ids)  # sanity check to prevent an infinite loop
            updated_document = client.advance_next_document(
                classifier_id, next_id)
            assert type(updated_document) is dict
            next_id = client.get_next_document(classifier_id)
        assert set(document_ids) == set(next_ids)
    finally:
        client.archive_collection(collection_id)