Exemplo n.º 1
0
async def stats(request):
    """Stats endpoint for retrieving classification statistics.

    Resolves the datastore for the ``clientId`` request header and replies
    with its ``stats`` as JSON; while the datastore reports no result an
    empty JSON object is returned instead.

    ---
    description: Supplies information regarding the records processed.
    tags:
    - Results
    produces:
    - application/json
    responses:
        "200":
            description: successful operation. Return duple statistics.
    parameters:
      - in: header
        name: clientId
        schema:
          type: string
          format: uuid
        required: true

    """
    datastore = get_datastore(request.headers.get("clientId"))
    # ``stats`` is only read once the datastore says it has a result.
    payload = datastore.stats if datastore.has_result else {}
    return web.json_response(payload)
Exemplo n.º 2
0
async def training_get(request):
    """Training endpoint for retrieving sample data.

    Resolves the datastore for the ``clientId`` request header. While its
    ``training_rounds`` counter is at most 4 the awaited result of
    ``datastore.get_pairs()`` is returned as JSON; afterwards an empty JSON
    list is returned.

    ---
    description: Supplies data used to train the duple data matching model.
    tags:
    - Training
    produces:
    - application/json
    responses:
        "200":
            description: successful operation. Return unlabeled training records.
    parameters:
      - in: header
        name: clientId
        schema:
          type: string
          format: uuid
        required: true

    """
    logger.debug("Training data request recieved")
    datastore = get_datastore(request.headers.get("clientId"))

    # NOTE(review): training_post compares with ``< 4`` while this handler
    # uses ``<= 4`` -- confirm the one-round difference is intentional.
    if datastore.training_rounds <= 4:
        return web.json_response(await datastore.get_pairs())
    return web.json_response([])
Exemplo n.º 3
0
async def consumer(queue):
    """Consume scheduled work items forever, dispatching each by message type.

    Every message taken from ``queue`` is routed to the datastore that
    ``ds.get_datastore`` returns for the message's ``client_id``:

    * ``MODEL``  -- ``datastore.model(filepath, client_id)`` then ``dedupe()``
    * ``SAMPLE`` -- ``datastore.sample(filepath)``
    * ``LABEL``  -- ``datastore.pairs(labeled_pairs)``
    * ``TRAIN``  -- ``datastore.train(client_id)``
    * ``DEDUPE`` -- ``datastore.dedupe()``

    A message of any other type is logged and skipped.

    Args:
        queue: Queue offering an awaitable ``get()`` and a ``task_done()``
            method (presumably an ``asyncio.Queue`` -- confirm with caller).

    The loop never returns normally. An exception raised while handling a
    message still propagates to whoever runs this coroutine.
    """
    while True:
        message = await queue.get()
        try:
            datastore = ds.get_datastore(message.client_id)
            message_type = message.message_type
            logger.info(
                "Processing [%s] work item for client_id [%s] message_id [%s]",
                message_type.name,
                message.client_id,
                message.message_id,
            )
            if message_type == MessageType.MODEL:
                await datastore.model(message.data.get("filepath"),
                                      message.client_id)
                await datastore.dedupe()
            elif message_type == MessageType.SAMPLE:
                await datastore.sample(message.data.get("filepath"))
            elif message_type == MessageType.LABEL:
                await datastore.pairs(message.data.get("labeled_pairs"))
            elif message_type == MessageType.TRAIN:
                await datastore.train(message.client_id)
            elif message_type == MessageType.DEDUPE:
                await datastore.dedupe()
            else:
                logger.warning(
                    "Ignoring work item with unhandled message type [%s]",
                    message_type,
                )
        finally:
            # Acknowledge exactly once per get(), including for unhandled
            # types and failed handlers; otherwise the queue's unfinished
            # count never drops and anything awaiting queue.join() hangs.
            queue.task_done()
Exemplo n.º 4
0
def _clustered_json(result):
    """Serialise the rows of ``result`` whose ``cluster_id`` is positive.

    The selected rows are sorted by ``cluster_id`` and rendered with
    ``to_json(orient="table")``. ``result`` is presumably a pandas
    DataFrame -- confirm against the datastore implementation.
    """
    clustered = result[result.cluster_id > 0].sort_values("cluster_id")
    return clustered.to_json(orient="table")


async def results(request):
    """Results endpoint for retrieving classified data.

    Resolves the datastore for the ``clientId`` request header.

    * If the datastore already has a non-empty result, the clustered rows
      are returned as JSON straight away.
    * Otherwise, when the "training" status is set, a
      ``{"training_complete": True}`` message is put on
      ``app["message_queue"]``; when the "dedupe" status is set and the
      result is non-empty, the clustered rows are returned.
    * In every remaining case an empty JSON object is returned.

    ---
    description: Supplies clustered data containing duplicates.
    tags:
    - Results
    produces:
    - application/json
    responses:
        "200":
            description: successful operation. Return labeled results.
    parameters:
      - in: header
        name: clientId
        schema:
          type: string
          format: uuid
        required: true

    """
    client_id = request.headers.get("clientId")
    datastore = get_datastore(client_id)

    if datastore.has_result and datastore.result.size > 0:
        return web.Response(body=_clustered_json(datastore.result),
                            content_type="application/json")

    if await datastore.get_status("training"):
        message = message_wrapper(client_id, {"training_complete": True})
        await app["message_queue"].put(message)
    if await datastore.get_status("dedupe") and datastore.result.size > 0:
        return web.Response(body=_clustered_json(datastore.result),
                            content_type="application/json")

    # Unconditional fallback: the datastore may change while the status
    # lookups above are awaited, and a handler must never fall off the end
    # and return None.
    return web.json_response({})
Exemplo n.º 5
0
async def results_file(request):
    """Results endpoint for retrieving classified data results file.

    Resolves the datastore for the ``clientId`` query-string parameter
    (this handler reads the URL query, not the request headers). When the
    datastore has a result, the rows with a positive ``cluster_id`` are
    sorted by ``cluster_id``, rendered as CSV without the index and sent as
    an attachment named ``relateddata.csv``. Otherwise the reply is an
    empty 503 response.

    ---
    description: Supplies a file containing duplicates found.
    tags:
    - Results
    produces:
    - text/csv
    responses:
        "200":
            description: successful operation. Return labeled results file.
        "503":
            description: no results are available for this client.
    parameters:
      - in: query
        name: clientId
        schema:
          type: string
          format: uuid
        required: true

    """
    params = request.rel_url.query
    datastore = get_datastore(params.get("clientId"))
    if not datastore.has_result:
        return web.Response(status=503)

    result = datastore.result
    result = (
        result[result.cluster_id > 0].sort_values("cluster_id").to_csv(
            mode="wb", index=False))
    return web.Response(
        headers=MultiDict({
            "Content-Disposition":
            'attachment; filename="relateddata.csv"'
        }),
        body=result,
        # Send the media type that the endpoint documentation promises.
        content_type="text/csv",
    )
Exemplo n.º 6
0
def test_delete_empty_datastores(modify_repository, existing_datastore,
                                 new_datastore):
    """After deleting "test234", looking it up equals ``new_datastore``.

    The three fixtures are defined outside this view; ``modify_repository``
    and ``existing_datastore`` are requested but not referenced in the body.
    """
    delete_datastore("test234")
    fetched = get_datastore("test234")
    assert fetched == new_datastore
Exemplo n.º 7
0
def test_get_existing_datastore(modify_repository, existing_datastore,
                                new_datastore):
    """Looking up "test234" yields the existing datastore, not a new one.

    Also checks that the datastore for "test567" compares unequal to the
    one for "test234". Fixtures are defined outside this view.
    """
    first_lookup = get_datastore("test234")
    assert first_lookup != new_datastore

    second_lookup = get_datastore("test234")
    assert second_lookup == existing_datastore

    # A fresh lookup per comparison, in the same order as the operands.
    other_client = get_datastore("test567")
    same_client = get_datastore("test234")
    assert other_client != same_client
Exemplo n.º 8
0
def test_get_new_datastore(new_datastore):
    """Looking up "test123" yields a datastore equal to ``new_datastore``."""
    fetched = get_datastore("test123")
    assert fetched == new_datastore
Exemplo n.º 9
0
async def training_post(request):
    """Training endpoint for submitting labelled data.

    Requests without a readable body, or whose body is not valid JSON, are
    answered with 400. Otherwise the datastore for the ``clientId`` request
    header is resolved and one message is put on ``app["message_queue"]``:

    * ``{"labeled_pairs": <parsed body>}`` while ``training_rounds`` is
      below 4;
    * ``{"labeling_complete": True}`` once it is 4 or more (the body is not
      parsed in that case).

    The reply is then an empty 201 response.

    ---
    description: Accepts labeled data for training the data matching model.
    tags:
    - Training
    produces:
    - application/json
    responses:
        "201":
            description: Successful operation. Labelled data accepted and queued.
        "400":
            description: Unsuccessful operation. Labelled data not supplied or not valid JSON.
    parameters:
      - in: header
        name: clientId
        schema:
          type: string
          format: uuid
        required: true
      - in: body
        name: body
        description: Labelled training data
        required: true
        schema:
          type: object
          properties:
            match:
              type: array
              items:
                type: array
                items:
                  - $ref: '#/definitions/Person'
                  - $ref: '#/definitions/Person'
            distinct:
              type: array
              items:
                type: array
                items:
                  - $ref: '#/definitions/Person'
                  - $ref: '#/definitions/Person'

    """
    if not (request.body_exists and request.can_read_body):
        return web.Response(status=400)

    logger.debug("Labelled training data recieved.")
    client_id = request.headers.get("clientId")
    datastore = get_datastore(client_id)

    if datastore.training_rounds < 4:
        logger.info("Updating traing pairs for labeling")
        try:
            labeled_pairs = await request.json()
        except ValueError:
            # json.JSONDecodeError is a ValueError: a malformed body is a
            # client error, so answer 400 instead of failing the request.
            logger.warning("Labelled training data is not valid JSON")
            return web.Response(status=400)
        message = message_wrapper(client_id,
                                  {"labeled_pairs": labeled_pairs})
    else:
        message = message_wrapper(client_id, {"labeling_complete": True})

    await app["message_queue"].put(message)
    return web.Response(status=201)