Пример #1
0
async def producer(queue, message):
    """Classify a new client message and enqueue it as a worker work item.

    A message is schedulable only when its type is ``MessageType.NEW`` and it
    carries a truthy ``client_id``. The first truthy key found in
    ``message.data`` (checked in a fixed priority order) decides the work-item
    type; ``message.message_type`` is updated in place before the message is
    put on ``queue``. Anything that cannot be scheduled is logged as an error
    and dropped.

    Args:
        queue: Queue whose awaitable ``put`` receives the re-typed message.
        message: Object exposing ``message_type``, ``client_id``,
            ``message_id`` and a mapping-like ``data`` attribute.
    """
    logger.info(
        "Scheduling [%s] work item for client [%s] message [%s]",
        message.message_type.name,
        message.client_id,
        message.message_id,
    )

    is_new = message.message_type == MessageType.NEW
    if not (is_new and message.client_id):
        logger.error("Unable to schedule message %s invalid", message)
        return

    # Priority order matters: a payload carrying several of these keys is
    # routed by the first one that is truthy.
    routing = (
        ("use_model", MessageType.MODEL),
        ("filepath", MessageType.SAMPLE),
        ("labeled_pairs", MessageType.LABEL),
        ("labeling_complete", MessageType.TRAIN),
        ("training_complete", MessageType.DEDUPE),
    )
    for data_key, work_type in routing:
        if message.data.get(data_key):
            message.message_type = work_type
            # Await the put instead of spawning an unreferenced task: a task
            # nobody holds a reference to may be garbage-collected before it
            # runs, and any failure inside it would go unnoticed.
            await queue.put(message)
            return

    logger.error(
        "Unable to schedule message (%s). Unknown message data %s",
        message.message_id,
        message.data,
    )
Пример #2
0
async def consumer(queue):
    """Consume scheduled work items forever, dispatching on message type.

    Each message taken from ``queue`` is handled by the datastore returned by
    ``ds.get_datastore(message.client_id)``:

    * ``MODEL``  -> ``datastore.model(filepath, client_id)`` then ``dedupe()``
    * ``SAMPLE`` -> ``datastore.sample(filepath)``
    * ``LABEL``  -> ``datastore.pairs(labeled_pairs)``
    * ``TRAIN``  -> ``datastore.train(client_id)``
    * ``DEDUPE`` -> ``datastore.dedupe()``

    ``queue.task_done()`` is called exactly once per retrieved message, even
    when the type is unhandled or the handler raises, so ``queue.join()``
    cannot hang on an item that was never acknowledged. Exceptions raised by
    a handler still propagate to the caller.

    Args:
        queue: Queue providing awaitable ``get`` and a ``task_done`` method.
    """
    while True:
        message = await queue.get()
        try:
            datastore = ds.get_datastore(message.client_id)
            logger.info(
                "Processing [%s] work item for client_id [%s] message_id [%s]",
                message.message_type.name,
                message.client_id,
                message.message_id,
            )
            work_type = message.message_type
            if work_type == MessageType.MODEL:
                await datastore.model(message.data.get("filepath"),
                                      message.client_id)
                await datastore.dedupe()
            elif work_type == MessageType.SAMPLE:
                await datastore.sample(message.data.get("filepath"))
            elif work_type == MessageType.LABEL:
                await datastore.pairs(message.data.get("labeled_pairs"))
            elif work_type == MessageType.TRAIN:
                await datastore.train(message.client_id)
            elif work_type == MessageType.DEDUPE:
                await datastore.dedupe()
            else:
                logger.error(
                    "Unable to process message %s: unhandled message type",
                    message.message_id,
                )
        finally:
            # Always acknowledge the item; otherwise the queue's unfinished
            # counter never reaches zero for skipped or failed messages.
            queue.task_done()
Пример #3
0
def clean_df(df):
    """Return a normalised, all-string copy of ``df``.

    Steps, applied in this order to every column:

    1. cast every value to ``str``;
    2. pass each value through ``unidecode`` and lowercase it;
    3. blank out cells that are exactly ``"nan"``, ``"none"`` or ``"nat"``
       (the stringified missing-value markers);
    4. strip every character that is not an ASCII letter, digit, ``/`` or
       ``-``.

    Args:
        df: pandas DataFrame to clean; it is not modified in place.

    Returns:
        A new DataFrame with the cleaned string values.
    """
    logger.info("Cleaning dataframe")
    df = df.astype(str)
    for column in df.columns:
        # Series.map is used instead of DataFrame.applymap, which is
        # deprecated in recent pandas releases.
        df[column] = df[column].map(unidecode).str.lower()
    df = df.replace({"nan": "", "none": "", "nat": ""})
    for column in df.columns:
        # regex=True must be explicit: since pandas 2.0 str.replace treats the
        # pattern as a literal string by default, which would make this strip
        # a silent no-op.
        df[column] = df[column].str.replace(
            r"[^a-zA-Z0-9\/-]", "", regex=True
        )
    return df
Пример #4
0
def select_fields(fields):
    """Build field-definition dicts from a list of field specs.

    Each entry of ``fields`` is either a plain string (the field name, typed
    as ``"String"``) or a two-item indexable ``(name, type)``.

    Args:
        fields: Iterable of field specs as described above.

    Returns:
        A list of ``{"field": ..., "type": ...}`` dicts, one per spec, in the
        same order as ``fields``.

    Raises:
        ValueError: If a non-string spec does not have exactly two items.
            Previously such a spec silently produced ``None`` in the result.
    """
    logger.info("Generating data field mappings")

    def gen_field(field):
        # isinstance (not type(...) ==) so that str subclasses are still
        # treated as a bare field name rather than indexed character-wise.
        if isinstance(field, str):
            return {"field": field, "type": "String"}
        if len(field) == 2:
            return {"field": field[0], "type": field[1]}
        raise ValueError(
            "Field spec must be a name or a (name, type) pair, got %r"
            % (field,)
        )

    return [gen_field(field) for field in fields]
Пример #5
0
def route_cors(host, app):
    """Enable CORS for ``host`` on every route currently registered on ``app``.

    Args:
        host: Origin used as the key of the CORS defaults mapping.
        app: Application whose router's routes are all passed to ``cors.add``.
    """
    logger.info("Configuring cors")
    resource_options = aiohttp_cors.ResourceOptions(
        allow_credentials=True,
        expose_headers="*",
        allow_headers="*",
    )
    cors = aiohttp_cors.setup(app, defaults={host: resource_options})

    # Iterate over a snapshot of the route table rather than the live view
    # (cors.add may alter the router while we loop — TODO confirm).
    registered_routes = list(app.router.routes())
    for route in registered_routes:
        cors.add(route)
Пример #6
0
def data_cluster(deduper, data_dict, threshold):
    """Cluster records with ``deduper`` and return the result as a DataFrame.

    ``deduper.match(data_dict, threshold)`` is expected to yield one entry per
    cluster, each unpackable into parallel sequences of record ids and
    scores. A cluster's position in that result becomes its ``cluster_id``.

    Args:
        deduper: Object providing ``match(data_dict, threshold)``.
        data_dict: Record data handed to ``deduper.match`` unchanged.
        threshold: Match threshold handed to ``deduper.match`` unchanged.

    Returns:
        A DataFrame indexed by ``id`` with ``cluster_id`` and ``confidence``
        columns. When no duplicates are found the frame is empty but still has
        that index and those columns.
    """
    logger.debug("Clustering data")
    duplicates = deduper.match(data_dict, threshold)
    logger.info("Duplicate records found: %d", len(duplicates))

    df_data = [
        {"id": record_id, "cluster_id": cluster_id, "confidence": score}
        for cluster_id, records in enumerate(duplicates)
        for record_id, score in zip(*records)
    ]

    # Name the columns explicitly: a frame built from an empty list has no
    # "id" column, and set_index("id") would raise KeyError.
    clustered_df = pd.DataFrame(
        df_data, columns=["id", "cluster_id", "confidence"]
    )
    return clustered_df.set_index("id")
Пример #7
0
def deduplicate(
    df,
    recall_weight=1,
    sample_size=0.3,
    settings_file="training-data/dedupe_learned_settings",
    training_file="training-data/dedupe_training.json",
):
    """Cluster duplicate rows of ``df`` and return it joined with the clusters.

    If ``settings_file`` exists, a ``dedupe.StaticDedupe`` is loaded from it.
    Otherwise a ``dedupe.Dedupe`` is built from the frame's columns, sampled,
    optionally primed from ``training_file``, labelled interactively on the
    console, trained, and both files are (re)written.

    Args:
        df: Input DataFrame; its columns define the dedupe fields.
        recall_weight: Forwarded to ``deduper.threshold``.
        sample_size: Fraction of the prepared records to sample for training.
        settings_file: Path of the learned-settings file to load or write.
        training_file: Path of the labelled-training file to read and write.

    Returns:
        The prepared frame left-joined with the cluster frame, with the
        ``dictionary`` column dropped.
    """
    column_names = df.columns
    df, records = data_prep(df)

    if not os.path.exists(settings_file):
        # No learned settings yet: train a fresh model interactively.
        deduper = dedupe.Dedupe(select_fields(column_names))
        n_samples = math.floor(len(records) * sample_size)

        logger.info("Extracting data sample of %s records", n_samples)
        deduper.sample(records, n_samples)

        if os.path.exists(training_file):
            logger.info("Reading training examples from: %s", training_file)
            with open(training_file, "rb") as existing_training:
                deduper.readTraining(existing_training)

        logger.info("Starting active labeling")
        dedupe.consoleLabel(deduper)
        deduper.train()

        # Persist both the labelled examples and the learned settings.
        with open(training_file, "w") as training_out:
            deduper.writeTraining(training_out)
        with open(settings_file, "wb") as settings_out:
            deduper.writeSettings(settings_out)
    else:
        logger.info("Existing settings found. Loading from: %s", settings_file)
        with open(settings_file, "rb") as settings_in:
            deduper = dedupe.StaticDedupe(settings_in)

    cutoff = deduper.threshold(records, recall_weight=recall_weight)
    clusters = data_cluster(deduper, records, cutoff)

    joined = df.join(clusters, how="left")
    return joined.drop(["dictionary"], axis=1)
Пример #8
0
async def on_shutdown(app, signal=None):
    """Cancel every other asyncio task and wait for them to finish.

    Args:
        app: Application being shut down (not used by this handler).
        signal: Optional value; when truthy, receipt of an exit signal is
            logged.
    """
    if signal:
        logger.info("Received exit signal, shutting down.")

    # Everything except the task running this coroutine.
    pending = asyncio.all_tasks() - {asyncio.current_task()}

    logger.info(f"Cancelling {len(pending)} outstanding tasks")
    for task in pending:
        task.cancel()

    # return_exceptions=True so cancellations are collected instead of raised.
    await asyncio.gather(*pending, return_exceptions=True)
    logger.info("Stopping")
Пример #9
0
def run():
    """Start the duple web service and log once ``web.run_app`` returns."""
    logger.info("Starting duple.")
    application = create_app()
    web.run_app(application)
    logger.info("Duple successfully shutdown.")
Пример #10
0
# NOTE(review): the docstring below carries a YAML section after "---" that
# is presumably consumed by an API-doc generator, so it is left byte-for-byte
# unchanged. It documents a "200" response while the handler returns 201 —
# confirm which one is intended.
async def training_post(request):
    """Training endpoint for submitting labelled data.

    ---
    description: Accepts labeled data for training the data matching model.
    tags:
    - Training
    produces:
    - application/json
    responses:
        "200":
            description: Successful operation. Return further training records.
        "400":
            description: Unsuccessful operation. Labelled date not supplied.
    parameters:
      - in: header
        name: clientId
        schema:
          type: string
          format: uuid
        required: true
      - in: body
        name: body
        description: Labelled training data
        required: true
        schema:
          type: object
          properties:
            match:
              type: array
              items:
                type: array
                items:
                  - $ref: '#/definitions/Person'
                  - $ref: '#/definitions/Person'
            distinct:
              type: array
              items:
                type: array
                items:
                  - $ref: '#/definitions/Person'
                  - $ref: '#/definitions/Person'

    """
    # Guard clause: without a readable body there is nothing to schedule.
    if not (request.body_exists and request.can_read_body):
        return web.Response(status=400)

    logger.debug("Labelled training data recieved.")
    client_id = request.headers.get("clientId")
    datastore = get_datastore(client_id)

    # While fewer than 4 training rounds have been recorded, forward the
    # submitted pairs for labelling; afterwards signal that labelling is done.
    if datastore.training_rounds < 4:
        logger.info("Updating traing pairs for labeling")
        payload = {"labeled_pairs": await request.json()}
    else:
        payload = {"labeling_complete": True}

    message = message_wrapper(client_id, payload)
    # Reach the queue through request.app: no name `app` is bound inside this
    # handler, so the bare `app[...]` lookup relied on an unrelated global.
    await request.app["message_queue"].put(message)
    return web.Response(status=201)
Пример #11
0
def console_deduplicate(filename):
    """Deduplicate the CSV at ``filename`` and write ``relateddata.csv``.

    Args:
        filename: Path of the CSV file to read with ``pd.read_csv``.
    """
    logger.info("Starting console deduplicator")
    deduplicated = deduplicate(pd.read_csv(filename))
    logger.info("Writing results file to relateddata.csv")
    deduplicated.to_csv("relateddata.csv", index=False)