Exemplo n.º 1
0
def build_matches(cols, manifest_object):
    """
    Attach the match ids recorded in a manifest to each column descriptor.

    The manifest is loaded with ``fetch_manifest`` and read line by line
    with ``json_lines_iterator``. Every line is grouped under its
    "QueryableColumns" value. When a line's "Columns" holds exactly one
    entry, only the first element of its "MatchId" is stored; otherwise
    the "MatchId" value is stored as-is.

    Each descriptor in ``cols`` is looked up by its "Columns" entries
    joined with "_S3F2COMP_" when it has a "Columns" key, and by its
    "Column" value otherwise. A descriptor whose lookup key never appears
    in the manifest raises ``KeyError``.

    Input example:
    [{"Column":"customer_id", "Type":"Simple"}]
    Output example:
    [{"Column":"customer_id", "Type":"Simple", "MatchIds":[123, 234]}]
    """
    COMPOSITE_MATCH_TOKEN = "_S3F2COMP_"

    ids_by_key = {}
    for entry in json_lines_iterator(fetch_manifest(manifest_object)):
        bucket = ids_by_key.setdefault(entry["QueryableColumns"], [])
        if len(entry["Columns"]) == 1:
            # Single-column match: keep the bare value, not the wrapper.
            bucket.append(entry["MatchId"][0])
        else:
            bucket.append(entry["MatchId"])

    def lookup_key(col):
        # Composite descriptors are keyed by their joined column names.
        if "Columns" in col:
            return COMPOSITE_MATCH_TOKEN.join(col["Columns"])
        return col["Column"]

    # The descriptor is unpacked last, so its own keys take precedence
    # over the "MatchIds" entry added here.
    return [{"MatchIds": ids_by_key[lookup_key(col)], **col} for col in cols]
def clear_deletion_queue(job):
    """
    Delete from ``q_table`` every queue item referenced by the job manifests.

    Each object listed under the job's "Manifests" key (none if the key is
    absent) is fetched with ``fetch_job_manifest`` and parsed with
    ``json_lines_iterator``. The "DeletionQueueItemId" of every line is
    collected into a set, so each id is deleted at most once, and the
    deletions are issued through a single ``q_table.batch_writer()``.
    """
    logger.info("Clearing successfully deleted matches")
    queue_item_ids = {
        record["DeletionQueueItemId"]
        for manifest_ref in job.get("Manifests", [])
        for record in json_lines_iterator(fetch_job_manifest(manifest_ref))
    }

    with q_table.batch_writer() as writer:
        for queue_item_id in queue_item_ids:
            writer.delete_item(Key={"DeletionQueueItemId": queue_item_id})
def test_it_iterates_over_json_lines():
    """json_lines_iterator returns a generator of one parsed dict per line."""
    expected = [
        {"hello": 123, "world": True},
        {"hello": 456, "world": False},
    ]
    lines = json_lines_iterator(
        '{"hello":123,"world":true}\n{"hello":456,"world":false}\n')
    assert isinstance(lines, types.GeneratorType)
    assert list(lines) == expected
def test_it_iterates_over_json_lines_with_unparsed():
    """With include_unparsed=True each item pairs the parsed dict with its raw line."""
    raw_lines = [
        '{"hello":123,"world":true}',
        '{"hello":456,"world":false}',
    ]
    expected_parsed = [
        {"hello": 123, "world": True},
        {"hello": 456, "world": False},
    ]
    stream = json_lines_iterator(
        '{"hello":123,"world":true}\n{"hello":456,"world":false}\n',
        include_unparsed=True,
    )
    assert isinstance(stream, types.GeneratorType)
    # Drain the generator once, then split the pairs into two columns.
    pairs = list(stream)
    assert [parsed for parsed, _ in pairs] == expected_parsed
    assert [raw for _, raw in pairs] == raw_lines
def test_it_raises_exception_for_invalid_json():
    """A line that is not valid JSON makes the iterator raise ValueError."""
    expected_message = ("Serialization error when parsing JSON lines: "
                        "Expecting value: line 2 column 1 (char 0)")
    with pytest.raises(ValueError) as raised:
        # list() forces the generator to run through the invalid second line.
        list(json_lines_iterator('{"hello":123,"world":true}\nNOT_VALID\n'))
    assert raised.value.args[0] == expected_message