def build_matches(cols, manifest_object):
    """
    Attach to every column descriptor the match ids found for it in a manifest.

    The manifest is read line by line. Each line's match is filed under the
    line's "QueryableColumns" value: for a single-column line only the first
    element of "MatchId" is kept, otherwise the whole "MatchId" value is kept.

    Each item of ``cols`` is then copied into a new dict that also carries a
    "MatchIds" entry. The bucket is looked up by the item's "Columns" joined
    with the composite token when "Columns" is present, or by its "Column"
    value otherwise. A descriptor with no bucket in the manifest raises
    ``KeyError``.

    Input example:
    [{"Column":"customer_id", "Type":"Simple"}]
    Output example:
    [{"Column":"customer_id", "Type":"Simple", "MatchIds":[123, 234]}]
    """
    COMPOSITE_MATCH_TOKEN = "_S3F2COMP_"

    def bucket_key(col):
        # Composite descriptors are keyed by their joined column names.
        if "Columns" in col:
            return COMPOSITE_MATCH_TOKEN.join(col["Columns"])
        return col["Column"]

    manifest = fetch_manifest(manifest_object)
    grouped = {}
    for entry in json_lines_iterator(manifest):
        bucket = grouped.setdefault(entry["QueryableColumns"], [])
        if len(entry["Columns"]) == 1:
            # Simple match: unwrap the single value from its list.
            bucket.append(entry["MatchId"][0])
        else:
            bucket.append(entry["MatchId"])

    # "MatchIds" comes first so that keys already present in the descriptor
    # take precedence when it is unpacked.
    return [{"MatchIds": grouped[bucket_key(col)], **col} for col in cols]
def clear_deletion_queue(job):
    """
    Delete from the queue table every item referenced by the job's manifests.

    The "DeletionQueueItemId" of each manifest line is collected into a set,
    so an id that appears on several lines is deleted only once. All deletes
    are then issued through a single batch writer on ``q_table``. A job with
    no "Manifests" key results in no deletes.
    """
    logger.info("Clearing successfully deleted matches")

    item_ids = set()
    for manifest_object in job.get("Manifests", []):
        manifest = fetch_job_manifest(manifest_object)
        item_ids.update(
            entry["DeletionQueueItemId"] for entry in json_lines_iterator(manifest)
        )

    with q_table.batch_writer() as writer:
        for queue_item_id in item_ids:
            writer.delete_item(Key={"DeletionQueueItemId": queue_item_id})
def test_it_iterates_over_json_lines():
    """json_lines_iterator returns a generator yielding one object per line."""
    payload = '{"hello":123,"world":true}\n{"hello":456,"world":false}\n'
    expected = [
        {"hello": 123, "world": True},
        {"hello": 456, "world": False},
    ]

    lines = json_lines_iterator(payload)

    # Check the type before consuming the generator.
    assert isinstance(lines, types.GeneratorType)
    assert list(lines) == expected
def test_it_iterates_over_json_lines_with_unparsed():
    """With include_unparsed=True each item pairs the object with its raw line."""
    payload = '{"hello":123,"world":true}\n{"hello":456,"world":false}\n'

    lines = json_lines_iterator(payload, include_unparsed=True)
    assert isinstance(lines, types.GeneratorType)

    # Consume the generator once, then split the pairs into two lists.
    pairs = [(obj, raw) for obj, raw in lines]
    parsed_objects = [obj for obj, _ in pairs]
    raw_lines = [raw for _, raw in pairs]

    assert parsed_objects == [
        {"hello": 123, "world": True},
        {"hello": 456, "world": False},
    ]
    assert raw_lines == [
        '{"hello":123,"world":true}',
        '{"hello":456,"world":false}',
    ]
def test_it_raises_exception_for_invalid_json():
    """A line that is not valid JSON raises ValueError with a descriptive message."""
    payload = '{"hello":123,"world":true}\nNOT_VALID\n'
    expected_message = (
        "Serialization error when parsing JSON lines: "
        "Expecting value: line 2 column 1 (char 0)"
    )

    with pytest.raises(ValueError) as excinfo:
        # The generator must be consumed for the error to surface.
        list(json_lines_iterator(payload))

    assert excinfo.value.args[0] == expected_message