def main():
    """Run the corona_data_collector flows on a DB sample, then the COVID19-ISRAEL scripts.

    Steps, in order:

    1. Write a ``.netrc`` holding the ``DOMAIN`` credentials into a temporary
       directory and point ``HOME`` at it (presumably so the HTTP downloads
       below authenticate -- confirm against ``utils.http_stream_download``).
    2. Download the cached GPS datapackage files from ``DOMAIN``.
    3. Run the gdrive download, DB load, GPS enrichment and both export flows.
       GPS lookups are replaced by a callback returning random coordinates.
    4. Restore ``HOME`` and run the ``get_raw_data`` / ``preprocess_raw_data``
       modules of the sibling ``../COVID19-ISRAEL`` checkout.

    Raises:
        KeyError: if ``GOOGLE_SERVICE_ACCOUNT_FILE`` is not set.
        subprocess.CalledProcessError: if either COVID19-ISRAEL script fails.
    """
    with tempfile.TemporaryDirectory() as tempdir:
        with open(os.path.join(tempdir, ".netrc"), "w") as f:
            f.write("machine %s\nlogin %s\npassword %s\n" % (DOMAIN, AUTH_USER, AUTH_PASSWORD))
        # HOME may legitimately be unset; remember that so it can be removed again.
        original_home = os.environ.get("HOME")
        os.environ["HOME"] = tempdir
        try:
            os.makedirs("data/corona_data_collector/gps_data_cache", exist_ok=True)
            utils.http_stream_download(
                "data/corona_data_collector/gps_data_cache/datapackage.json",
                {"url": "https://%s/data/corona_data_collector/gps_data_cache/datapackage.json" % DOMAIN},
            )
            utils.http_stream_download(
                "data/corona_data_collector/gps_data_cache/gps_data.csv",
                {"url": "https://%s/data/corona_data_collector/gps_data_cache/gps_data.csv" % DOMAIN},
            )
            # Both field lists come from the same pipeline step, so look it up once.
            gps_parameters = utils.get_parameters_from_pipeline_spec(
                "pipeline-spec.yaml",
                "corona_data_collector",
                "corona_data_collector.add_gps_coordinates",
            )
            Flow(
                download_gdrive_data.flow({
                    "limit_rows": 5000,
                    "files_dump_to_path": "data/corona_data_collector/gdrive_data",
                    "google_drive_csv_folder_id": "1pzAyk-uXy__bt1tCX4rpTiPZNmrehTOz",
                    "file_sources": {
                        "COVID-19-English.csv": "google",
                        "COVID-19-Russian.csv": "google",
                        "COVID-19-Hebrew.csv": "hebrew_google",
                    },
                }),
                load_from_db.flow({
                    # Sample of disjoint id ranges spread over the table.
                    "where": " (id > 500 and id < 1000 ) "
                             "or (id > 180000 and id < 185000) "
                             "or (id > 321000 and id < 322000) "
                             "or (id > 462000 and id < 463000) "
                             "or (id > 600000 and id < 601000) "
                             "or (id > 640000 and id < 641000) "
                             "or (id > 670000 and id < 670500) "
                             # Was "id < 849000 and id < 855000", which matched
                             # every row below 849000 instead of this range.
                             "or (id > 849000 and id < 855000) "
                             "or (id > 860000 and id < 865000) ",
                    "filter_db_row_callback": _filter_db_row_callback,
                }),
                add_gps_coordinates.flow({
                    "source_fields": gps_parameters["source_fields"],
                    "workplace_source_fields": gps_parameters["workplace_source_fields"],
                    "dump_to_path": "data/corona_data_collector/with_gps_data",
                    "gps_datapackage_path": "data/corona_data_collector/gps_data_cache",
                    # Random coordinates stand in for a real lookup.
                    "get-coords-callback": lambda street, city: (
                        random.uniform(29, 34),
                        random.uniform(34, 36),
                        int(street != city),
                    ),
                }),
                export_corona_bot_answers.flow({
                    "destination_output": "data/corona_data_collector/corona_bot_answers",
                }),
                export_corona_bot_answers.flow({
                    "unsupported": True,
                    "destination_output": "data/corona_data_collector/corona_bot_answers_unsupported",
                }),
            ).process()
        finally:
            # Always undo the HOME override, even when a download or the flow fails.
            if original_home is None:
                os.environ.pop("HOME", None)
            else:
                os.environ["HOME"] = original_home
        subprocess.check_call(
            ["python3", "-m", "src.utils.get_raw_data"],
            cwd="../COVID19-ISRAEL",
            env={
                **os.environ,
                # Explicit lookup: a missing variable raises KeyError here,
                # before the subprocess is started.
                "GOOGLE_SERVICE_ACCOUNT_FILE": os.environ["GOOGLE_SERVICE_ACCOUNT_FILE"],
                "AVIDCOVIDER_LOCAL_PATH": os.getcwd(),
            },
        )
        subprocess.check_call(
            ["python3", "-m", "src.utils.preprocess_raw_data"],
            cwd="../COVID19-ISRAEL",
            env={**os.environ},
        )
    logging.info("Great Success!")
def test_expected_contact_with_patient():
    """Check that known DB rows are exported with the expected isolation status.

    Six fixed row ids are loaded from the DB, run through the GPS step (with
    random coordinates) and exported. The two exported CSVs are then scanned:
    rows whose ``isolation`` equals the "contact-with-patient" or
    "back-from-abroad" code are collected, and both the counts and the id
    sets must match the fixtures.
    """
    print("test_expected_contact_with_patient")
    back_from_abroad_db = [169603, 169632, 169813]
    contact_with_patient_db = [10722, 10715, 10697]

    def _random_coords(street, city):
        # Stand-in for a real GPS lookup.
        return random.uniform(29, 34), random.uniform(34, 36), int(street != city)

    requested_ids = back_from_abroad_db + contact_with_patient_db
    db_step = load_from_db.flow({
        "where": "id in (%s)" % ", ".join(str(row_id) for row_id in requested_ids),
    })
    gps_step = add_gps_coordinates.flow({
        "source_fields": get_parameters_from_pipeline_spec(
            "pipeline-spec.yaml",
            "corona_data_collector",
            "corona_data_collector.add_gps_coordinates",
        )["source_fields"],
        "get-coords-callback": _random_coords,
    })
    export_step = export_corona_bot_answers.flow({
        "destination_output": "data/corona_data_collector/destination_output",
    })
    Flow(db_step, gps_step, export_step).process()

    insulation_codes = values_to_convert['insulation_status']
    contact_with_patient_key = insulation_codes['contact-with-patient']
    back_from_abroad_key = insulation_codes['back-from-abroad']
    contact_with_patient_array = []
    back_from_abroad_array = []
    counts = {"contact_with_patient": 0, "back_from_abroad": 0}

    # The parameter must stay named ``row``: dataflows uses the parameter
    # name to decide how a plain function is applied.
    def _check_row(row):
        isolation = int(row["isolation"])
        if isolation == contact_with_patient_key:
            counts["contact_with_patient"] += 1
            contact_with_patient_array.append(int(row["id"]))
        if isolation == back_from_abroad_key:
            exported_id = int(row["id"])
            assert exported_id in back_from_abroad_db
            counts["back_from_abroad"] += 1
            back_from_abroad_array.append(exported_id)

    output_dir = 'data/corona_data_collector/destination_output/'
    Flow(
        load(output_dir + 'corona_bot_answers_25_3_2020_with_coords.csv'),
        load(output_dir + 'corona_bot_answers_22_3_2020_with_coords.csv'),
        _check_row,
    ).process()

    assert 3 == counts["contact_with_patient"], str(counts)
    assert 3 == counts["back_from_abroad"], str(counts)
    assert set(back_from_abroad_array) == set(back_from_abroad_db)
    assert set(contact_with_patient_array) == set(contact_with_patient_db)
    print("OK")
def get_db_test_row(version=None, field_name=None, value=None, where=None,
                    show_fields=None, limit_rows=10, db_dump_to_path=None):
    """Load a few DB rows matching the given filters, export them and print them.

    Args:
        version: if truthy, adds the condition ``data->>'version' = '<version>'``.
        field_name: JSON field to filter on; only used when ``value`` is also
            truthy. Also the default printed field when ``show_fields`` is empty.
        value: required value of ``field_name``.
        where: optional list of extra SQL conditions; all conditions are
            joined with ``and``. The list is not modified.
        show_fields: extra fields to print besides ``__id`` and ``__created``.
        limit_rows: passed to ``load_from_db`` as ``limit_rows``.
        db_dump_to_path: if truthy, passed to ``load_from_db`` as ``dump_to_path``.

    Returns:
        None. Output goes to the export directory and to ``printer``.
    """
    # Copy, so appending below never mutates a list owned by the caller.
    conditions = list(where) if where else []
    # NOTE: values are interpolated into SQL unescaped -- only pass trusted
    # test input here.
    if version:
        conditions.append("data->>'version' = '%s'" % version)
    if field_name and value:
        conditions.append("data->>'%s' = '%s'" % (field_name, value))
    where_clause = " and ".join(conditions)

    if not show_fields:
        show_fields = [field_name] if field_name else []

    load_parameters = {"where": where_clause, "limit_rows": limit_rows}
    if db_dump_to_path:
        load_parameters["dump_to_path"] = db_dump_to_path

    # Both field lists come from the same pipeline step, so look it up once.
    gps_parameters = get_parameters_from_pipeline_spec(
        "pipeline-spec.yaml",
        "corona_data_collector",
        "corona_data_collector.add_gps_coordinates",
    )
    Flow(
        load_from_db.flow(load_parameters),
        add_gps_coordinates.flow({
            "source_fields": gps_parameters["source_fields"],
            "workplace_source_fields": gps_parameters["workplace_source_fields"],
            # Random coordinates stand in for a real lookup.
            "get-coords-callback": lambda street, city: (
                random.uniform(29, 34),
                random.uniform(34, 36),
                int(street != city),
            ),
        }),
        export_corona_bot_answers.flow({
            "destination_output": "data/corona_data_collector/destination_output",
        }),
        printer(fields=["__id", "__created", *show_fields]),
    ).process()
def test_isolated_total_count():
    """Check the number of exported rows that are flagged as isolated.

    The fixed list of DB ids below is loaded, run through the GPS step (with
    random coordinates) and exported. Every row in the 25_3_2020 CSV with
    ``isolation`` greater than zero must belong to that list, and there must
    be exactly 468 such rows.
    """
    print("test_isolated_total_count")
    db_isolated_id = [
        169603, 169630, 169632, 169637, 169690, 169728, 169753, 169813, 169829, 169837,
        169882, 169924, 169930, 170014, 170042, 170064, 170067, 170097, 170099, 170127,
        170184, 170223, 170234, 170244, 170263, 170272, 170289, 170322, 170326, 170328,
        170350, 170370, 170390, 170414, 170428, 170432, 170436, 170438, 170442, 170448,
        170453, 170478, 170479, 170621, 170629, 170685, 170735, 170744, 170777, 170811,
        170878, 170886, 170903, 170929, 170936, 170962, 170970, 170989, 171009, 171018,
        171078, 171097, 171123, 171127, 171132, 171133, 171142, 171158, 171162, 171200,
        171201, 171230, 171256, 171268, 171283, 171288, 171290, 171302, 171323, 171337,
        171342, 171374, 171399, 171440, 171472, 171499, 171506, 171541, 171571, 171590,
        171599, 171615, 171686, 171718, 171720, 171753, 171823, 171865, 171900, 171904,
        171907, 171991, 172048, 172076, 172153, 172155, 172163, 172165, 172218, 172225,
        172231, 172233, 172236, 172263, 172276, 172277, 172316, 172367, 172373, 172406,
        172419, 172458, 172483, 172491, 172492, 172505, 172511, 172537, 172542, 172594,
        172596, 172629, 172637, 172638, 172644, 172716, 172727, 172733, 172749, 172750,
        172789, 172797, 172808, 172810, 172894, 172923, 172925, 172952, 172956, 172972,
        172995, 173006, 173077, 173087, 173112, 173177, 173178, 173186, 173199, 173211,
        173222, 173272, 173275, 173335, 173336, 173377, 173436, 173466, 173507, 173524,
        173579, 173671, 173768, 173816, 173965, 173973, 173979, 173980, 174018, 174040,
        174049, 174055, 174063, 174082, 174084, 174095, 174099, 174144, 174146, 174167,
        174202, 174206, 174232, 174236, 174239, 174242, 174258, 174259, 174263, 174267,
        174271, 174295, 174313, 174332, 174350, 174359, 174369, 174372, 174374, 174394,
        174405, 174411, 174443, 174456, 174470, 174496, 174506, 174511, 174541, 174617,
        174652, 174744, 174768, 174779, 174813, 174830, 174840, 174850, 174859, 174865,
        174890, 174910, 174997, 175018, 175025, 175027, 175056, 175128, 175154, 175159,
        175167, 175179, 175235, 175280, 175290, 175332, 175339, 175373, 175424, 175443,
        175455, 175465, 175470, 175492, 175503, 175519, 175537, 175542, 175628, 175644,
        175684, 175691, 175730, 175765, 175773, 175790, 175831, 175849, 175857, 175863,
        175880, 175883, 175887, 175894, 175908, 175976, 176035, 176040, 176046, 176076,
        176124, 176132, 176198, 176202, 176211, 176241, 176288, 176300, 176340, 176364,
        176386, 176408, 176435, 176453, 176466, 176478, 176490, 176501, 176534, 176574,
        176613, 176617, 176674, 176681, 176804, 176825, 176827, 176860, 176889, 176926,
        176930, 177008, 177045, 177107, 177113, 177118, 177122, 177136, 177207, 177211,
        177238, 177296, 177363, 177381, 177409, 177418, 177426, 177512, 177559, 177575,
        177608, 177627, 177721, 177732, 177780, 177798, 177810, 177865, 177870, 177905,
        177945, 177947, 177953, 178091, 178118, 178138, 178186, 178217, 178252, 178289,
        178304, 178328, 178420, 178508, 178511, 178517, 178525, 178551, 178603, 178604,
        178681, 178700, 178713, 178742, 178750, 178756, 178781, 178792, 178836, 178848,
        178867, 178881, 178910, 178939, 178955, 179016, 179033, 179065, 179066, 179074,
        179160, 179185, 179212, 179225, 179250, 179270, 179281, 179294, 179338, 179376,
        179418, 179480, 179492, 179549, 179594, 179621, 179661, 179664, 179669, 179683,
        179702, 179714, 179758, 179768, 179769, 179888, 179982, 180002, 180010, 180021,
        180027, 180044, 180074, 180123, 180125, 180131, 180136, 180145, 180169, 180198,
        180271, 180284, 180383, 180394, 180438, 180448, 180478, 180505, 180511, 180553,
        180575, 180579, 180587, 180629, 180725, 180747, 180795, 180798, 180840, 180888,
        180941, 180943, 180944, 180964, 180991, 181023, 181037, 181049, 181120, 181162,
        181164, 181192, 181218, 181220, 181230, 181252, 181304, 181326, 181339, 181410,
        181445, 181483, 181520, 181555, 181562, 181599, 181630, 181665
    ]

    def _random_coords(street, city):
        # Stand-in for a real GPS lookup.
        return random.uniform(29, 34), random.uniform(34, 36), int(street != city)

    db_step = load_from_db.flow({
        "where": "id in (%s)" % ", ".join(str(row_id) for row_id in db_isolated_id),
    })
    gps_step = add_gps_coordinates.flow({
        "source_fields": get_parameters_from_pipeline_spec(
            "pipeline-spec.yaml",
            "corona_data_collector",
            "corona_data_collector.add_gps_coordinates",
        )["source_fields"],
        "get-coords-callback": _random_coords,
    })
    export_step = export_corona_bot_answers.flow({
        "destination_output": "data/corona_data_collector/destination_output",
    })
    Flow(db_step, gps_step, export_step).process()

    counts = {"isolated": 0}

    # The parameter must stay named ``row``: dataflows uses the parameter
    # name to decide how a plain function is applied.
    def _count_isolated(row):
        if int(row["isolation"]) <= 0:
            return
        assert int(row["id"]) in db_isolated_id
        counts["isolated"] += 1

    exported_csv = load(
        'data/corona_data_collector/destination_output/corona_bot_answers_25_3_2020_with_coords.csv'
    )
    Flow(exported_csv, _count_isolated).process()

    assert 468 == counts["isolated"], str(counts)
    print("OK")
def run_full_db_data_test(test_fields, test_data, dry_run=False, get_real_coords=False, mock_data=None):
    """Validate selected DB rows, export them, and check the exported CSV values.

    Args:
        test_fields: mapping of DB field name -> column name in the exported CSV.
        test_data: mapping of integer DB row id -> ``{db_field: values}``.
            ``values[0]`` is the value expected in the DB. When ``values`` has
            exactly one item that same value is expected in the output,
            otherwise ``values[1]`` is. Every key of ``test_fields`` must be
            present for every id.
        dry_run: when true, DB mismatches are logged instead of raised.
        get_real_coords: when false (default), GPS lookups are replaced by a
            callback returning random coordinates.
        mock_data: optional mapping of row id -> ``{field: value}`` written
            into the DB row data before it is validated.

    Raises:
        AssertionError: on a DB mismatch (unless ``dry_run``) or when the
            exported-data check fails; the latter message lists the output
            columns that were compared.
        KeyError: if a requested id was never delivered by the DB load.
    """

    def _date_suffix(created):
        # Day and month without zero padding, e.g. "25_3_2020". Built by hand
        # because strftime's "%-d" / "%-m" are not available on all platforms.
        return "%d_%d_%d" % (created.day, created.month, created.year)

    created_by_id = {}

    # Parameter names are kept as-is in case load_from_db passes them by keyword.
    def _db_row_callback(id, created, data):
        if id in test_data:
            if mock_data and id in mock_data:
                data.update(mock_data[id])
            for db_field, assert_values in test_data[id].items():
                actual = data.get(db_field)
                if actual != assert_values[0]:
                    msg = "Invalid data in db field %s id %s. expected=%s actual=%s" % (
                        db_field, id, assert_values[0], actual)
                    if dry_run:
                        logging.info(msg)
                    else:
                        raise AssertionError(msg)
            logging.info("DB data is valid for id %s (validated %s fields)" % (id, len(test_data[id])))
            created_by_id[id] = created
        return id, created, data

    # Both field lists come from the same pipeline step, so look it up once.
    gps_parameters = get_parameters_from_pipeline_spec(
        "pipeline-spec.yaml",
        "corona_data_collector",
        "corona_data_collector.add_gps_coordinates",
    )
    gps_config = {
        "source_fields": gps_parameters["source_fields"],
        "workplace_source_fields": gps_parameters["workplace_source_fields"],
    }
    if not get_real_coords:
        gps_config["get-coords-callback"] = lambda street, city: (
            random.uniform(29, 34),
            random.uniform(34, 36),
            int(street != city),
        )

    Flow(
        load_from_db.flow({
            "where": "id in (%s)" % ", ".join(map(str, test_data.keys())),
            "filter_db_row_callback": _db_row_callback,
        }),
        add_gps_coordinates.flow(gps_config),
        export_corona_bot_answers.flow({
            "destination_output": "data/corona_data_collector/destination_output",
        }),
        printer(fields=["__id", "__created", *test_fields.keys()]),
    ).process()

    missing_ids = [row_id for row_id in test_data if row_id not in created_by_id]
    if missing_ids:
        # Without this the failure would be an unexplained KeyError further down.
        raise KeyError("DB rows were not loaded for ids: %s" % missing_ids)

    sorted_db_fields = sorted(test_fields.keys())
    output_fields = [test_fields[db_field] for db_field in sorted_db_fields]

    # Per id: the expected output file stem, followed by the expected value of
    # each tested field in sorted DB-field order.
    _test_assertions = {}
    for row_id in test_data:
        expected = ["corona_bot_answers_%s_with_coords" % _date_suffix(created_by_id[row_id])]
        for db_field in sorted_db_fields:
            values = test_data[row_id][db_field]
            expected.append(values[0] if len(values) == 1 else values[1])
        _test_assertions[str(row_id)] = expected

    def _test_corona_bot_answers_get_row(row):
        return (str(row[output_field]) for output_field in output_fields)

    # NOTE(review): de-duplication is on the raw ``created`` values, so two rows
    # created on the same day at different times would load the same CSV twice
    # if ``created`` carries a time component -- confirm what load_from_db passes.
    distinct_created = {created for created in created_by_id.values()}
    try:
        Flow(
            *[
                load("data/corona_data_collector/destination_output/corona_bot_answers_%s_with_coords.csv"
                     % _date_suffix(created))
                for created in distinct_created
            ],
            test_corona_bot_answers(_test_corona_bot_answers_get_row, _test_assertions),
            printer(fields=["timestamp", "id", *test_fields.values()]),
        ).process()
    except AssertionError as e:
        # Chain the original error so its traceback is not lost.
        raise AssertionError(str(e) + " fields: " + str(output_fields)) from e
    logging.info("Great Success!")
import random import logging from corona_data_collector import load_from_db, add_gps_coordinates, export_corona_bot_answers from avid_covider_pipelines.utils import get_parameters_from_pipeline_spec from dataflows import printer, Flow, load from .common import test_corona_bot_answers logging.basicConfig(level=logging.INFO) Flow( load_from_db.flow({ "where": "id in (180074, 180075, 676579, 676580)" }), add_gps_coordinates.flow({ "source_fields": get_parameters_from_pipeline_spec("pipeline-spec.yaml", "corona_data_collector", "corona_data_collector.add_gps_coordinates")["source_fields"], "get-coords-callback": lambda street, city: (random.uniform(29, 34), random.uniform(34, 36), int(street != city)) }), export_corona_bot_answers.flow({ "destination_output": "data/corona_data_collector/destination_output" }), printer(fields=[ "__id", "__created", "main_age", "main_uid", "uid", "num_aliases" ]) ).process() Flow( load("data/corona_data_collector/destination_output/corona_bot_answers_29_4_2020_with_coords.csv"), load("data/corona_data_collector/destination_output/corona_bot_answers_25_3_2020_with_coords.csv"), test_corona_bot_answers(
import random import logging from corona_data_collector import load_from_db, add_gps_coordinates, export_corona_bot_answers from avid_covider_pipelines.utils import get_parameters_from_pipeline_spec from dataflows import printer, Flow, load from .common import test_corona_bot_answers logging.basicConfig(level=logging.INFO) Flow( load_from_db.flow({ "where": "id in (1199, 6406, 686719, 672579, 650000)", }), add_gps_coordinates.flow({ "source_fields": get_parameters_from_pipeline_spec( "pipeline-spec.yaml", "corona_data_collector", "corona_data_collector.add_gps_coordinates")["source_fields"], "get-coords-callback": lambda street, city: (random.uniform(29, 34), random.uniform(34, 36), int(street != city)) }), export_corona_bot_answers.flow({ "destination_output": "data/corona_data_collector/destination_output" }), printer(fields=[ "__id", "__created", "main_age", "diagnosed_location", "hospitalized" ])).process() Flow(