# Module-level imports assumed by the functions below; project-internal names
# (s3feeder, create_s3_client, BUCKET_NAME, log, db, ...) are imported or
# defined elsewhere in the surrounding modules.
import os
from datetime import date, timedelta
from pathlib import Path

import ujson


def minicans(test_name, start_date: date, end_date: date, end=None):
    """Fetches minicans from S3 and iterates over measurements.
    Detects broken downloads. Stops after `end` files, if given.
    """
    s3 = s3feeder.create_s3_client()
    day = start_date
    file_cnt = 0
    while day <= end_date:
        tn_filter = set([test_name.replace("_", "")])
        log.info(day)
        li = s3feeder.list_minicans_on_s3_for_a_day(s3, day, None, tn_filter)
        for s3fname, s3size in li:
            # s3fname: raw/20210426/23/YE/ndt/2021042623_YE_ndt.n0.0.tar.gz
            local_file = Path("testdata") / "mini" / s3fname
            in_cache = local_file.is_file() and (local_file.stat().st_size == s3size)
            if not in_cache:
                # Download minican
                log.debug("Downloading can %s of size %d KB" % (s3fname, s3size / 1024))
                local_file.parent.mkdir(parents=True, exist_ok=True)
                with local_file.open("wb") as f:
                    s3.download_fileobj(s3feeder.MC_BUCKET_NAME, s3fname, f)
                assert s3size == local_file.stat().st_size

            log.debug("Loading %s", s3fname)
            for msm_jstr, msm, _ in s3feeder.load_multiple(local_file.as_posix()):
                msm = msm or ujson.loads(msm_jstr)
                yield local_file.as_posix(), msm

            file_cnt += 1
            if end is not None and file_cnt == end:
                return

        # Advance to the next day, otherwise the loop never terminates
        day += timedelta(days=1)
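
# Usage sketch for minicans(): a minimal example assuming network access to
# the minican bucket; the test name, dates, and the `end=2` cap are
# hypothetical values chosen for illustration.
def _example_iterate_minicans():
    for can_path, msm in minicans("ndt", date(2021, 4, 26), date(2021, 4, 26), end=2):
        print(can_path, msm.get("report_id"))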
def list_cans_on_s3_for_a_day(day, filter=None, bysize=False):
    """Lists cans on S3 for a given day, optionally filtered by a substring
    of the filename, sorted by name or by size.
    """
    s3 = s3feeder.create_s3_client()
    fns = s3feeder.list_cans_on_s3_for_a_day(s3, day)
    if bysize:
        fns = sorted(fns, key=lambda i: i[1])
    else:
        fns = sorted(fns)
    for fn, size in fns:
        size = size / float(2**20)  # bytes to MB
        if filter is None or (filter in fn):
            print(f"{fn:<160} {size:.1f} MB")
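
# Usage sketch: print the web_connectivity cans for one day (the date is
# hypothetical), smallest first.
def _example_list_cans():
    list_cans_on_s3_for_a_day(date(2019, 7, 16), filter="web_connectivity", bysize=True)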
def cans():
    """Download interesting cans from S3 to a local directory

    Uses credentials from ~/.aws/config in the block:
    [ooni-data-private]
    aws_access_key_id = ...
    aws_secret_access_key = ...

    Explore bucket from CLI:
    AWS_PROFILE=ooni-data-private aws s3 ls s3://ooni-data-private/canned/2019-07-16/
    """
    _cans = dict(
        # "2013-05-05/20130505T065438Z-VN-AS24173-captive_portal-no_report_id-0.1.0-probe.yaml.lz4",
        # "2013-09-12/20130912T150305Z-MD-AS1547-http_requests-no_report_id-0.1.0-probe.yaml.lz4",
        vn="2013-05-05/20130505T103213Z-VN-AS24173-http_requests-no_report_id-0.1.0-probe.yaml.lz4",
        yaml16="2016-07-07/20160706T000046Z-GB-AS9105-http_requests-TYXZLcFg4yUp9Io2LrOMM7CjLk0QcIdsMPiCZtVgkxUrTxnFM0GiMbr8iGDl3OEe-0.1.0-probe.yaml.lz4",
        yaml17="2017-12-21/20171220T153044Z-BE-AS5432-dns_consistency-mnKRlHuqk8Eo6XMJt5ZkVQrgReaEXPEWaO9NafgXxSVIhAswTXT7QJc6zhsuttpK-0.1.0-probe.yaml.lz4",
        yaml18="2018-03-21/20180320T211810Z-NL-AS1103-dns_consistency-yiCRUmXy6MndqnV3g5QYBKGich5OwP9cQQfOiYnxYAfZatgQZlStuWIT30yu586R-0.1.0-probe.yaml.lz4",
        # yaml2014hr1="2014-02-20/http_requests.1.tar.lz4",
        yaml2014dns="2014-02-20/dns_consistency.0.tar.lz4",
        # yaml2014mpt="2014-02-20/multi_protocol_traceroute.0.tar.lz4",
        yaml2014hr0="2014-02-20/http_requests.0.tar.lz4",
        yaml2014hh="2014-02-20/http_host.0.tar.lz4",
        yaml2014hfm="2014-02-20/http_header_field_manipulation.0.tar.lz4",
    )
    for k, v in _cans.items():
        _cans[k] = Path("testdata") / v

    to_dload = sorted(f for f in _cans.values() if not f.is_file())
    if not to_dload:
        return _cans

    bname = "ooni-data"
    s3 = create_s3_client()
    for fn in to_dload:
        s3fname = fn.as_posix().replace("testdata", "canned")
        r = s3.list_objects_v2(Bucket=bname, Prefix=s3fname)
        assert r["KeyCount"] == 1, r
        filedesc = r["Contents"][0]
        size = filedesc["Size"]
        print("Downloading can %s size %d MB" % (fn, size / 1024 / 1024))
        os.makedirs(os.path.dirname(fn), exist_ok=True)
        with open(fn, "wb") as f:
            s3.download_fileobj(bname, s3fname, f)
        assert size == os.path.getsize(fn)

    return _cans
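
# Usage sketch for cans(): ensure the private-bucket cans are cached locally
# and report their sizes. Requires the [ooni-data-private] credentials
# described in the docstring.
def _example_fetch_private_cans():
    for name, path in cans().items():
        print(f"{name}: {path} {path.stat().st_size / 2**20:.1f} MB")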
def main():
    conf = parse_args()
    format_char = "n"
    collector_id = "L"
    identity = f"{format_char}{collector_id}"
    log.info(f"From bucket {conf.src_bucket} to {conf.dst_bucket}")
    s3sig = create_s3_client(conf)  # signed client for writing
    db_conn = psycopg2.connect(conf.db_uri)
    db.setup(conf)  # setup db conn inside db module
    setup_fingerprints()

    # Fetch msmts for one day
    buf = {}  # "<cc> <testname>" -> jsonlf / fd / jsonl_s3path
    seen_uids = set()  # Avoid uploading duplicates
    # raw/20210601/00/SA/webconnectivity/2021060100_SA_webconnectivity.n0.0.jsonl.gz
    # jsonl_s3path = f"raw/{ts}/00/{cc}/{testname}/{jsonlf.name}"

    s3uns = s3f.create_s3_client()  # unsigned client for reading
    cans_fns = s3f.list_cans_on_s3_for_a_day(s3uns, conf.day)
    cans_fns = sorted(cans_fns)  # this is not enough to sort by time
    tot_size = sum(size for _, size in cans_fns)
    processed_size = 0
    log.info(f"{tot_size / 1024 / 1024 / 1024:.2f} GB to process")
    log.info(f"{len(cans_fns)} cans to process")
    # TODO make assertions on msmt
    # TODO add consistency check on trivial id found in fastpath table
    for can in cans_fns:
        can_fn, size = can
        log.info(f"Processed percentage: {100 * processed_size / tot_size:.1f}")
        log.info(f"Opening can {can_fn}")
        Path(can_fn).parent.mkdir(parents=True, exist_ok=True)
        s3uns.download_file(conf.src_bucket, can_fn, can_fn)
        for msm_tup in s3f.load_multiple(can_fn):
            process_measurement(msm_tup, buf, seen_uids, conf, s3sig, db_conn)
        processed_size += size
        Path(can_fn).unlink()

    log.info("Finalizing JSONL files that are still open")
    for json_entities in buf.values():
        for e in json_entities:
            if e.fd.closed:
                continue
            finalize_jsonl(s3sig, db_conn, conf, e)

    log.info("Exiting")
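
# Standard script entry point; a minimal sketch assuming this module is
# meant to be run directly (parse_args() reads the CLI arguments).
if __name__ == "__main__":
    main()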
def s3msmts(test_name, start_date=date(2018, 1, 1), end_date=date(2019, 11, 4)):
    """Fetches cans from S3 and iterates over measurements.
    Detects broken downloads.
    """
    s3 = s3feeder.create_s3_client()
    can_date = start_date
    tpl = "{}/{}.00.tar.lz4" if test_name == "web_connectivity" else "{}/{}.0.tar.lz4"
    while can_date <= end_date:
        # e.g. 2019-10-30/psiphon.0.tar.lz4
        can_fname = tpl.format(can_date.strftime("%Y-%m-%d"), test_name)
        can_date += timedelta(days=1)
        can_local_file = Path("testdata") / can_fname
        s3fname = "canned/" + can_fname
        r = s3.list_objects_v2(Bucket=BUCKET_NAME, Prefix=s3fname)
        if r["KeyCount"] != 1:
            log.info("Can %s not found. Skipping." % s3fname)
            continue

        s3size = r["Contents"][0]["Size"]
        assert s3size > 0
        ready = can_local_file.is_file() and (can_local_file.stat().st_size == s3size)
        if not ready:
            # Download can
            log.debug("Downloading can %s of size %d MB" % (can_fname, s3size / 1024 / 1024))
            can_local_file.parent.mkdir(parents=True, exist_ok=True)
            with can_local_file.open("wb") as f:
                s3.download_fileobj(BUCKET_NAME, s3fname, f)
            assert s3size == can_local_file.stat().st_size

        log.debug("Loading %s", s3fname)
        for msm_jstr, msm, _ in s3feeder.load_multiple(can_local_file.as_posix()):
            msm = msm or ujson.loads(msm_jstr)
            if not msm.get("report_id"):
                # Missing or empty report_id
                # https://github.com/ooni/probe-engine/pull/104
                continue
            yield can_fname, msm
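
# Usage sketch for s3msmts(): count telegram measurements over one day
# (the date range is hypothetical).
def _example_count_msmts():
    cnt = sum(1 for _can, _msm in s3msmts("telegram", date(2019, 8, 29), date(2019, 8, 29)))
    print(f"{cnt} measurements")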
def cans():
    """Download interesting cans from S3 to a local directory
    """
    # TODO: move to the more flexible s3msmts where possible
    _cans = dict(
        web_conn_it="2018-05-07/20180501T071932Z-IT-AS198471-web_connectivity-20180506T090836Z_AS198471_gKqEpbg0Ny30ldGCQockbZMJSg9HhFiSizjey5e6JxSEHvzm7j-0.2.0-probe.json.lz4",
        web_conn_cn="2018-05-07/20180506T014008Z-CN-AS4134-web_connectivity-20180506T014010Z_AS4134_ZpxhAVt3iqCjT5bW5CfJspbqUcfO4oZfzDVjCWAu2UuVkibFsv-0.2.0-probe.json.lz4",
        web_conn_30="2019-10-30/web_connectivity.00.tar.lz4",
        telegram="2019-08-29/telegram.0.tar.lz4",
        whatsapp="2019-08-29/whatsapp.0.tar.lz4",
        facebook_messenger="2019-08-29/facebook_messenger.0.tar.lz4",
        facebook_messenger2="2019-10-29/facebook_messenger.0.tar.lz4",
        # telegram="2019-08-29/20190829T105210Z-IR-AS31549-telegram-20190829T105214Z_AS31549_t32ZZ5av3B6yNruRIFhCnuT1dHTnwPk7vwIa9F0TAe064HG4tk-0.2.0-probe.json",
        # fb="2019-06-27/20190627T214121Z-ET-AS24757-facebook_messenger-20190627T214126Z_AS24757_h8g9P5kTmmzyX1VyOjqcVonIbFNujm84l2leMCwC2gX3BI78fI-0.2.0-probe.json",
        hhfm_2019_10_26="2019-10-26/http_header_field_manipulation.0.tar.lz4",
        hhfm_2019_10_27="2019-10-27/http_header_field_manipulation.0.tar.lz4",
        hhfm_2019_10_28="2019-10-28/http_header_field_manipulation.0.tar.lz4",
        hhfm_2019_10_29="2019-10-29/http_header_field_manipulation.0.tar.lz4",
        tor_2018_10_26="2018-10-26/vanilla_tor.0.tar.lz4",
        tor_2019_10_26="2019-10-26/vanilla_tor.0.tar.lz4",
        tor_2019_10_27="2019-10-27/vanilla_tor.0.tar.lz4",
        tor_2019_10_28="2019-10-28/vanilla_tor.0.tar.lz4",
        tor_2019_10_29="2019-10-29/vanilla_tor.0.tar.lz4",
        ndt_2018_10_26="2018-10-26/ndt.0.tar.lz4",
        tcp_connect_2018_10_26="2018-10-26/tcp_connect.0.tar.lz4",
        dash_2019_10_26="2019-10-26/dash.0.tar.lz4",
        dash_2019_10_27="2019-10-27/dash.0.tar.lz4",
        dash_2019_10_28="2019-10-28/dash.0.tar.lz4",
        dash_2019_10_29="2019-10-29/dash.0.tar.lz4",
        meek_2019_10_26="2019-10-26/meek_fronted_requests_test.0.tar.lz4",
        meek_2019_10_27="2019-10-27/meek_fronted_requests_test.0.tar.lz4",
        meek_2019_10_28="2019-10-28/meek_fronted_requests_test.0.tar.lz4",
        meek_2019_10_29="2019-10-29/meek_fronted_requests_test.0.tar.lz4",
        big2858="2019-10-30/20191030T032301Z-BR-AS28573-web_connectivity-20191030T032303Z_AS28573_VzW6UrXrs21YjYWvlk1hyzRqnKlmKNsSntSBGqFCnzFVxVSLQf-0.2.0-probe.json.lz4",
    )
    for k, v in _cans.items():
        _cans[k] = Path("testdata") / v

    to_dload = sorted(f for f in _cans.values() if not f.is_file())
    if not to_dload:
        return _cans

    s3 = s3feeder.create_s3_client()
    for fn in to_dload:
        s3fname = fn.as_posix().replace("testdata", "canned")
        r = s3.list_objects_v2(Bucket=BUCKET_NAME, Prefix=s3fname)
        assert r["KeyCount"] == 1, (fn, r)
        filedesc = r["Contents"][0]
        size = filedesc["Size"]
        print("Downloading can %s size %d MB" % (fn, size / 1024 / 1024))
        os.makedirs(os.path.dirname(fn), exist_ok=True)
        with open(fn, "wb") as f:
            s3.download_fileobj(BUCKET_NAME, s3fname, f)
        assert size == os.path.getsize(fn)

    return _cans
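
# Usage sketch: read measurements out of a cached can, following the
# load_multiple() pattern used by minicans()/s3msmts() above.
def _example_read_can():
    _cans = cans()
    for msm_jstr, msm, _ in s3feeder.load_multiple(_cans["telegram"].as_posix()):
        msm = msm or ujson.loads(msm_jstr)
        print(msm.get("test_name"))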