def events() -> Iterator[Event]:
    # NOTE: `input()` here is presumably a module-level helper returning the export
    # path (not the builtin), since match_structure expects a Path-like base
    with match_structure(input(), expected=EXPECTED, partial=True) as exports:
        for exp in exports:
            for csv_file in exp.rglob("*"):
                if not csv_file.is_file():
                    continue
                yield from _csv_to_json(csv_file)
def events(disable_takeout_cache: bool = DISABLE_TAKEOUT_CACHE) -> CacheResults:
    count = 0
    emitted = GoogleEventSet()
    # reversed shouldn't really matter? but logic is to use newer
    # takeouts if they're named according to date, since JSON Activity
    # is nicer than HTML Activity
    for path in reversed(inputs()):
        with match_structure(path, expected=EXPECTED, partial=True) as results:
            for m in results:
                # e.g. /home/sean/data/google_takeout/Takeout-1634932457.zip -> 'Takeout-1634932457'
                # means that zipped takeouts have nice filenames from cachew
                cw_id, _, _ = path.name.rpartition(".")
                # each takeout result is cached as well, in individual databases per-type
                tk = TakeoutParser(m, cachew_identifier=cw_id, error_policy="drop")
                for event in tk.parse(cache=not disable_takeout_cache):
                    count += 1
                    if isinstance(event, Exception):
                        continue
                    if event in emitted:
                        continue
                    emitted.add(event)
                    yield event  # type: ignore[misc]
    logger.debug(
        f"HPI Takeout merge: from a total of {count} events, removed {count - len(emitted)} duplicates"
    )
def test_gdpr_unzip() -> None:
    with match_structure(structure_data / "gdpr_export.zip", expected=gdpr_expected) as results:
        assert len(results) == 1
        extracted = results[0]
        index_file = extracted / "messages" / "index.csv"
        assert index_file.read_text().strip() == "test message"
    # make sure the temporary directory this created no longer exists
    assert not extracted.exists()
def activity() -> Iterator[Activity]:
    emitted: Set[str] = set()
    for exp in get_files(config.export_path):
        with match_structure(exp, expected=EXPECTED_DISCORD_STRUCTURE) as discord_export:
            for activity_dir in [d / "activity" for d in discord_export]:
                for act in parse_activity(activity_dir):
                    if act.event_id in emitted:
                        continue
                    yield act
                    emitted.add(act.event_id)
def messages() -> Iterator[Message]:
    emitted: Set[int] = set()
    for exp in get_files(config.export_path):
        with match_structure(exp, expected=EXPECTED_DISCORD_STRUCTURE) as discord_export:
            for message_dir in [d / "messages" for d in discord_export]:
                for msg in parse_messages(message_dir):
                    if msg.message_id in emitted:
                        continue
                    yield Message(
                        message_id=msg.message_id,
                        timestamp=msg.timestamp,
                        channel=msg.channel,
                        content=_remove_link_suppression(msg.content),
                        attachments=msg.attachments,
                    )
                    emitted.add(msg.message_id)
def events(disable_takeout_cache: bool = DISABLE_TAKEOUT_CACHE) -> CacheResults:
    error_policy = config.error_policy
    count = 0
    emitted = GoogleEventSet()
    # reversed shouldn't really matter? but logic is to use newer
    # takeouts if they're named according to date, since JSON Activity
    # is nicer than HTML Activity
    for path in reversed(inputs()):
        with ExitStack() as exit_stack:
            if config._use_zippath:
                from my.core.kompress import ZipPath

                # for later takeouts it's just 'Takeout' dir,
                # but for older (pre 2015) it contains email/date in the subdir name
                results = tuple(cast(Sequence[Path], ZipPath(path).iterdir()))
            else:
                results = exit_stack.enter_context(match_structure(path, expected=EXPECTED, partial=True))
            for m in results:
                # e.g. /home/sean/data/google_takeout/Takeout-1634932457.zip -> 'Takeout-1634932457'
                # means that zipped takeouts have nice filenames from cachew
                cw_id, _, _ = path.name.rpartition(".")
                # each takeout result is cached as well, in individual databases per-type
                tk = TakeoutParser(m, cachew_identifier=cw_id, error_policy=error_policy)
                # TODO might be nice to pass hpi cache dir?
                for event in tk.parse(cache=not disable_takeout_cache):
                    count += 1
                    if isinstance(event, Exception):
                        if error_policy == 'yield':
                            yield event
                        elif error_policy == 'raise':
                            raise event
                        elif error_policy == 'drop':
                            pass
                        continue
                    if event in emitted:
                        continue
                    emitted.add(event)
                    yield event  # type: ignore[misc]
    logger.debug(
        f"HPI Takeout merge: from a total of {count} events, removed {count - len(emitted)} duplicates"
    )
def test_not_directory() -> None:
    with pytest.raises(NotADirectoryError, match=r"Expected either a zipfile or a directory"):
        with match_structure(structure_data / "messages/index.csv", expected=gdpr_expected):
            pass
def test_match_partial() -> None:
    # a partial match should match both the 'broken' and 'gdpr_export' directories
    with match_structure(structure_data / "gdpr_subdirs", expected=gdpr_expected, partial=True) as results:
        assert len(results) == 2
def test_gdpr_structure_exists() -> None:
    with match_structure(structure_data, expected=gdpr_expected) as results:
        assert results == (structure_data / "gdpr_subdirs" / "gdpr_export",)
def accounts() -> Sequence[Path]:
    accounts = []
    for f in get_files(config.export_path):
        with match_structure(f, EXPECTED) as match:
            accounts.extend(list(match))
    return accounts
def export_dirs() -> List[Path]:
    base: Path = Path(config.export_path).expanduser().absolute()
    with match_structure(base, expected="animelist.xml") as matches:
        return list(matches)
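All of the snippets above rely on the same match_structure contract. The following is a minimal, hedged usage sketch, assuming match_structure is importable from my.core.structure (as in HPI) and using a placeholder export path and placeholder expected names; it is an illustration of the pattern, not code from any of the modules above.

# minimal sketch: match_structure is a context manager that, given a directory
# or a zipfile plus the names expected inside a match, yields a tuple of
# matching directories (zipfiles are extracted to a temporary directory first)
from pathlib import Path

from my.core.structure import match_structure  # import path as used in HPI

export = Path("~/data/some_export.zip").expanduser()  # placeholder path

with match_structure(export, expected=("messages", "index.csv"), partial=True) as matches:
    for match_dir in matches:
        # walk each matched directory while the context manager is still open
        for f in match_dir.rglob("*"):
            if f.is_file():
                print(f)
# any temporary extraction is cleaned up when the block exits, so paths from
# `matches` should not be used afterwards (see the test_gdpr_unzip snippet above)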