def read_browser_history_json(takeout: TakeoutPath) -> Iterable[Visit]:
    """Yield a Visit for every entry of Chrome's BrowserHistory.json in a takeout.

    Args:
        takeout: takeout location; passed unchanged to ``kexists``/``kopen``.

    Yields:
        One ``Visit`` per item under the ``'Browser History'`` key. ``dt`` is a
        UTC-aware datetime derived from the item's ``time_usec`` field and
        ``locator`` points at the takeout itself.

    If the history file is not present in the takeout, a warning is logged and
    nothing is yielded.
    """
    # TODO replace with my.core.kompress after hpi update (or even use some my. function directly?)
    from my.kython.kompress import kexists, kopen

    # not sure if this deserves moving to HPI? it's pretty trivial for now
    spath = 'Takeout/Chrome/BrowserHistory.json'

    if not kexists(takeout, spath):
        logger.warning(f"{spath} is not present in {takeout}... skipping")
        return
    logger.info('processing %s %s', takeout, spath)

    # TODO could also add spath?
    locator = Loc.file(takeout)

    # TODO this should be supported by HPI now?
    with kopen(takeout, spath) as fo:
        # TODO iterative parser?
        j = json.load(fo)

    for item in j['Browser History']:
        # time_usec is scaled down by 10**6 to get seconds. Building the aware
        # datetime directly avoids the deprecated naive utcfromtimestamp()
        # followed by .replace(tzinfo=...); the resulting value is the same.
        dt = datetime.fromtimestamp(item['time_usec'] / 10**6, tz=pytz.utc)
        # TODO any more interesting info?
        yield Visit(
            url=item['url'],
            dt=dt,
            locator=locator,
            debug='Chrome/BrowserHistory.json',
        )
def _read_myactivity_html(takeout: TakeoutPath, kind: str) -> Iterable[Visit]:
    """Yield Visits parsed from one 'My Activity' HTML file of a takeout.

    Args:
        takeout: takeout location; passed unchanged to ``kexists``/``read_html``.
        kind: path of the activity file relative to ``Takeout/My Activity/``;
            also stored as the ``debug`` field of every Visit.

    Yields:
        One ``Visit`` per ``(dt, url, title)`` triple produced by ``read_html``;
        the title is not used.

    If the activity file is not present in the takeout, a warning is logged
    and nothing is yielded.
    """
    # TODO replace with my.core.kompress after hpi update (or even use some my. function directly?)
    from my.kython.kompress import kexists

    # TODO glob
    # TODO not sure about windows path separators??
    spath = 'Takeout/My Activity/' + kind

    present = kexists(takeout, spath)
    if not present:
        logger.warning(f"{spath} is not present in {takeout}... skipping")
        return

    logger.info('processing %s %s', takeout, kind)
    source_loc = Loc.file(spath)

    # imported only once we know there is something to parse
    from my.google.takeout.html import read_html

    yield from (
        Visit(url=link, dt=when, locator=source_loc, debug=kind)
        for when, link, _title in read_html(takeout, spath)
    )
def _read_myactivity_html(takeout: TakeoutPath, kind: str) -> Iterable[Visit]:
    """Yield Visits parsed from one 'My Activity' HTML file of a takeout.

    Args:
        takeout: takeout location; passed unchanged to ``kexists``/``read_html``.
        kind: path of the activity file relative to ``Takeout/My Activity/``;
            also stored as the ``debug`` field of every Visit.

    Yields:
        One ``Visit`` per ``(dt, url, title)`` triple produced by ``read_html``;
        the title is not used.

    If the activity file is not present in the takeout, a warning is logged
    and nothing is yielded.
    """
    from my.kython.kompress import kexists

    logger = get_logger()

    # TODO glob
    # TODO not sure about windows path separators??
    spath = 'Takeout/My Activity/' + kind
    if not kexists(takeout, spath):
        logger.warning(f"{spath} is not present in {takeout}... skipping")
        # This is a generator function: a bare return ends iteration. Returning
        # a list here would only set StopIteration.value, which iteration ignores.
        return
    logger.info('processing %s %s', takeout, kind)

    locator = Loc.file(spath)
    from my.google.takeout.html import read_html

    for dt, url, _title in read_html(takeout, spath):
        yield Visit(
            url=url,
            dt=dt,
            locator=locator,
            debug=kind,
        )
def test_kexists(tmp_path: Path) -> None:
    """Check ``kexists`` for present and absent members, and for a missing archive.

    NOTE(review): ``file.zip`` is presumably created under ``tmp_path`` by a
    fixture defined elsewhere -- confirm.
    """
    existing_zip = str(tmp_path / 'file.zip')
    missing_zip = tmp_path / 'nosuchzip.zip'  # deliberately passed as a Path, not str

    assert kexists(existing_zip, 'path/in/archive')
    assert not kexists(existing_zip, 'path/notin/archive')

    # TODO not sure about this?
    assert not kexists(missing_zip, 'path/in/archive')