@classmethod
def handle_dataframe_per_file(cls, data, fname):
    # Called once per input file: convert the raw epoch-second columns
    # into timezone-aware UTC datetimes.
    data["playingSince"] = [
        datetime_from_timestamp(x, "utc") for x in data["playingSince"]
    ]
    data["playingUntil"] = [
        datetime_from_timestamp(x, "utc") for x in data["playingUntil"]
    ]
    return data
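# A minimal sketch (not in the original source) of what the conversion above
# amounts to in plain pandas, assuming datetime_from_timestamp(x, "utc") maps
# epoch seconds to a UTC-aware datetime.
def _demo_epoch_to_utc():
    import pandas as pd

    data = pd.DataFrame({"playingSince": [1546300800], "playingUntil": [1546304400]})
    data["playingSince"] = pd.to_datetime(data["playingSince"], unit="s", utc=True)
    data["playingUntil"] = pd.to_datetime(data["playingUntil"], unit="s", utc=True)
    print(data.dtypes)  # both columns are now datetime64[ns, UTC]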
@classmethod
def handle_dataframe_per_file(cls, df, fname):
    # Skip files that produced no rows.
    if df.empty:
        return None
    # start_time/end_time may hold either epoch seconds (int) or naive
    # datetimes; the latter are localized with the module-level `tz` timezone.
    df["start"] = [
        datetime_from_timestamp(x) if isinstance(x, int) else tz.localize(x)
        for x in df.start_time
    ]
    df["end"] = [
        datetime_from_timestamp(x) if isinstance(x, int) else tz.localize(x)
        for x in df.end_time
    ]
    del df["start_time"]
    del df["end_time"]
    return df
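# A hypothetical sketch of the mixed-type branch above: files sometimes store
# start_time as epoch seconds and sometimes as naive datetimes. The pytz
# timezone here is an assumption standing in for the module-level `tz`.
def _demo_mixed_time_values():
    import datetime
    import pytz

    tz = pytz.timezone("Europe/Amsterdam")  # assumed; not from the source
    values = [1546300800, datetime.datetime(2019, 1, 1, 12, 0)]
    out = [
        datetime.datetime.fromtimestamp(v, datetime.timezone.utc)
        if isinstance(v, int)
        else tz.localize(v)
        for v in values
    ]
    print(out)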
@classmethod
def load_image_texts(cls, glob_pattern_s, nrows=None):
    import pytesseract
    from PIL import Image

    # Accept either a single glob pattern or a list of patterns.
    if isinstance(glob_pattern_s, list):
        fnames = set()
        for glob_pattern in glob_pattern_s:
            fnames.update(set(just.glob(glob_pattern)))
        glob_pattern = "_".join(glob_pattern_s)
    else:
        glob_pattern = glob_pattern_s
        fnames = set(just.glob(glob_pattern))
    name = glob_pattern + "_" + normalize_name(cls.__name__)
    # Only OCR files that were not already handled in an earlier run.
    processed_files = get_processed_files(name)
    to_process = fnames.difference(processed_files)
    objects = []
    cache = get_cache("tesseract")
    if nrows is not None:
        if not to_process:
            return load_df(name).iloc[-nrows:]
        to_process = list(to_process)[-nrows:]
    if to_process:
        for fname in to_process:
            if fname in cache:
                text = cache[fname]
            else:
                try:
                    text = pytesseract.image_to_string(Image.open(just.make_path(fname)))
                except OSError as e:
                    print("ERR", fname, e)
                    continue
                cache[fname] = text
            # Use the file's modification time as the event time.
            time = datetime_from_timestamp(os.path.getmtime(fname), "utc")
            data = {
                "text": text,
                "path": fname,
                "title": fname.split("/")[-1],
                "time": time,
            }
            objects.append(data)
        data = pd.DataFrame(objects)
        # Merge freshly OCR'ed rows with what was stored on disk.
        if processed_files and nrows is None:
            data = pd.concat((data, load_df(name)))
        # Sort by the first time-like column that exists.
        for x in ["time", "start", "end"]:
            if x in data:
                data = data.sort_values(x)
                break
        if nrows is None:
            save_df(data, name)
            save_processed_files(fnames | processed_files, name)
    else:
        data = load_df(name)
    if nrows is not None:
        data = data.iloc[-nrows:]
    return data
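# The OCR step in isolation, as a sketch: pytesseract renders an image to a
# string, and the loader above wraps this single call with caching, mtime-based
# timestamps and incremental bookkeeping. The file name is hypothetical.
def _demo_ocr_single_image():
    import pytesseract
    from PIL import Image

    text = pytesseract.image_to_string(Image.open("screenshot.png"))
    print(text)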
@classmethod
def load(cls, file_path="~/nostalgia_data/input/shazam.json", nrows=None):
    # Each Shazam "tag" carries its own timezone next to the epoch timestamp.
    shazam = pd.DataFrame(
        [
            (
                datetime_from_timestamp(x["timestamp"], x["timezone"]),
                x["track"]["heading"]["title"],
                x["track"]["heading"]["subtitle"],
            )
            for x in just.read(file_path)["tags"]
        ],
        columns=["time", "title", "artist"],
    )
    # Honor nrows the way the other loaders do: keep only the latest rows.
    if nrows is not None:
        shazam = shazam.iloc[-nrows:]
    return cls(shazam)
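# A sketch of the JSON shape the loader above expects, reconstructed from the
# keys it accesses: a top-level "tags" list whose entries carry a timestamp,
# a timezone and a track heading. The values here are made up.
def _demo_shazam_tag():
    tag = {
        "timestamp": 1546300800,
        "timezone": "Europe/Amsterdam",
        "track": {"heading": {"title": "Some Song", "subtitle": "Some Artist"}},
    }
    heading = tag["track"]["heading"]
    print(tag["timestamp"], heading["title"], heading["subtitle"])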
@classmethod
def object_to_row(cls, x):
    import tldextract

    # Split a full URL into its registered domain and public suffix.
    if x["url"]:
        extract = tldextract.extract(x["url"])
        x["domain"] = extract.domain
        x["domain_and_suffix"] = extract.domain + "." + extract.suffix
    # Fall back to a URL reconstructed from the file path when none is given.
    x["url"] = x["url"] or "_".join(x["path"].split("_")[1:])
    x["time"] = datetime_from_timestamp(x["time"])
    x["title"] = get_title(x)
    return x
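# A quick sketch of what tldextract contributes per row: it separates the
# registered domain from its public suffix, which plain urlparse does not do.
def _demo_domain_extraction():
    import tldextract

    extract = tldextract.extract("https://news.ycombinator.com/item?id=1")
    print(extract.domain)  # "ycombinator"
    print(extract.suffix)  # "com"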
@classmethod
def ingest(cls, author):
    from psaw import PushshiftAPI

    # Pull every submission by `author` from the Pushshift archive.
    api = PushshiftAPI()
    posts = [
        {
            "title": x.title,
            "time": datetime_from_timestamp(x.created_utc),
            "url": x.full_link,
            "text": x.selftext,
            "author": author,
        }
        for x in api.search_submissions(author=author)
    ]
    cls.save_df(posts)
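# A sketch of what the psaw generator yields, assuming Pushshift mirrors the
# Reddit JSON fields (created_utc is epoch seconds); `limit` just caps the
# query for the demo, and the username is hypothetical.
def _demo_pushshift_query():
    from psaw import PushshiftAPI

    api = PushshiftAPI()
    for submission in api.search_submissions(author="some_user", limit=1):
        print(submission.title, submission.created_utc, submission.full_link)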
@classmethod
def handle_json(cls, data):
    posts = []
    for post in data:
        if "data" not in post or not isinstance(post["data"], list):
            continue
        # Derive where the post was made from its title; default to the
        # user's own timeline ("self").
        location = "self"
        title = post.get("title", "")
        location_res = re.findall(
            r"(?:on|to) ([^']+)'s? [tT]imeline|posted in ([^.]+)|was with ([^.]+)[.]$",
            title,
        )
        if location_res:
            # Exactly one of the three capture groups is non-empty per match.
            location = [x for x in location_res[0] if x][0]
        for x in post["data"]:
            if "post" not in x:
                continue
            row = {
                "location": location,
                "title": x["post"],
                "time": datetime_from_timestamp(post["timestamp"]),
            }
            posts.append(row)
    return posts
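# A sketch of how the title regex above resolves a location: only one of the
# three alternatives (and hence one capture group) matches per title. The
# example title is made up.
def _demo_location_regex():
    import re

    pattern = r"(?:on|to) ([^']+)'s? [tT]imeline|posted in ([^.]+)|was with ([^.]+)[.]$"
    groups = re.findall(pattern, "John posted in Python Developers.")[0]
    print([x for x in groups if x][0])  # "Python Developers"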