Example #1
0
 def handle_dataframe_per_file(cls, data, fname):
     """Convert the epoch-seconds columns to UTC datetimes, in place.

     Both ``playingSince`` and ``playingUntil`` are rewritten via
     ``datetime_from_timestamp(..., "utc")``; the (mutated) frame is returned.
     ``fname`` is accepted for interface compatibility and not used here.
     """
     for column in ("playingSince", "playingUntil"):
         data[column] = [datetime_from_timestamp(ts, "utc") for ts in data[column]]
     return data
Example #2
0
 def handle_dataframe_per_file(cls, df, fname):
     """Normalize start/end columns to timezone-aware datetimes.

     Returns ``None`` for an empty frame. Otherwise ``start_time``/``end_time``
     are converted (ints via ``datetime_from_timestamp``, everything else via
     ``tz.localize``) into new ``start``/``end`` columns, the originals are
     dropped, and the mutated frame is returned.
     """
     if df.empty:
         return None

     def _to_datetime(value):
         # Integer values are epoch timestamps; others are naive datetimes
         # that get localized with the module-level `tz`.
         return datetime_from_timestamp(value) if isinstance(value, int) else tz.localize(value)

     df["start"] = [_to_datetime(value) for value in df.start_time]
     df["end"] = [_to_datetime(value) for value in df.end_time]
     del df["start_time"]
     del df["end_time"]
     return df
Example #3
0
    def load_image_texts(cls, glob_pattern_s, nrows=None):
        """OCR images matched by glob pattern(s) into a DataFrame.

        Results are cached at two levels: per-image OCR text in the
        "tesseract" cache, and the assembled DataFrame persisted via
        ``save_df``/``load_df`` under a name derived from the pattern and the
        class name. Files recorded as already processed are skipped.

        Args:
            glob_pattern_s: a single glob pattern string, or a list of
                patterns whose matches are unioned.
            nrows: if given, only up to ``nrows`` unprocessed files are
                OCR'd, nothing is persisted, and at most the last ``nrows``
                rows are returned.

        Returns:
            pandas.DataFrame with columns: text, path, title, time.
        """
        import pytesseract
        from PIL import Image

        if isinstance(glob_pattern_s, list):
            fnames = set()
            for glob_pattern in glob_pattern_s:
                fnames.update(set(just.glob(glob_pattern)))
            glob_pattern = "_".join(glob_pattern_s)
        else:
            # BUG FIX: the original called just.glob(glob_pattern) here, but
            # `glob_pattern` is unbound in this branch (the parameter is
            # `glob_pattern_s`), so any plain-string pattern raised NameError.
            glob_pattern = glob_pattern_s
            fnames = set(just.glob(glob_pattern))
        name = glob_pattern + "_" + normalize_name(cls.__name__)
        processed_files = get_processed_files(name)
        to_process = fnames.difference(processed_files)
        objects = []

        cache = get_cache("tesseract")

        if nrows is not None:
            if not to_process:
                # Nothing new to OCR: serve the tail of the stored frame.
                return load_df(name).iloc[-nrows:]
            else:
                # Limit the amount of OCR work; NOTE(review): `to_process`
                # is a set, so which nrows files are kept is arbitrary.
                to_process = list(to_process)[-nrows:]
        if to_process:
            for fname in to_process:
                if fname in cache:
                    text = cache[fname]
                else:
                    try:
                        text = pytesseract.image_to_string(
                            Image.open(just.make_path(fname)))
                    except OSError as e:
                        # Unreadable/corrupt image: report and skip this file.
                        print("ERR", fname, e)
                        continue
                    cache[fname] = text
                # The file's mtime (as UTC) serves as the row timestamp.
                time = datetime_from_timestamp(os.path.getmtime(fname), "utc")
                data = {
                    "text": text,
                    "path": fname,
                    "title": fname.split("/")[-1],
                    "time": time
                }
                objects.append(data)
            data = pd.DataFrame(objects)
            if processed_files and nrows is None:
                # Merge newly OCR'd rows with the previously stored frame.
                data = pd.concat((data, load_df(name)))
            # Sort by the first time-like column present, if any.
            for x in ["time", "start", "end"]:
                if x in data:
                    data = data.sort_values(x)
                    break
            if nrows is None:
                # Only full (non-truncated) runs update the persisted state.
                save_df(data, name)
                save_processed_files(fnames | processed_files, name)
        else:
            data = load_df(name)
        if nrows is not None:
            data = data.iloc[-nrows:]
        return data
Example #4
0
 def load(cls, file_path="~/nostalgia_data/input/shazam.json", nrows=None):
     """Build an instance from a Shazam JSON export.

     Reads the export at *file_path* and extracts one row per tag:
     (time, title, artist), with the time converted via
     ``datetime_from_timestamp`` using the tag's own timezone.

     NOTE(review): `nrows` is accepted but not used — every tag is
     always loaded.
     """
     rows = []
     for tag in just.read(file_path)["tags"]:
         heading = tag["track"]["heading"]
         rows.append((
             datetime_from_timestamp(tag["timestamp"], tag["timezone"]),
             heading["title"],
             heading["subtitle"],
         ))
     shazam = pd.DataFrame(rows, columns=["time", "title", "artist"])
     return cls(shazam)
Example #5
0
    def object_to_row(cls, x):
        """Enrich a browsing-record dict *x* in place and return it.

        Adds ``domain``/``domain_and_suffix`` when a URL is present,
        falls back to a URL reconstructed from ``path`` otherwise,
        converts ``time`` from a timestamp, and computes ``title``.
        """
        import tldextract

        url = x["url"]
        if url:
            parts = tldextract.extract(url)
            x["domain"] = parts.domain
            x["domain_and_suffix"] = parts.domain + "." + parts.suffix
        # No URL recorded: reconstruct one from the path's underscore parts.
        x["url"] = url or "_".join(x["path"].split("_")[1:])
        x["time"] = datetime_from_timestamp(x["time"])
        x["title"] = get_title(x)
        return x
Example #6
0
    def ingest(cls, author):
        """Fetch every Reddit submission by *author* via Pushshift and save it.

        Each submission becomes a dict with keys: title, time (converted from
        ``created_utc``), url, text, author; the batch is persisted with
        ``cls.save_df``.
        """
        from psaw import PushshiftAPI

        client = PushshiftAPI()

        rows = []
        for submission in client.search_submissions(author=author):
            rows.append({
                "title": submission.title,
                "time": datetime_from_timestamp(submission.created_utc),
                "url": submission.full_link,
                "text": submission.selftext,
                "author": author,
            })

        cls.save_df(rows)
Example #7
0
 def handle_json(cls, data):
     """Flatten Facebook-export post data into a list of row dicts.

     Each produced row has keys: location, title, time. The location is
     parsed from the post's human-readable title (someone's timeline, a
     group, or a tagged person) and defaults to "self".
     """
     rows = []
     for post in data:
         entries = post.get("data")
         if not isinstance(entries, list):
             continue
         location = "self"
         title = post.get("title", "")
         matches = re.findall(
             "(?:on|to) ([^']+)'s? [tT]imeline|posted in ([^.]+)|was with ([^.]+)[.]$",
             title)
         if matches:
             # Only one alternation group captures per match; take the
             # first non-empty one.
             location = [group for group in matches[0] if group][0]
         for entry in entries:
             if "post" not in entry:
                 continue
             rows.append({
                 "location": location,
                 "title": entry["post"],
                 "time": datetime_from_timestamp(post['timestamp']),
             })
     return rows