Exemplo n.º 1
0
def extract_and_save(args):
    """Parse one saved page and write the result to the metadata folder.

    ``args`` is a ``(file_name, file_content)`` pair so the function can be
    called with a single argument. ``file_content`` must provide the
    ``"url"`` and ``"html"`` keys. The output file is named after the last
    ``/``-separated component of ``file_name``.
    """
    file_name, file_content = args
    url, html = file_content["url"], file_content["html"]
    parsed = parse_article(html, url)
    # Keep only the part after the final slash as the output file name.
    basename = file_name.rsplit("/", 1)[-1]
    just.write(parsed, "~/.nostalgia_chrome/metadata/" + basename)
Exemplo n.º 2
0
def get_linked_data(x):
    """Return the linked data found in the page stored at ``x["path"]``.

    Results (including ``None``) are memoised in ``CACHE`` keyed by path.
    A file that raises ``EOFError`` on read, or whose content is blank,
    yields ``None``. Otherwise the page is parsed with ``x["url"]`` and
    ``get_linked_data_md`` is tried first, falling back to
    ``get_linked_data_jd`` when it returns ``None``.
    """
    path = x["path"]
    if path in CACHE:
        return CACHE[path]

    try:
        html = just.read(path)
    except EOFError:
        # Treat an unreadable file exactly like a blank one.
        html = ""

    if html.strip():
        article = parse_article(html, x["url"])
        result = get_linked_data_md(article)
        if result is None:
            result = get_linked_data_jd(article)
    else:
        result = None

    CACHE[path] = result
    return result
Exemplo n.º 3
0
def _youtube_linked_data(x):
    """Build the linked-data dict for one saved page, or return None.

    Returns None when the file raises ``EOFError`` on read, is blank, is not
    from a domain containing "youtube", or has no usable title.
    """
    try:
        html = just.read(x["path"])
    except EOFError:
        return None
    if not html.strip():
        return None
    art = parse_article(html, x["url"])
    if "youtube" not in art.domain:
        return None

    # A page without a <title> has no usable title; indexing [0] directly
    # would raise IndexError.
    titles = art.tree.xpath("//title/text()")
    if not titles:
        return None
    title = re.sub(" - YouTube$", "", titles[0])
    # A bare "YouTube" title carries no video information.
    if not title or title == "YouTube":
        return None

    vc = art.tree.xpath("//span[contains(@class, 'view-count')]/text()")
    # Keep only the digits of the first view-count text node.
    vc = re.sub("[^0-9]", "", vc[0]) if vc else None

    # parse_qs omits absent parameters, so use .get() to let URLs without
    # a "v" parameter fall through to image=None instead of raising KeyError.
    query = urllib.parse.parse_qs(urllib.parse.urlparse(x["url"]).query)
    watch_part = query.get("v")
    if watch_part:
        image = "http://i3.ytimg.com/vi/{}/maxresdefault.jpg".format(
            watch_part[0])
    else:
        image = None

    channel = art.tree.xpath("//ytd-video-owner-renderer//a/text()")
    if not channel:
        # Fall back to the alternative channel-name element.
        channel = art.tree.xpath("//ytd-channel-name//a/text()")
    channel = " ".join(channel)

    return {
        "title": title,
        "type": "video",
        "source": "youtube",
        "image": image,
        "view_count": vc,
        "channel": channel,
    }


def get_linked_data(x):
    """Return YouTube video metadata for the page stored at ``x["path"]``.

    ``x`` must provide ``"path"`` (the saved HTML file) and ``"url"`` (the
    page's original URL). The result is a dict with the keys ``title``,
    ``type``, ``source``, ``image``, ``view_count`` and ``channel``, or
    ``None`` when the page is not a usable YouTube video page.

    Every outcome, including ``None``, is memoised in ``CACHE`` keyed by
    path so a file is read and parsed at most once.
    """
    path = x["path"]
    if path in CACHE:
        return CACHE[path]
    linked_data = _youtube_linked_data(x)
    CACHE[path] = linked_data
    return linked_data
Exemplo n.º 4
0
import gzip
import os
import just
from auto_extract import parse_article
import tqdm
from urllib.parse import urlparse
import tldextract
from utils import KEYS_TO_KEEP

# Re-run article extraction for every v1 metadata file that has not been
# processed yet (i.e. has no "extruct" key), using the gzipped HTML saved
# under the same base name.
for x in tqdm.tqdm(just.glob("/home/pascal/.nostalgia/meta/v1/*.json")):
    print("processing", x)
    meta = just.read(x)
    if "extruct" in meta:
        print("skipping", x)
        continue
    # Drop the ".json" suffix. str.rstrip(".json") would strip any trailing
    # run of the characters '.', 'j', 's', 'o', 'n' and mangle names such as
    # "..._questions.json".
    base_name = x.split("/")[-1]
    if base_name.endswith(".json"):
        base_name = base_name[:-len(".json")]
    html_path = "/home/pascal/.nostalgia/html/" + base_name + ".html.gz"
    if os.path.exists(html_path):
        with gzip.GzipFile(html_path, "r") as f:
            html = f.read()
        article = parse_article(html, meta["url"])
        meta = article.to_dict(keys=KEYS_TO_KEEP, skip_if_empty=True)
        just.write(meta, x)
        # Give the rewritten metadata file the HTML file's access and
        # modification times (what `touch -r` did) without building a
        # shell command from file names.
        html_stat = os.stat(html_path)
        os.utime(x, ns=(html_stat.st_atime_ns, html_stat.st_mtime_ns))
        print("done", x)
Exemplo n.º 5
0
def slug_url(url):
    """Turn ``url`` into a lowercase slug of at most 150 characters.

    Runs of dashes/whitespace collapse to a single dash, every character
    that is not a word character, whitespace or dash is dropped, and only
    the last 150 characters of the result are kept.
    """
    dashed = re.sub(r"[-\s]+", "-", url)
    cleaned = re.sub(r"[^\w\s-]", "", dashed)
    normalised = cleaned.strip().lower()
    # Keep the tail: the end of a URL is its most specific part.
    return normalised[-150:]


# Migrate each old combined JSON capture (holding both "html" and "url")
# into a gzipped HTML file plus a v1 metadata JSON file, then delete the
# original.
for x in tqdm.tqdm(
        just.glob("/home/pascal/.nostalgia_chrome/old/html/*.json")):
    ctime = os.path.getctime(x)
    print("processing", x)
    # Hold the source file open only while loading it, so the handle is
    # closed before the file is removed below.
    with open(x) as f:
        data = json.load(f)
    html = data["html"]
    url = data["url"]
    slugged_url = slug_url(url)
    article = parse_article(html, url)
    meta = article.to_dict(keys=KEYS_TO_KEEP, skip_if_empty=True)
    meta["creation_time"] = ctime
    meta["slugged_url"] = slugged_url
    html_path = "/home/pascal/.nostalgia_chrome/html/{}_{}.html.gz".format(
        ctime, slugged_url)
    with gzip.GzipFile(html_path, "w") as gz:
        gz.write(html.encode("utf8"))
    meta_path = "/home/pascal/.nostalgia_chrome/meta/v1/{}_{}.json".format(
        ctime, slugged_url)
    just.write(meta, meta_path)
    # Give both new files the original file's access and modification
    # times (what `touch -r` did) without building shell commands from
    # file names.
    source_stat = os.stat(x)
    source_times = (source_stat.st_atime_ns, source_stat.st_mtime_ns)
    os.utime(html_path, ns=source_times)
    os.utime(meta_path, ns=source_times)
    just.remove(x)