Example #1
    def load_image_texts(cls, glob_pattern_s, nrows=None):
        # OCR image files matching the glob(s) into a DataFrame, caching
        # per-file OCR output. Assumes the source module's imports: just, os,
        # pandas as pd, and the nostalgia helpers used below.
        import pytesseract
        from PIL import Image

        if isinstance(glob_pattern_s, list):
            fnames = set()
            for glob_pattern in glob_pattern_s:
                fnames.update(set(just.glob(glob_pattern)))
            glob_pattern = "_".join(glob_pattern_s)
        else:
            glob_pattern = glob_pattern_s
            fnames = set(just.glob(glob_pattern))
        name = glob_pattern + "_" + normalize_name(cls.__name__)
        processed_files = get_processed_files(name)
        to_process = fnames.difference(processed_files)
        objects = []

        cache = get_cache("tesseract")

        if nrows is not None:
            if not to_process:
                return load_df(name).iloc[-nrows:]
            else:
                to_process = list(to_process)[-nrows:]
        if to_process:
            for fname in to_process:
                if fname in cache:
                    text = cache[fname]
                else:
                    try:
                        text = pytesseract.image_to_string(
                            Image.open(just.make_path(fname)))
                    except OSError as e:
                        print("ERR", fname, e)
                        continue
                    cache[fname] = text
                time = datetime_from_timestamp(os.path.getmtime(fname), "utc")
                data = {
                    "text": text,
                    "path": fname,
                    "title": fname.split("/")[-1],
                    "time": time
                }
                objects.append(data)
            data = pd.DataFrame(objects)
            if processed_files and nrows is None:
                data = pd.concat((data, load_df(name)))
            for x in ["time", "start", "end"]:
                if x in data:
                    data = data.sort_values(x)
                    break
            if nrows is None:
                save_df(data, name)
                save_processed_files(fnames | processed_files, name)
        else:
            data = load_df(name)
        if nrows is not None:
            data = data.iloc[-nrows:]
        return data
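A hedged usage sketch for the loader above; the Screenshots subclass and the glob are hypothetical, and load_image_texts is assumed to be exposed as a classmethod on NDF subclasses:

from nostalgia.ndf import NDF

class Screenshots(NDF):  # hypothetical subclass for illustration
    pass

# OCR any new screenshots and return only the 5 most recent rows
df = Screenshots.load_image_texts("~/nostalgia_data/screenshots/*.png", nrows=5)
print(df[["time", "title"]])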
Example #2
import re
import just
import numpy as np
import pandas as pd
from nostalgia.ndf import NDF
import lxml.html
import diskcache
from auto_extract import parse_article
from nostalgia.nlp import nlp
from nostalgia.cache import get_cache
from nostalgia.data_loading import read_array_of_dict_from_json
from nostalgia.times import datetime_from_timestamp

CACHE = get_cache("chrome_history")

# def destroy_tree(tree):
#     node_tracker = {tree: [0, None]}

#     for node in tree.iterdescendants():
#         parent = node.getparent()
#         node_tracker[node] = [node_tracker[parent][0] + 1, parent]

#     node_tracker = sorted(
#         [(depth, parent, child) for child, (depth, parent) in node_tracker.items()],
#         key=lambda x: x[0],
#         reverse=True,
#     )

#     for _, parent, child in node_tracker:
#         if parent is None:
#             break
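The chrome_history cache created above follows the same read-through pattern as the other sources; a minimal sketch of that pattern (the key and value are illustrative):

url = "https://example.com"
if url in CACHE:
    parsed = CACHE[url]
else:
    parsed = {"title": "Example"}  # stand-in for real parsing work
    CACHE[url] = parsed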
Example #3
import os
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError
import dotenv
from nostalgia.cache import get_cache

CACHE = get_cache("google_custom_search")

dotenv.load_dotenv("google_custom_search/.env")
dotenv.load_dotenv(".env")

errored_count = 0


def google_custom_search(search_term, **kwargs):
    global errored_count
    search_term = search_term.lower()
    if search_term in CACHE:
        return CACHE[search_term]
    if errored_count > 4:
        # give up after repeated failures (e.g. quota exhausted)
        return []
    service = build("customsearch",
                    "v1",
                    developerKey=os.environ["MY_API_KEY"])
    try:
        res = service.cse().list(q=search_term,
                                 cx=os.environ["MY_CSE_ID"],
                                 **kwargs).execute()
    except HttpError as e:
        print("error", e)
        errored_count += 1
        return []
    # assumed completion of the truncated snippet: cache and return the hits
    items = res.get("items", [])
    CACHE[search_term] = items
    return items
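A minimal usage sketch for the search helper above, assuming MY_API_KEY and MY_CSE_ID are present in one of the loaded .env files:

if __name__ == "__main__":
    for item in google_custom_search("nostalgia data", num=3):
        print(item.get("title"), "->", item.get("link"))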
Example #4
import just

import pandas as pd
import lxml.html

from nostalgia.cache import get_cache
from nostalgia.ndf import NDF

from datetime import datetime
from nostalgia.times import tz

CACHE = get_cache("linked_google_search")


def get_linked_data(x):
    path = x["path"]
    if path in CACHE:
        return CACHE[path]
    try:
        html = just.read(path)
    except EOFError:
        CACHE[path] = None
        return None
    if not html.strip():
        CACHE[path] = None
        return None
    tree = lxml.html.fromstring(html)
    res = tree.xpath("//input[@name='q' and @type='text']")
    if not res:
        linked_data = None
    else:
        # assumed completion of the truncated snippet: take the query string
        # from the matched search input
        linked_data = res[0].get("value")
    CACHE[path] = linked_data
    return linked_data
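A sketch of how the helper above might be called; the row and its path are hypothetical:

row = {"path": "~/nostalgia_data/html/google_search_page.html"}
print(get_linked_data(row))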
Example #5
import json
from datetime import datetime
import just
import pandas as pd

from nostalgia.times import tz

from auto_extract import parse_article

from nostalgia.cache import get_cache
from nostalgia.ndf import NDF

from nostalgia.utils import normalize_name

CACHE = get_cache("linked_events")


def getter(dc, key, default=None):
    res = dc.get(key, default)
    if isinstance(res, list):
        res = res[0]
    elif isinstance(res, dict):
        res = json.dumps(res)
    return res
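A quick illustration of getter's normalization rules: scalars pass through, lists collapse to their first element, and dicts are serialized to JSON:

assert getter({"a": 1}, "a") == 1
assert getter({"a": [1, 2]}, "a") == 1
assert getter({"a": {"b": 2}}, "a") == '{"b": 2}'
assert getter({}, "a", default="n/a") == "n/a"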


def get_linked_data_jd(art):
    data = None
    try:
        jdata = art.jsonld
    except json.JSONDecodeError:
        # assumed completion of the truncated snippet: bail out on bad JSON-LD
        return data
Example #6
import json
from datetime import datetime
import just
import pandas as pd

from nostalgia.times import tz

from auto_extract import parse_article

from nostalgia.cache import get_cache
from nostalgia.ndf import NDF

from nostalgia.utils import normalize_name

CACHE = get_cache("linked_person")


def getter(dc, key, default=None):
    res = dc.get(key, default)
    if isinstance(res, list):
        res = res[0]
    elif isinstance(res, dict):
        res = json.dumps(res)
    return res


def get_linked_data_jd(art):
    data = None
    try:
        jdata = art.jsonld
    except json.JSONDecodeError:
        # assumed completion of the truncated snippet: bail out on bad JSON-LD
        return data
Example #7
import json
from datetime import datetime
from urllib.parse import urljoin
import just
import pandas as pd

from nostalgia.utils import parse_price
from nostalgia.times import tz
from nostalgia.nlp import nlp
from nostalgia.ndf import NDF

from auto_extract import parse_article
from nostalgia.sources.web.get_keywords_for_product import get_keywords_for_product

from nostalgia.cache import get_cache

CACHE = get_cache("linked_offers")


def getter(dc, key, default=None):
    res = dc.get(key, default)
    if isinstance(res, list):
        res = res[0]
    elif isinstance(res, dict):
        res = json.dumps(res)
    return res


from natura import Finder

finder = Finder()
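A hedged sketch of putting the natura Finder above to work on page text; the findall call and the shape of its results are assumptions about natura's API:

# assumption: Finder.findall scans free text for money mentions
for match in finder.findall("The gadget costs $19.99 plus shipping"):
    print(match)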
Example #8
import just

import urllib.parse

import pandas as pd

from auto_extract import parse_article

from nostalgia.cache import get_cache

from datetime import datetime
from nostalgia.times import tz
from nostalgia.ndf import NDF
from nostalgia.nlp import nlp

CACHE = get_cache("linked_data_videos")


def get_linked_data(x):
    path = x["path"]
    if path in CACHE:
        return CACHE[path]
    try:
        html = just.read(path)
    except EOFError:
        CACHE[path] = None
        return None
    if not html.strip():
        CACHE[path] = None
        return None
    art = parse_article(html, x["url"])
Example #9
File: api.py  Project: forksbot/nostalgia
import os
import requests
import dotenv
from nostalgia.cache import get_cache

CACHE = get_cache("darksky_weather")

dotenv.load_dotenv(".env")
dotenv.load_dotenv("~/nostalgia_data/.env")


def _historic_weather(latitude, longitude, epoch_time):
    q = f"{latitude},{longitude},{epoch_time}"
    if q in CACHE:
        return CACHE[q]
    key = os.environ["DARKSKY_WEATHER_KEY"]
    resp = requests.get(f"https://api.darksky.net/forecast/{key}/{q}?units=si")
    json_response = resp.json()
    if "error" not in json_response:
        CACHE[q] = json_response
    return json_response


def get_weather_at_nearest_hour(latitude, longitude, dt):
    day_timestamp = int(dt.replace(hour=0, minute=0, second=1).timestamp())
    json_response = _historic_weather(latitude, longitude, day_timestamp)
    t = dt.timestamp()
    try:
        return min([(abs(x["time"] - t), x)
                    for x in json_response["hourly"]["data"]])[1]
    except (IndexError, KeyError) as e:
        # assumed completion of the truncated snippet: no hourly data available
        print("error", e)
        return None
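A hedged usage sketch for the weather lookup above; it assumes DARKSKY_WEATHER_KEY is set in one of the loaded .env files and that the (now retired) Dark Sky endpoint still answers:

from datetime import datetime

hour = get_weather_at_nearest_hour(52.37, 4.90, datetime(2019, 6, 1, 14, 30))
if hour is not None:
    print(hour.get("temperature"), hour.get("summary"))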
Example #10
    # place_id = json_response.get("results", [{}])[0].get("place_id")
    country = geo_get_(json_response, "country")
    address = get_address(json_response)
    return {"city": city, "country": country, "formatted_address": address}


dotenv.load_dotenv("google/.env")
dotenv.load_dotenv(".env")

PYTHON_ENV = os.environ.get("PYTHON_ENV", "dev")
if PYTHON_ENV != "prod":
    KEY = None
else:
    KEY = os.environ.get("GOOGLE_API_KEY", None)

CACHE = get_cache("google_timeline")

DETAILS_URL = "https://maps.googleapis.com/maps/api/place/details/json"
NEARBY_URL = "https://maps.googleapis.com/maps/api/place/nearbysearch/json"

s = requests.Session()
# {'Boating', 'Cycling', 'Driving', 'Flying', 'In transit', 'Moving', 'On a bus', 'On a ferry', 'On a train', 'On a tram', 'On the subway', 'Running', 'Walking'}


def get_results(latlng, name, excluded_transport_names):
    if name in excluded_transport_names:
        return geo_get_info(latlng)
    near_result = get_nearby_results(latlng, name, excluded_transport_names)
    if near_result is None:
        return None
    details = get_details(near_result["place_id"])