def __init__(self):
    config_file = os.path.join(
        os.path.dirname(__file__), ".kairos", "config.json")
    if not config_file.endswith(".json"):
        raise ValueError(f"Not a json file: {config_file}")
    config_file = normalize_filepath(config_file)
    if not isfile(config_file):
        print(yellow(f"No such file: {config_file}"))
        reinitializing = input(green("Do you want me to create it? [y/N] "))
        if reinitializing.strip().lower().startswith('y'):
            makedirs(dirname(config_file), exist_ok=True)
            with open(config_file, 'w') as f:
                f.write(json.dumps(app_state))
            print(green("Initializing a new Kairos configuration file..."))
    self.path_to_config = config_file
    self.is_loaded, self.data, self.err = self.load_appstate()
    # Guard against a failed load: self.data may lack a 'timestamps' key.
    self.available_timestamps = list(self.data.get('timestamps', {}))
    self.last_used_format = (self.available_timestamps[0]
                             if self.is_loaded and self.available_timestamps
                             else None)
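
# Hypothetical usage sketch (not part of the original module): instantiating
# the class runs the bootstrap above, offering to create .kairos/config.json
# on demand and loading the app state into instance attributes.
def _demo_kairos_init():
    k = Kairos()
    print(k.path_to_config)        # e.g. <module dir>/.kairos/config.json
    print(k.is_loaded, k.err)      # (True, None) on a successful load
    print(k.available_timestamps)  # names of the configured timestamp formats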
def print_available_timestamps(self):
    msg = "Available timestamps are: \n\n"
    left_pad = " + "
    now = datetime.datetime.now()
    print(green(msg))
    all_ok = True
    for k in self.available_timestamps:
        template = self.data['timestamps'][k]
        ok, rendered, err = self.__render__(template, now)
        if ok:
            timestamp = rendered['timestamp']
            print(f"{left_pad}{blue(k)}\n Example: {cyan(timestamp)}\n\n")
        else:
            print(f"{red(err)}")
            # pp() prints directly and returns None (assuming a
            # prettyprinter-style helper), so no .print() call on its result.
            pp(rendered)
            all_ok = False
    return all_ok
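
# Hypothetical usage sketch (not part of the original module): render every
# configured format once and report whether all templates succeeded.
def _demo_list_timestamps():
    k = Kairos()
    all_ok = k.print_available_timestamps()
    print(green("OK") if all_ok else red("FAIL"))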
def extract_entities_with_allennlp(*s):
    model_url = "https://storage.googleapis.com/allennlp-public-models/ner-model-2020.02.10.tar.gz"
    global allennlp_model
    # Load the pretrained NER model once and cache it at module level.
    if "allennlp_model" not in globals() or not globals()["allennlp_model"]:
        print(yellow("[ model_init ]"), " :: Loading AllenNLP NER model...")
        cuda_device = 0 if torch.cuda.is_available() else -1
        allennlp_model = Predictor.from_path(model_url, cuda_device=cuda_device)
        print(
            yellow("[ model_init ] "),
            " :: CUDA initialized? ",
            # cuda_device is 0 (GPU) or -1 (CPU), so abs() indexes the label.
            [green("YES"), red("NO")][abs(cuda_device)],
        )
        print(yellow("[ model_init ] "), " :: Load complete.")
    print(yellow("[ model_predict ]"), " :: Extracting entities...")
    start = datetime.datetime.now()
    ents = []
    for i, part in enumerate(s):
        if not part:
            continue
        elif len(part) <= 36:
            # Pad very short inputs; the model is unreliable on tiny fragments.
            part = f"{part} . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . ."
        curr = []
        try:
            results = allennlp_model.predict(sentence=part)
        except Exception as e:
            print(yellow("[ model_predict ]"),
                  red(f" :: {e.__class__.__name__}! :: {e}"))
            continue
        # Tags follow the BIOUL scheme; keep only location (LOC) spans.
        for word, tag in zip(results["words"], results["tags"]):
            if not re.search(r"(LOC)", tag):
                continue
            elif word.startswith("'") and curr:
                # Re-attach contractions/possessives to the preceding token.
                curr[-1] += word
            else:
                curr.append(word)
            if tag[0] in "LU":  # L = last token of a span, U = unit-length span
                span = " ".join(curr)
                if len(span) >= 3:
                    ents.append(span)
                curr = []
    finish = datetime.datetime.now()
    elapsed = (finish - start).total_seconds()
    mins, secs = elapsed // 60, elapsed % 60
    human_readable = (
        f"{magenta(str(int(mins)).zfill(2))}m {blue(str(int(secs)).zfill(2))}s"
    )
    print(yellow("[ model_predict ]"), " :: Extraction complete.")
    print(yellow("[ model_predict ]"), " :: Elapsed time : ", human_readable)
    print("=" * 102)
    print(green(s))
    print("=" * 102)
    # Group spans case-insensitively; v[-1] ends up as the most common casing.
    cased = defaultdict(list)
    for ent in ents:
        cased[ent.lower()].append(ent)
    for k, v in cased.items():
        cased[k] = sorted(v, key=lambda x: v.count(x))
    freqs = {
        v[-1]: len(v)
        for v in sorted(cased.values(), key=len, reverse=True)
    }
    print(blue("Extracted entities:"))
    print(cyan(json.dumps(freqs, indent=4)))
    return freqs
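
# Hypothetical usage sketch (not part of the original module): the extractor
# takes any number of text fragments and returns {entity: frequency}, keyed by
# each entity's most common casing. Actual output depends on the model weights.
def _demo_extract_locations():
    freqs = extract_entities_with_allennlp(
        "The storm moved from Baton Rouge toward New Orleans.",
        "Officials in New Orleans issued an evacuation warning.",
    )
    # Expected shape: {"New Orleans": 2, "Baton Rouge": 1}
    return freqs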
shape = None
if prediction in county_shapes:
    shape = county_shapes[prediction]
elif prediction in state_shapes:
    shape = state_shapes[prediction]
elif prediction.startswith("District of Columbia"):
    # No dedicated D.C. shape; approximate with the hull of all state shapes.
    states = state_shapes.values()
    shape = cascaded_union(list(states)).convex_hull
else:
    print(f"No shape for prediction: {prediction}")
print(green(prediction), blue(shape))

# Scale the raw approval score by a confidence coefficient, then weight by
# the points available for this audience.
total_points = base_scores[audience]
base = approval_score
coeff = min(1.0, confidence / 100 + 0.2)
adjusted = base * coeff * total_points

print(f"{cyan(row['title'])}")
print(f"Predicted locale: {magenta(prediction)}")
print(f"Predicted audience: {magenta(row['audience'])}")
print(f"Original score: {magenta(row['score'])}")
print(f"Possible points: {red(total_points)}")
print(f"Raw score: {yellow(approval_score)}")
print(f"Geoconfidence: {green(confidence / 100)}")
print(f"Perplexity penalty: {blue(coeff)}")
print(f"Percent of total points awarded: {cyan(base * coeff)}")
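
# Worked example with hypothetical numbers (not from the original data): for a
# 100-point audience, a raw score of 0.8, and 70% geoconfidence,
# coeff = min(1.0, 0.70 + 0.2) = 0.90 and adjusted = 0.8 * 0.90 * 100 = 72.0.
def _demo_score_adjustment(approval_score=0.8, confidence=70, total_points=100):
    coeff = min(1.0, confidence / 100 + 0.2)
    return approval_score * coeff * total_points  # -> 72.0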
    except Exception as exc:
        # `except ... as` unbinds its target when the block exits, so capture
        # the exception under a separate name (assumes the elided try body
        # sets err = None on success).
        data = {}
        err = exc
    ok = err is None
    return ok, data, err


def create_timestamp(dt: datetime.datetime,
                     timestamp_format='human_fixedlength') -> str:
    k = Kairos()
    ts = k.create_timestamp(dt, timestamp_format)
    return ts


def parse_timestamp(ts: str, timestamp_format=None) -> datetime.datetime:
    if timestamp_format:
        k = Kairos()
        if timestamp_format in k.available_timestamps:
            template = k.load_template(timestamp_format)
            # strptime is a classmethod on datetime; plain strings have none.
            parsed = datetime.datetime.strptime(ts, template)
        else:
            parsed = datetime.datetime.strptime(ts, timestamp_format)
    else:
        # Fall back to dateutil-style fuzzy parsing.
        parsed = parse(ts)
    return parsed


if __name__ == '__main__':
    k = Kairos()
    print(k.available_timestamps)
    ok = k.print_available_timestamps()
    msg = green("OK") if ok else red("FAIL")
    now = datetime.datetime.now()
    ts = create_timestamp(now)
    print(f"Converted {yellow(now)} to {yellow(ts)}")
    print(msg)
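
# Hypothetical round-trip sketch (not part of the original module): for any
# format name in k.available_timestamps, create_timestamp and parse_timestamp
# should invert each other, modulo precision lost in formatting.
def _demo_roundtrip():
    now = datetime.datetime.now()
    ts = create_timestamp(now, 'human_fixedlength')
    parsed = parse_timestamp(ts)  # no format name: falls back to parse()
    print(ts, parsed)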
seen = set(
    row['url'] for row in db.query("select url from articles a;"))
print(f"Loaded {len(seen)} seen urls.")
rows = {
    row["url"]: row
    for row in db.query(
        "select * from spiderqueue where lastmod is not null order by lastmod desc limit 5000;"
    ) if row["url"] not in seen
}
print(f"Found {len(rows)} uncrawled urls...")
# Slicing never overruns the list, so no min() clamp is needed.
urls = list(rows.keys())[:LIMIT]
random.shuffle(urls)
print(
    green(f"[ process_queue ] :: Added {len(urls)} urls to the queue."))
responses = fetch_all_responses(urls, MAX_REQUESTS)
for url, res in responses.items():
    row = rows[url]
    row["prediction"] = "rejected"
    row["mod_status"] = "rejected"
    is_dumpsterfire = row["is_dumpsterfire"]
    if isinstance(res, str):
        row["ok"] = False
        if is_dumpsterfire:
            dumpsterfire.upsert(row, ["url"])
            print(
async def main(queue, limit=30):
    crawldb = db["us_metros2"]
    updates = []
    dups = defaultdict(list)
    seen = set()
    print("Initializing crawler....")

    async def fetch(name, url, pagetype, parent):
        seen.add(url)
        async with httpx.AsyncClient() as client:
            try:
                res = await client.get(url, headers=default_headers, timeout=10)
            except Exception as e:
                print(e.__class__.__name__, e, url)
                return
        print(f"Fetched {url}")
        # Check the status before building a DOM from an error page.
        if res.status_code != 200:
            return
        dom = fromstring(res.content)
        stub = {
            "page": name,
            "url": url,
            "type": pagetype,
            "parent_url": parent,
            "page_hrefs": [
                f"https://en.wikipedia.org{link}" for link in dom.xpath(
                    "//div[contains(@id,'mw-content-text')]//a/@href")
            ],
            "page_links": [
                link.attrib["title"] for link in dom.xpath(
                    "//div[contains(@id,'mw-content-text')]//a")
                # `is not None`: lxml elements with no children are falsey,
                # so a bare truth test would wrongly skip childless <a> tags.
                if link is not None and "title" in link.attrib
            ],
            "response": res.status_code,
            "ok": res.status_code == 200,
            "length": len(res.content),
            "category_hrefs": [
                f"https://en.wikipedia.org{link}" for link in dom.xpath(
                    "//div[contains(@id,'catlinks')]//a/@href")
            ],
            "category_links": [
                link.attrib["title"]
                for link in dom.xpath("//div[contains(@id,'catlinks')]//a")
                if link is not None and "title" in link.attrib
            ],
            "latitude": None,
            "longitude": None,
        }
        for colname, sel in xpath_selectors.items():
            if colname in ("latitude", "longitude"):
                result = [node.text_content() for node in dom.xpath(sel)]
                if result:
                    result = result[0]
                stub[colname] = result
        updates.append(stub)
        for href in stub["category_hrefs"] + stub["page_hrefs"]:
            if href and href in seen:
                dups[href].append(name)
            elif href and href not in seen and "Talk" not in href:
                page = href.split("/wiki/")[-1].replace("_", " ")
                queue.append((page, href, "link", parent))

    queue = deque(queue)
    while queue:
        async with trio.open_nursery() as nursery:
            for i in range(limit):
                next_url = None
                while next_url is None and queue:
                    page_name, _next, page_type, parent = queue.popleft()
                    if (_next not in seen and "User" not in _next
                            and "Talk" not in _next):
                        seen.add(_next)
                        next_url = _next
                if next_url is None:
                    # The queue drained while filtering; stop spawning tasks.
                    break
                nursery.start_soon(fetch, page_name, next_url, page_type,
                                   parent)
        if len(updates) > 500:
            print("Updating database...")
            crawldb.upsert_many(updates, ["url"])
            for item in updates:
                print(green(json.dumps(item, indent=4)))
            print(f"Inserted {len(updates)} items.")
            updates = []
    # Flush any final partial batch; batches under 500 would otherwise be lost.
    if updates:
        crawldb.upsert_many(updates, ["url"])
        print(f"Inserted {len(updates)} items.")
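
# Hypothetical entry point (not part of the original source): trio drives the
# async main() above; seed tuples have the shape (name, url, pagetype, parent).
if __name__ == "__main__":
    seeds = [(
        "List of United States urban areas",
        "https://en.wikipedia.org/wiki/List_of_United_States_urban_areas",
        "seed",
        None,
    )]
    trio.run(main, seeds)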