def get_normalized_ident(self, rtype, identifier):
    identifier_int = PostIdentifier(identifier.blog_name, int(identifier.id_))
    identifier_str = PostIdentifier(identifier.blog_name, str(identifier.id_))

    if identifier_int in self.cache[rtype]:
        self.cache["last_accessed_time"][identifier_int] = now_pst()
        return identifier_int

    if identifier_str in self.cache[rtype]:
        self.cache["last_accessed_time"][identifier_str] = now_pst()
        return identifier_str

    return None
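
# Hypothetical illustration (the variable names below are made up, not from the
# source): the same post may have been cached under an int id or a str id, so
# lookups try both forms and return whichever key actually exists, refreshing its
# last_accessed_time along the way.
#
#   ident = PostIdentifier("some-blog", "12345")  # id_ supplied as a str
#   key = response_cache.get_normalized_ident(CachedResponseType.POSTS, ident)
#   # -> PostIdentifier("some-blog", 12345) if the int form is cached,
#   #    PostIdentifier("some-blog", "12345") if the str form is, else None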
def compute_dynamic_mood_at_time(
    mood_inputs: pd.DataFrame,
    time: datetime = None,
    window_length_days: float = WINDOW_LENGTH_DAYS,  # pass None for unbounded
    system: DynamicMoodSystem = None,
    apply_daily_offset: bool = True,
) -> float:
    if system is None:
        system = DynamicMoodSystem()

    if time is None:
        time = now_pst()

    start_time = None
    if window_length_days is not None:
        start_time = time - pd.Timedelta(days=window_length_days)

    lti_series = compute_dynamic_mood_over_interval(
        mood_inputs=mood_inputs,
        start_time=start_time,
        end_time=time,
        system=system,
        apply_daily_offset=apply_daily_offset,
    )

    time_indexable = pd.Timestamp(time).round(f"{system.step_sec}s")
    return lti_series.loc[time_indexable]
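
# Hypothetical usage sketch (not from the original module): query the mood value
# at a past moment. `mood_inputs_df` is assumed to be the frame produced by
# compute_dynamic_mood_inputs(response_cache).
#
#   mood_an_hour_ago = compute_dynamic_mood_at_time(
#       mood_inputs=mood_inputs_df,
#       time=now_pst() - pd.Timedelta(hours=1),
#   )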
def remove_oldest(self, max_hours=18, dryrun=False):
    lat = self.cache["last_accessed_time"]
    existing_p = self.cache[CachedResponseType.POSTS]
    existing_n = self.cache[CachedResponseType.NOTES]
    existing_dpj = self.cache["dash_post_judgments"]

    last_allowed_time = now_pst() - timedelta(hours=max_hours)

    allowed_p = {pi for pi, t in lat.items() if t >= last_allowed_time}

    new_p = {pi: existing_p[pi] for pi in existing_p if pi in allowed_p}
    new_n = {pi: existing_n[pi] for pi in existing_n if pi in allowed_p}
    new_lat = {pi: lat[pi] for pi in lat if pi in allowed_p}
    new_dpj = {pi: existing_dpj[pi] for pi in existing_dpj if pi in allowed_p}

    before_len_p = len(existing_p)
    before_len_n = len(existing_n)
    before_len_lat = len(lat)
    before_len_dpj = len(existing_dpj)

    delta_len_p = before_len_p - len(new_p)
    delta_len_n = before_len_n - len(new_n)
    delta_len_lat = before_len_lat - len(new_lat)
    delta_len_dpj = before_len_dpj - len(new_dpj)

    verb = "would drop" if dryrun else "dropping"
    print(f"remove_oldest: {verb} {delta_len_p} of {before_len_p} POSTS")
    print(f"remove_oldest: {verb} {delta_len_n} of {before_len_n} NOTES")
    print(f"remove_oldest: {verb} {delta_len_lat} of {before_len_lat} last_accessed_time")
    print(f"remove_oldest: {verb} {delta_len_dpj} of {before_len_dpj} dash_post_judgments")

    if not dryrun:
        self.cache[CachedResponseType.POSTS] = new_p
        self.cache[CachedResponseType.NOTES] = new_n
        self.cache["last_accessed_time"] = new_lat
        self.cache["dash_post_judgments"] = new_dpj
def compute_rate_over_last_hours(post_payloads, avg_over_hours, now=None):
    if now is None:
        now = now_pst()

    delt = timedelta(hours=avg_over_hours)
    ts = now - delt

    n = count_posts_since_ts(post_payloads, ts)
    rate = n / delt.total_seconds()  # posts per second
    return n, rate
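
# Illustrative arithmetic (hypothetical numbers, not from the source): if
# count_posts_since_ts finds 6 posts in the last avg_over_hours=2 hours, then
#   rate = 6 / 7200 ≈ 8.3e-4 posts/sec,
# which review_rates below compares against the allowed max rate.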
def construct_prob_delta_prompts(thread: TumblrThread, needs_empty_reblog=True):
    if needs_empty_reblog:
        thread = add_empty_reblog(thread, 'DUMMYUSER', now_pst())

    prompt = npf_thread_to_formatted_text(thread, prob_delta_format=True)

    prompt_ref = prompt.splitlines()[-1]

    _, posts = expand_asks(thread)
    forbidden_strings = [" " + post.blog_name for post in posts[:-1]]

    return prompt, prompt_ref, forbidden_strings
def get_retention_stack_judgments(
        retention_stack,
        blog_name="nostalgebraist-autoresponder",  # TODO (cleanup): improve
        timestamp=None):
    from api_ml.ml_connector import (
        selection_proba_from_gpt,
        sentiment_logit_diffs_from_gpt,
        autoreview_proba_from_gpt,
    )
    from tumblr_to_text.nwo_munging import make_nwo_textpost_prompts

    if timestamp is None:
        timestamp = now_pst()

    if len(retention_stack) == 0:
        proba, logit_diffs, autoreview_proba = [], [], []
        return proba, logit_diffs, autoreview_proba

    base_texts = sorted(retention_stack)

    prompts, prompts_selector, prompts_autoreviewer, _ = make_nwo_textpost_prompts(
        blog_name=blog_name,
        timestamp=timestamp)

    selector_texts = [prompts_selector[prompts[0]] + c for c in base_texts]
    sentiment_texts = base_texts
    autoreviewer_texts = [prompts_autoreviewer[prompts[0]] + c for c in base_texts]

    proba = selection_proba_from_gpt(selector_texts)
    proba = do_all_coldstarts(base_texts, proba)

    logit_diffs = sentiment_logit_diffs_from_gpt(sentiment_texts)

    autoreview_proba = autoreview_proba_from_gpt(autoreviewer_texts)

    return proba, logit_diffs, autoreview_proba
def review_rates(post_payloads,
                 max_per_24h=250,
                 hour_windows=(1, 2, 4, 12,),
                 now=None,
                 max_rate=None):
    if not max_rate:
        max_rate = compute_max_rate_until_next_reset(post_payloads,
                                                     now=now,
                                                     max_per_24h=max_per_24h)

    if now is None:
        now = now_pst()

    reset_ts = post_limit_reset_ts(now=now)

    is_since_reset = [False for _ in hour_windows]

    hour_windows += ((now - reset_ts).total_seconds() / 3600,)
    is_since_reset.append(True)

    ns = []
    rates = []
    for h in hour_windows:
        n, rate = compute_rate_over_last_hours(post_payloads, avg_over_hours=h, now=now)
        ns.append(n)
        rates.append(rate)

    for h, n, r, isr in zip(hour_windows, ns, rates, is_since_reset):
        ratio = r / max_rate
        pieces = [
            f"last {float(h):<4.1f} hours:",
            f"{n:<3} posts",
            f"{ratio:<6.1%} of max rate",
        ]
        if isr:
            pieces = ["[since reset]"] + pieces
        else:
            pieces = [""] + pieces
        msg = "".join([f"{piece:<20}\t" for piece in pieces])
        print(msg)
def post_limit_reset_ts(now=None):
    # this assumes:
    #   - tumblr resets at midnight EST
    #   - frank is running in PST
    # TODO: revisit this if i'm on vacation or something
    if now is None:
        now = now_pst()

    one_day_ago = now - timedelta(days=1)

    reset_date = now.date() if now.hour >= 21 else one_day_ago.date()

    reset_ts = datetime.combine(reset_date, dtime(hour=21))
    return reset_ts
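
# Hypothetical worked example (not part of the original module). Midnight EST is
# 9 PM PST, hence the `hour >= 21` check: if it's already past 9 PM local (PST)
# time, the most recent reset was today at 21:00; otherwise it was yesterday.
#
#   post_limit_reset_ts(now=datetime(2023, 1, 5, 23, 30))  # -> datetime(2023, 1, 5, 21, 0)
#   post_limit_reset_ts(now=datetime(2023, 1, 5, 8, 15))   # -> datetime(2023, 1, 4, 21, 0)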
def compute_max_rate_until_next_reset(post_payloads, now=None, max_per_24h=250):
    if now is None:
        now = now_pst()

    reset_ts = post_limit_reset_ts(now=now)

    posts_since_last_reset = count_posts_since_ts(post_payloads, ts=reset_ts)

    n_remaining = max_per_24h - posts_since_last_reset

    next_reset_ts = reset_ts + timedelta(days=1)
    time_until_next_reset = next_reset_ts - now
    seconds_until_next_reset = time_until_next_reset.total_seconds()

    max_rate = n_remaining / seconds_until_next_reset  # posts per second
    return max_rate
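
# Illustrative arithmetic (numbers are made up, not from the source): with
# max_per_24h=250, if 200 posts have gone out since the last reset and the next
# reset is 6 hours away, then
#   n_remaining = 250 - 200 = 50
#   max_rate = 50 / (6 * 3600) ≈ 0.0023 posts/sec, i.e. roughly 8.3 posts/hour.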
def fetch_and_process(blog_name: str = bot_name,
                      n: Optional[int] = None,
                      offset: int = 0,
                      include_unused_types=False,
                      fetch_only=False,
                      process_only=False):
    with open("data/head_training_data_raw_posts.pkl.gz", "rb") as f:
        posts = pickle.load(f)

    max_ts_posix = max(pp["timestamp"] for pp in posts)
    max_ts = fromtimestamp_pst(max_ts_posix).isoformat()
    print(f"loaded {len(posts)} raw posts, max ts {max_ts}")

    lines = load()
    max_processed_id = max(line["id"] for line in lines)
    print(f"loaded {len(lines)} existing records, max id {max_processed_id}")

    if process_only:
        new_posts = [pp for pp in posts if pp["id"] > max_processed_id]
    else:
        pool = ClientPool()
        new_posts = fetch_posts(pool, blog_name, n, offset,
                                needs_private_client=True,
                                stop_at_id=max_processed_id)
        posts.extend(new_posts)

        print(f"saving {len(posts)} raw posts")
        with open("data/head_training_data_raw_posts.pkl.gz", "wb") as f:
            pickle.dump(posts, f)

    if fetch_only:
        return lines

    base_head_timestamp = now_pst()

    lines_new = [
        post_to_line_entry(pp, base_head_timestamp,
                           blog_name=blog_name,
                           include_unused_types=include_unused_types)
        for pp in tqdm(new_posts, mininterval=0.3, smoothing=0)
    ]

    lines.extend(lines_new)
    return lines
def on_post_creation_callback(self, api_response: dict, bridge_response: dict):
    t1 = time.time()

    entry = {"api__" + k: v for k, v in api_response.items()}
    entry.update(bridge_response)
    entry['timestamp_manual'] = now_pst().timestamp()

    for k in sorted(entry.keys()):
        if k not in self.logs["fields"]:
            print(f"on_post_creation_callback: adding field named {repr(k)}")
            self.logs = _add_field(self.logs, k)

    self.logs["data"].append(entry)
    self.save()

    t2 = time.time()
    print(f"on_post_creation_callback: took {t2-t1:.3f}s")
_Wherever possible_, it's better to modify posts as NPF and convert to text at the very end.

This file exists for the special cases where that's not possible.

TODO: one day, realize the dream of a fully invertible tumblr -> text -> tumblr converter...
"""
import re
from datetime import datetime

from tumblr_to_text.classic.autoresponder_static import DEFAULT_CSC, find_control_chars_forumlike
from tumblr_to_text.classic.autoresponder_static_v8 import (
    timestamp_to_v10_format, format_segment_v8_interlocutors)
from util.times import now_pst

now = now_pst()  # ensures same value in long-running jobs

orig_poster_regex = DEFAULT_CSC["ORIG_POST_CHAR_NAMED"].format(
    user_name="([^ ]*)")


def get_ccs_with_fixes(doc):
    extra_names = doc.split(" ")[1:2]
    if extra_names[0] == 'nostalgebraist-autoresponder':
        extra_names = []

    ccs = find_control_chars_forumlike(doc, extra_names=extra_names)

    # edge case
    if ccs[0][0].startswith("#1 nostalgebraist-autoresponder posted"):
        if ccs[1][0].startswith(" nostalgebraist-autoresponder posted"):
            ccs.pop(1)
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--dryrun", action="store_true")
    parser.add_argument("--hot-only", action="store_true")
    args = parser.parse_args()

    base_head_timestamp = now_pst()

    # trace
    print("loading trace logs")
    if args.hot_only:
        import persistence.traceability_singleton
        trace_logs = persistence.traceability_singleton.TRACE_LOGS.logs["data"]
    else:
        trace_logs = traceability.load_full_traceability_logs()["data"]
    print(f"loaded trace logs: {len(trace_logs)} rows")

    trace_logs = [
        row for row in trace_logs
        if row.get("requested__state") in {"draft", "queue"}
    ]
    print(f"subsetted trace logs to draft/queue: {len(trace_logs)} rows")

    required_keys = [
        "api__id",
        "prompt_autoreviewer",
        "choice_ix",
        "all_continuations",
        "timestamp_manual",
        "post_type",
        "state_reasons",
    ]

    keycounts = Counter()
    key_nonnull_counts = Counter()
    for row in trace_logs:
        for k in required_keys:
            keycounts[k] += (k in row)
            key_nonnull_counts[k] += (row.get(k) is not None)
    print(f"keycounts: {keycounts}\nkey_nonnull_counts: {key_nonnull_counts}")

    trace_logs = [
        row for row in trace_logs
        if all(row.get(k) is not None for k in required_keys)
    ]
    print(f"subsetted trace logs to nwo / usable: {len(trace_logs)} rows")

    # don't let the model learn from its own use of the rts tag
    trace_logs = [
        row for row in trace_logs
        if not row['state_reasons'].get('ml_rejected')
    ]
    print(f"removed model-rejected drafts from trace logs: {len(trace_logs)} rows")

    pool = ClientPool()

    current_queue = [
        pp['id']
        for pp in pool.get_private_client().queue('nostalgebraist-autoresponder')['posts']
    ]

    trace_logs = [
        row for row in trace_logs
        if row.get("api__id") not in current_queue
    ]
    print(f"removed currently queued posts: {len(trace_logs)} rows")

    trace_indices_to_texts = {}
    for i, row in enumerate(trace_logs):
        actual_timestamp = fromtimestamp_pst(row["timestamp_manual"])
        subbed = sub_prompt_timestamp(base_head_timestamp, actual_timestamp,
                                      row["prompt_autoreviewer"])
        trace_indices_to_texts[i] = subbed + row["all_continuations"][row["choice_ix"]]

    trace_map = defaultdict(list)
    for i, row in enumerate(trace_logs):
        trace_map[row["api__id"]].append(i)

    # pub
    print("loading pub logs")
    with open("data/head_training_data.json", "r", encoding="utf-8") as f:
        pub_logs = json.load(f)
    print(f"loaded pub logs: {len(pub_logs)} rows")

    for row in pub_logs:
        gid = row["genesis_post_id"]
        row["genesis_or_published_id"] = gid if gid is not None else row["id"]

    pub_map = defaultdict(list)
    for i, row in enumerate(pub_logs):
        pub_map[row["genesis_or_published_id"]].append(i)

    # match
    print("matching...")

    trace_indices_to_targets = {}
    trace_indices_to_published_ids = {}

    n_accept = 0
    n_reject = 0
    n_skip = 0
    n_multimatch = 0

    iter_ = tqdm(trace_map.items(), total=len(trace_map), mininterval=1, smoothing=0)

    for api__id, group_trace_indices in iter_:
        pub_gids_matching_trace_id = pub_map.get(api__id, [])

        if len(pub_gids_matching_trace_id) == 0:
            # never published
            for trace_index in group_trace_indices:
                trace_indices_to_targets[trace_index] = "reject"
                trace_indices_to_published_ids[trace_index] = None
            n_reject += len(group_trace_indices)
        else:
            if len(pub_gids_matching_trace_id) > 1:
                # ???
                n_multimatch += 1

            matching_pub_row = pub_logs[pub_gids_matching_trace_id[0]]

            # assumes trace is ordered by time -- i believe this is true
            pubd_ix = group_trace_indices[-1]

            if trace_logs[pubd_ix]['requested__state'] != 'queue':
                trace_indices_to_targets[pubd_ix] = "accept"
                trace_indices_to_published_ids[pubd_ix] = matching_pub_row["id"]
                n_accept += 1
            else:
                # queued posts i don't delete aren't signal
                trace_indices_to_targets[pubd_ix] = "skip"
                n_skip += 1

            for trace_index in group_trace_indices[:-1]:
                trace_indices_to_targets[trace_index] = "reject"
                trace_indices_to_published_ids[trace_index] = None
            n_reject += len(group_trace_indices) - 1

        iter_.set_postfix(n_accept=n_accept, n_reject=n_reject, n_skip=n_skip,
                          zz_n_multimatch=n_multimatch)

    # verify
    n_accept_verify = sum(v == "accept" for v in trace_indices_to_targets.values())
    n_reject_verify = sum(v == "reject" for v in trace_indices_to_targets.values())
    n_skip_verify = sum(v == "skip" for v in trace_indices_to_targets.values())

    print(f"\nn_accept: {n_accept_verify} vs {n_accept}")
    print(f"n_reject: {n_reject_verify} vs {n_reject}")
    print(f"n_skip: {n_skip_verify} vs {n_skip}")

    autoreview_train_data = []

    for ix in sorted(trace_indices_to_targets.keys()):
        if trace_indices_to_targets[ix] == 'skip':
            continue
        autoreview_train_data.append({
            "text": trace_indices_to_texts[ix],
            "target": trace_indices_to_targets[ix],
            "trace_api__id": trace_logs[ix]["api__id"],
            "pub_api__id": trace_indices_to_published_ids[ix],
            "post_type": trace_logs[ix]["post_type"],
        })

    if not args.dryrun:
        with open("data/autoreview_train_data.json", "w", encoding="utf-8") as f:
            json.dump(autoreview_train_data, f, indent=1)
def counterfactual_mood_graph(
    mood_inputs,
    determiner_centers,
    determiner_multipliers=None,
    n_days=1,
    start_time: datetime = None,
    end_time: datetime = None,
    window_length_days: float = WINDOW_LENGTH_DAYS,
    in_logit_diff_space: bool = True,
    pairs_only: bool = False,
    include_milestones: bool = True,
    system_kwargs=None,
) -> str:
    ytrans = pos_sent_to_logit_diff if in_logit_diff_space else lambda x: x

    if end_time is None:
        end_time = now_pst()
    if start_time is None:
        start_time = end_time - pd.Timedelta(days=n_days)

    systems = {"actual": DynamicMoodSystem()}
    if system_kwargs is not None:
        systems[repr(system_kwargs)] = DynamicMoodSystem(**system_kwargs)

    left_time = start_time - pd.Timedelta(days=window_length_days)

    for dc in determiner_centers:
        if dc is not None:
            new_dc_updates = {
                k: v for k, v in DETERMINER_CENTER_UPDATES.items() if k < left_time
            }
            new_dc_updates[left_time] = dc
            if not pairs_only:
                systems[f"dc={dc:.2f}"] = DynamicMoodSystem(
                    determiner_center_updates=new_dc_updates
                )

        if determiner_multipliers is not None:
            for dm in determiner_multipliers:
                new_dm_updates = {
                    k: v
                    for k, v in DETERMINER_MULTIPLIER_UPDATES.items()
                    if k < left_time
                }
                new_dm_updates[left_time] = dm

                dm_s = f"{dm*RESPONSE_SCALE_BASE:.3f}x"

                if dc is not None:
                    systems[f"dc={dc:.2f}, dm={dm_s}"] = DynamicMoodSystem(
                        determiner_center_updates=new_dc_updates,
                        determiner_multiplier_updates=new_dm_updates,
                    )

                if f"dm={dm_s}" not in systems:
                    if not pairs_only:
                        systems[f"dm={dm_s}"] = DynamicMoodSystem(
                            determiner_multiplier_updates=new_dm_updates
                        )

    lti_serieses = {}
    for name, system in tqdm(systems.items()):
        mood_inputs = system.set_centered_scaled_determiner(mood_inputs)
        lti_series = compute_dynamic_mood_over_interval(
            mood_inputs, left_time, end_time, system
        ).apply(ytrans)
        lti_series = lti_series.loc[start_time:end_time]
        lti_serieses[name] = lti_series

    plt.figure(figsize=(8, 6))

    tops, bottoms = [], []

    for name, lti_series in lti_serieses.items():
        tops.append(lti_series.max())
        bottoms.append(lti_series.min())

        ls = "-"
        if "dc=" in name and "dm=" in name:
            ls = "-."
        elif "dm=" in name:
            ls = "--"
        alpha = 1 if "actual" in name else 0.667
        plt.plot(lti_series.index, lti_series.values, label=name, ls=ls, alpha=alpha)

    print(tops)
    print(bottoms)

    colors = {
        "only_happy": "#000080",
        "only_non_sad": "#8888FF",
        "only_non_happy": "#FF6666",
        "only_sad": "#800000",
    }
    for k in ["only_happy", "only_non_sad", "only_non_happy", "only_sad"]:
        plt.axhline(
            ytrans(MOOD_NAME_TO_DYNAMIC_MOOD_VALUE_MAP[k]),
            ls="--",
            c=colors[k],
        )

    golives = {}
    if include_milestones:
        golives.update(MILESTONE_TIMES)
    golives.update(
        {ts: f"dc -> {dc:.2f}" for ts, dc in DETERMINER_CENTER_UPDATES.items()}
    )
    golives.update(
        {
            ts: f"dm -> {dm*RESPONSE_SCALE_BASE:.3f}x"
            for ts, dm in DETERMINER_MULTIPLIER_UPDATES.items()
        }
    )
    for golive, name in golives.items():
        if golive > start_time:
            c = "r" if name.startswith("v") else ("y" if name.startswith("dc") else "k")
            plt.axvline(golive, c=c, ls="-.", label=name)

    if in_logit_diff_space:
        default_top = (
            pos_sent_to_logit_diff(MOOD_NAME_TO_DYNAMIC_MOOD_VALUE_MAP["only_happy"])
            + 1.5
        )
        default_bottom = (
            pos_sent_to_logit_diff(MOOD_NAME_TO_DYNAMIC_MOOD_VALUE_MAP["only_sad"])
            - 1.5
        )
        plt.ylim(min(default_bottom, min(bottoms) - 1.5), max(default_top, max(tops) + 1.5))

    plt.legend(fontsize=12)
    plt.tick_params(labelsize=12)
    plt.tick_params(axis="x", labelrotation=60)
    plt.show()
def create_mood_graph(
    response_cache: ResponseCache,
    start_time: datetime = None,
    end_time: datetime = None,
    window_length_days: float = WINDOW_LENGTH_DAYS,
    system: DynamicMoodSystem = None,
    in_logit_diff_space: bool = True,
    font: str = "Menlo",
    save_image: bool = True,
    show_image: bool = False,
) -> str:
    ytrans = pos_sent_to_logit_diff if in_logit_diff_space else lambda x: x

    mood_inputs = compute_dynamic_mood_inputs(response_cache)

    lti_series = compute_dynamic_mood_over_interval(
        mood_inputs,
        start_time - pd.Timedelta(days=window_length_days),
        end_time,
        system,
    ).apply(ytrans)
    lti_series = lti_series.loc[start_time:end_time]

    fig = plt.figure(figsize=(8, 6))
    plt.plot(lti_series.index, lti_series.values, label="Mood", c="k")

    colors = {
        "only_happy": "#000080",
        "only_non_sad": "#8888FF",
        "only_non_happy": "#FF6666",
        "only_sad": "#800000",
    }
    display_names = {
        "only_sad": ":(",
        "only_non_happy": ":|",
        "only_non_sad": ":)",
        "only_happy": ":D",
    }
    for k in ["only_happy", "only_non_sad", "only_non_happy", "only_sad"]:
        plt.axhline(
            ytrans(MOOD_NAME_TO_DYNAMIC_MOOD_VALUE_MAP[k]),
            label=display_names[k],
            ls="--",
            c=colors[k],
            zorder=1.9,
        )

    if in_logit_diff_space:
        default_top = (
            pos_sent_to_logit_diff(MOOD_NAME_TO_DYNAMIC_MOOD_VALUE_MAP["only_happy"])
            + 1.5
        )
        default_bottom = (
            pos_sent_to_logit_diff(MOOD_NAME_TO_DYNAMIC_MOOD_VALUE_MAP["only_sad"])
            - 1.5
        )
        plt.ylim(
            min(default_bottom, lti_series.min() - 1.5),
            max(default_top, lti_series.max() + 1.5),
        )

    plt.legend(fontsize=16)
    plt.tick_params(labelsize=16)
    plt.tick_params(axis="x", labelrotation=80)
    plt.gca().xaxis.set_major_formatter(mdates.DateFormatter("%-I %p"))
    plt.tick_params(axis="x", labelrotation=70)
    plt.grid(axis="x")

    ax = plt.gca()
    for t in ax.get_xticklabels():
        t.set_fontname(font)
    for t in ax.get_yticklabels():
        t.set_fontname(font)
    for t in ax.legend_.texts:
        t.set_fontname(font)

    if save_image:
        image_name = now_pst().strftime("%Y-%m-%d-%H-%M-%S") + ".png"
        path = MOOD_IMAGE_DIR + image_name
        plt.savefig(path, bbox_inches="tight")
        plt.close(fig)
        return path

    if show_image:
        plt.show()
    else:
        plt.close(fig)
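
# Hypothetical usage sketch (not from the original module): render the last two
# days of mood and save the image under MOOD_IMAGE_DIR. start_time and end_time
# are used directly in arithmetic above, so callers are assumed to pass both.
#
#   path = create_mood_graph(
#       response_cache,
#       start_time=now_pst() - pd.Timedelta(days=2),
#       end_time=now_pst(),
#       system=DynamicMoodSystem(),
#   )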
def compute_dynamic_mood_over_interval(
    mood_inputs: pd.DataFrame,
    start_time: datetime = None,
    end_time: datetime = None,
    system: DynamicMoodSystem = None,
    apply_daily_offset: bool = True,
    forcing_system=False,
) -> pd.Series:
    if start_time is None:
        start_time = mood_inputs.index[0]

    if end_time is None:
        end_time = now_pst()

    if system is None:
        system = DynamicMoodSystem()

    # sentiment_centered = determiner - system.determiner_center_series(determiner)
    # sentiment_centered = (
    #     system.determiner_multiplier_series(sentiment_centered) * sentiment_centered
    # )
    sentiment_centered = mood_inputs["scaled_determiner"]

    if start_time > sentiment_centered.index.max():
        sentiment_centered.loc[start_time] = 0.

    sentiment_centered = sentiment_centered.loc[start_time:end_time]

    sentiment_centered_indexed = sentiment_centered.resample(f"{system.step_sec}s").sum()

    extra_ts_ix = pd.date_range(
        sentiment_centered_indexed.index[-1] + pd.Timedelta(seconds=system.step_sec),
        end_time + pd.Timedelta(seconds=system.step_sec),
        freq=f"{system.step_sec}s",
    )
    extra_ts = pd.Series(np.zeros(len(extra_ts_ix)), index=extra_ts_ix)

    sentiment_centered_indexed_extended = pd.concat(
        [sentiment_centered_indexed, extra_ts]
    ).sort_index()

    start_ts = sentiment_centered_indexed_extended.index[0]
    t = (
        sentiment_centered_indexed_extended.index - start_ts
    ).total_seconds().values / system.step_sec
    u = sentiment_centered_indexed_extended.values

    tout, y, x = lsim(
        system.lti_system if not forcing_system else system.forcing_system,
        u,
        t,
        interp=False,
    )

    lti_series = pd.Series(y, index=sentiment_centered_indexed_extended.index)

    if apply_daily_offset and not forcing_system:
        lti_series = apply_daily_mood_offset(lti_series)

    if not forcing_system:
        lti_series = lti_series.apply(logit_diff_to_pos_sent)

    return lti_series