def find_keyword_examples(filenames,
                          keywords,
                          n=10,
                          indorgs=None,
                          genders=None,
                          locations=None):
    """
    Sample up to n unique example posts per keyword from a set of processed files.
    """
    ## Find Matches
    mp = Pool(NUM_JOBS)
    helper = partial(get_posts,
                     keywords=keywords,
                     indorgs=indorgs,
                     genders=genders,
                     locations=locations)
    mp_matches = list(tqdm(mp.imap_unordered(helper, filenames),
                           total=len(filenames),
                           leave=False,
                           file=sys.stdout))
    mp.close()
    ## Sample
    mp_matches = pd.DataFrame(flatten(mp_matches))
    sample = []
    for keyword in keywords:
        mp_keyword_matches = mp_matches.loc[mp_matches["keyword"] == keyword]
        mp_keyword_matches = mp_keyword_matches.drop_duplicates("text")
        mp_keyword_matches = mp_keyword_matches.sample(min(n, len(mp_keyword_matches)),
                                                       random_state=42,
                                                       replace=False)
        sample.append(mp_keyword_matches)
    sample = pd.concat(sample).reset_index(drop=True)
    return sample

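## Example usage (a hedged sketch; the glob pattern, keyword list, and output path are
## illustrative, and NUM_JOBS / get_posts are assumed to be defined above in this module):
#
#   filenames = sorted(glob("./data/processed/twitter/*.json.gz"))
#   examples = find_keyword_examples(filenames,
#                                    keywords=["anxiety", "lonely"],
#                                    n=5)
#   examples.to_csv("./data/keyword_examples.csv", index=False)
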
def examine_matches(matches, match_key, match_type, query):
    """
    Pull out posts containing a query term and highlight each matched character
    span in the post text using angle brackets (e.g. "feeling <anxious> today").
    """
    relevant_matches = []
    for m in flatten(matches):
        if match_key not in m["matches"]:
            continue
        if match_type not in m["matches"][match_key]:
            continue
        m_res = m.get("matches").get(match_key).get(match_type)
        m_res_present = [(x[0], x[-1]) for x in m_res if x[0] == query]
        if len(m_res_present) > 0:
            m_text = m["text"]
            m_text_highlighted = ""
            start = 0
            end = len(m_text)
            for _, (term_start, term_end) in m_res_present:
                m_text_highlighted += m_text[start:term_start] + "<" + m_text[term_start:term_end] + ">"
                start = term_end
            if start != end:
                m_text_highlighted += m_text[start:end]
            m_copy = deepcopy(m)
            _ = m_copy.pop("matches", None)
            m_copy["text"] = m_text_highlighted
            relevant_matches.append(m_copy)
    return relevant_matches

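## Example usage (a hedged sketch; the "spans" key and the (term, (start, end))
## layout are illustrative, not a documented match format):
#
#   toy = [[{"text": "feeling anxious today",
#            "matches": {"clsp": {"spans": [("anxious", (8, 15))]}}}]]
#   examine_matches(toy, match_key="clsp", match_type="spans", query="anxious")
#   # -> [{"text": "feeling <anxious> today"}]
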
def tokenize_and_count(filename,
                       min_n=1,
                       max_n=1,
                       min_date=None,
                       max_date=None,
                       remove_retweets=True,
                       pretokenized=False):
    """
    Args:
        filename (str): Path to a processed data file
        min_n (int): Minimum n-gram size
        max_n (int): Maximum n-gram size
        min_date (None or int): Lower date bound for filtering posts
        max_date (None or int): Upper date bound for filtering posts
        remove_retweets (bool): Whether retweets should be dropped
        pretokenized (bool): Whether the file already contains tokenized text

    Returns:
        token_counts (Counter): Count of n-grams
    """
    ## Get Ngrams
    ngrams = load_and_tokenize(filename,
                               min_n=min_n,
                               max_n=max_n,
                               min_date=min_date,
                               max_date=max_date,
                               remove_retweets=remove_retweets,
                               pretokenized=pretokenized,
                               cache_dir=None)
    ## Count
    token_counts = Counter(flatten(ngrams))
    return token_counts

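## Example usage (a hedged sketch; the file path is illustrative and load_and_tokenize
## is assumed to be importable from this package):
#
#   bigram_counts = tokenize_and_count("./data/processed/2020-03-01.json.gz",
#                                      min_n=1,
#                                      max_n=2,
#                                      remove_retweets=True)
#   bigram_counts.most_common(10)
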
def plot_marginal_influence(scores_df, vc, vary_cols, metric, aggfunc=np.mean):
    """
    Plot the marginal influence of a single varied parameter (vc) on a metric,
    with one faint line per configuration of the remaining parameters and a
    bootstrapped average overlaid in black.
    """
    ## Get Relevant Data Aggregations
    if vc not in JOINT_PARAMS.keys():
        group_cols = [v for v in vary_cols if v != vc and v not in JOINT_PARAMS.keys()]
    else:
        group_cols = [v for v in vary_cols if v != vc and v not in flatten(JOINT_PARAMS.values())]
    grouped_scores = scores_df.groupby(["domain","group"] + group_cols + [vc])[metric].agg([aggfunc, np.std])
    grouped_scores_avg = scores_df.groupby(["domain","group",vc])[metric].agg(bootstrap_ci).to_frame()
    for i in range(3):
        grouped_scores_avg[i] = grouped_scores_avg[metric].map(lambda j: j[i])
    ## Generate Plot
    fig, ax = plt.subplots(2, 2, sharex=True, sharey=True)
    for d, domain in enumerate(["source","target"]):
        for g, group in enumerate(["train","development"]):
            pax = ax[d, g]
            pax_data = grouped_scores.loc[domain, group].reset_index().sort_values(vc)
            for opt, ind in pax_data.groupby(group_cols).groups.items():
                opt_data = pax_data.loc[ind]
                offset = np.random.normal(0, 0.01)
                pax.errorbar(np.arange(opt_data.shape[0]) + offset,
                             opt_data[aggfunc.__name__].values,
                             yerr=opt_data["std"].values if not np.isnan(opt_data["std"].values).all() else None,
                             color="C0",
                             alpha=0.05,
                             zorder=-1)
            pax.errorbar(np.arange(opt_data.shape[0]),
                         grouped_scores_avg.loc[domain, group][1].values,
                         yerr=np.vstack([(grouped_scores_avg.loc[domain, group][1] - grouped_scores_avg.loc[domain, group][0]).values,
                                         (grouped_scores_avg.loc[domain, group][2] - grouped_scores_avg.loc[domain, group][1]).values]),
                         color="black",
                         linewidth=2,
                         zorder=1,
                         capsize=2)
            pax.set_title(f"{domain.title()} - {group.title()}")
            pax.spines["right"].set_visible(False)
            pax.spines["top"].set_visible(False)
            if pax.get_ylim()[0] < 0:
                pax.set_ylim(bottom=0)
            if g == 0:
                pax.set_ylabel(metric)
            if d == 1:
                pax.set_xlabel(f"{vc} Type")
            pax.xaxis.set_major_locator(MaxNLocator(integer=True))
    fig.tight_layout()
    return fig, ax

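## Example usage (a hedged sketch; the parameter and metric names are illustrative and
## assume scores_df carries the "domain"/"group" columns used above):
#
#   fig, ax = plot_marginal_influence(scores_df,
#                                     vc="feature_set",
#                                     vary_cols=["feature_set", "norm", "C"],
#                                     metric="f1",
#                                     aggfunc=np.mean)
#   fig.savefig("./plots/marginal_influence_feature_set.png", dpi=150)
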
def get_unique_terms(matches):
    """
    Collect the unique set of matched terms for each keyword category across
    all processed files.
    """
    terms = {}
    for m in flatten(matches):
        for match_key, match_dict in m.get("matches").items():
            if "terms" not in match_dict:
                continue
            if match_key not in terms:
                terms[match_key] = set()
            for (term, _, _) in match_dict.get("terms"):
                terms[match_key].add(term)
    return terms

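## Example usage (a hedged sketch; mirrors the (term, start, end) tuples that the loop
## above unpacks, with illustrative keys and offsets):
#
#   toy = [[{"matches": {"clsp": {"terms": [("anxious", 8, 15)]}}}]]
#   get_unique_terms(toy)
#   # -> {"clsp": {"anxious"}}
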
    def tokenize_user_data(self, user_data):
        """
        Tokenize user data into separate sentences

        Args:
            user_data (list of str): Unique posts

        Returns:
            sentences (list of list of str): Posts, tokenized into sentences and words
        """
        ## Sentence Tokenization
        sentences = flatten(list(map(sent_tokenize, user_data)))
        ## Word Tokenization
        sentences = list(map(TOKENIZER.tokenize, sentences))
        sentences = list(filter(lambda x: len(x) > 0, sentences))
        return sentences

    def load_tokens(self, filename, n_samples=None):
        """
        Load Tokens (Assume date-based filter already completed)
        """
        ## Load Tokens
        file_data = []
        with gzip.open(filename, "r") as the_file:
            for line in the_file:
                file_data.append(json.loads(line))
        ## Post-level Sampling
        if n_samples is not None:
            file_data = self.loader._select_documents_randomly(file_data, n_samples)
        ## Flatten Sentences
        sentences = flatten([i["text"] for i in file_data])
        return sentences

#######################
### Reference Keywords
#######################

## Mental Health/Coronavirus Keywords
falconet_keywords = {}
falconet_keyword_dir = "./data/resources/falconet/"
for mhlist, mhfile in [("Crisis (Level 1)", "crisis_level1.keywords"),
                       ("Crisis (Level 2)", "crisis_level2.keywords"),
                       ("Crisis (Level 3)", "crisis_level3.keywords"),
                       ("SMHD", "smhd.keywords"),
                       ("CLSP", "pmi.keywords"),
                       ("Coronavirus", "corona_virus.keywords")]:
    mhkeys = list(map(lambda i: i.strip(), open(f"{falconet_keyword_dir}{mhfile}", "r").readlines()))
    mhkeys = sorted(set(flatten([[i, i.lower()] for i in mhkeys])))
    mhkeys = flatten([i, f"#{i}"] for i in mhkeys)
    falconet_keywords[mhlist] = mhkeys

## Reverse Mental Health Keyword List
falconet_keywords_reverse = dict()
for mhlist, terms in falconet_keywords.items():
    for t in terms:
        if t not in falconet_keywords_reverse:
            falconet_keywords_reverse[t] = []
        falconet_keywords_reverse[t].append(mhlist)

#######################
### Helpers
#######################
    return X, y

def sample_data(X, y, class_ratio=None, sample_size=None, random_seed=42):
    """
    Optionally rebalance the class distribution and downsample the data.
    """
    ## Rebalance Data
    X, y = _rebalance(X, y, class_ratio, random_seed)
    ## Downsample Data
    X, y = _downsample(X, y, sample_size, random_seed)
    return X, y

## Helper Function for Converting Count Data
term_expansion = lambda x, vocab: flatten([[vocab[i]] * int(x[0, i]) for i in x.nonzero()[1]])

def generate_corpus(Xs, Xt, vocab, source=True, target=True, ys=None, yt=None):
    """
    Convert source/target document-term matrices into a tomotopy Corpus.
    """
    corpus = tp.utils.Corpus()
    missing = {"source": [], "target": []}
    for i, x in tqdm(enumerate(Xs), total=Xs.shape[0], desc="Adding Source Documents", file=sys.stdout):
        if source:
            x_flat = term_expansion(x, vocab)
        else:
def find_matches(filename, level=DATE_RES, include_mentions=False):
    """
    Compare regex-based keyword matches against Falconet keyword annotations
    for a random sample of posts in a single file.
    """
    ## Initialize Sampler
    sampler = random.Random(SAMPLE_SEED)
    ## Search For Matches
    matches = []
    timestamps = []
    n = 0
    n_seen = 0
    with gzip.open(filename, "r") as the_file:
        for post in the_file:
            n += 1
            if sampler.uniform(0, 1) >= SAMPLE_RATE:
                continue
            else:
                n_seen += 1
            ## Load Data
            post_data = json.loads(post)
            ## Cache Timestamp
            timestamps.append(pd.to_datetime(post_data["date"]))
            ## Regex Version
            post_regex_matches = match_post(post_data, include_mentions=include_mentions)
            ## Falconet Matches
            falconet_terms = post_data.get("keywords")
            ## Continue if None
            if post_regex_matches is None and falconet_terms is None:
                continue
            else:
                if post_regex_matches:
                    if SKIP_COVID and "covid" in post_regex_matches.get("matches"):
                        continue
                    regex_terms = [list(row) for _, row in pd.DataFrame(
                        flatten([i["terms"] for i in post_regex_matches.get("matches").values()])
                    ).drop_duplicates(subset=[1, 2]).iterrows()]
                    regex_terms = filter_substrings(regex_terms)
                    regex_terms = sorted([f[0] for f in regex_terms])
                else:
                    regex_terms = []
                if not falconet_terms:
                    falconet_terms = []
                else:
                    falconet_terms = sorted(falconet_terms)
                matches.append({
                    "tweet_id": post_data.get("tweet_id"),
                    "date": pd.to_datetime(post_data.get("date")),
                    "text": post_data.get("text"),
                    "regex_keywords": regex_terms,
                    "falconet_keywords": falconet_terms
                })
    ## Format Timestamps
    timestamps = format_timestamps(timestamps, level)
    timestamps = Counter(timestamps)
    return filename, matches, n, n_seen, timestamps

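## Example usage (a hedged sketch; the file path and date resolution are illustrative,
## and SAMPLE_SEED, SAMPLE_RATE, SKIP_COVID, and DATE_RES are assumed to be set above):
#
#   filename, matches, n, n_seen, timestamps = find_matches("./data/processed/2020-04-01.json.gz",
#                                                           level="day",
#                                                           include_mentions=False)
#   print(f"Sampled {n_seen} of {n} posts; {len(matches)} had keyword matches.")
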
"covid": { "terms": create_regex_dict(COVID_TERMS["covid"]), "name": "COVID-19" } } ## Find Procesed Files filenames = sorted(glob(f"{DATA_DIR}*.json.gz")) ## Search For Keyword/Subreddit Matches filenames, matches, n, n_seen, timestamps = search_files( filenames, date_res=DATE_RES, include_mentions=False) ## Disagreement disagreements = [ i for i in flatten(matches) if i["regex_keywords"] != i["falconet_keywords"] ] disagreement_rate = len(disagreements) / len(flatten(matches)) * 100 print("Disagreement Rate: {:.3f}%".format(disagreement_rate)) ## Count Comparision regex_counts = pd.Series( Counter(flatten([i["regex_keywords"] for i in flatten(matches)]))).to_frame("regex") falconet_counts = pd.Series( Counter(flatten([i["falconet_keywords"] for i in flatten(matches)]))).to_frame("falconet") merged_counts = pd.concat([regex_counts, falconet_counts], axis=1, sort=True).fillna(0) outliers = np.log((merged_counts["regex"] + 0.01) /
### Load/Parse Matches
###################

## Sample Cache File
sample_cache_file = f"{CACHE_DIR}{PLATFORM}_keyword_samples_k-{NUM_SAMPLES_PER_TERM}.json"

## Run Sampling If Necessary
if not os.path.exists(sample_cache_file):
    ## Load Matches
    match_cache_dir = f"{CACHE_DIR}{PLATFORM}_{START_DATE}_{END_DATE}_matches/"
    filenames, matches, n, n_seen, timestamps = load_keyword_search_results(match_cache_dir)
    ## Unique Query Terms
    unique_terms = get_unique_terms(matches)
    unique_terms_df = pd.DataFrame(data=sorted(set(flatten(unique_terms.values()))), columns=["term"])
    unique_terms_df["keyword_group"] = [[]] * len(unique_terms_df)
    for term_group, terms in unique_terms.items():
        unique_terms_df["keyword_group"] = unique_terms_df.apply(
            lambda row: [term_group] + row["keyword_group"] if row["term"] in terms else row["keyword_group"],
            axis=1)
    ## Get Match Sizes
    match_sizes = {term: 0 for term in unique_terms_df["term"]}
    for match_set in matches:
        for post in match_set:
            for match_key, match_values in post.get("matches").items():
                if "terms" not in match_values:
                    continue
                terms_present = [t[0] for t in match_values.get("terms")]
                for t in terms_present:
                    if t not in match_sizes:
def main():
    """
    Run the simulation: generate synthetic data, fit LDA and PLDA topic models,
    train downstream classifiers, and score/serialize the results.
    """
    ######################
    ### Setup
    ######################
    ## Parse Command Line
    args = parse_arguments()
    ## Load Configuration
    config = Config(filepath=args.config)
    ## Output
    if config.output_dir is not None and not os.path.exists(config.output_dir):
        _ = os.makedirs(config.output_dir)
    ## Cache Config
    if config.output_dir is not None and config.run_id is not None:
        _ = os.system(f"cp {args.config} {config.output_dir}/{config.run_id}.config.json")
    ## Set Random State
    if config.random_state is not None:
        np.random.seed(config.random_state)
    ######################
    ### Data Generating Process
    ######################
    ## Generate Data
    X_latent, X, y, D, theta, phi = data_generating_process(config.N,
                                                            config.sigma_0,
                                                            config.p_domain,
                                                            config.gamma,
                                                            config.V,
                                                            config.theta,
                                                            config.coef,
                                                            beta=config.beta,
                                                            random_state=config.random_state)
    ## Data Distribution Plot
    if args.make_plots:
        fig, ax = fit_latent_regression(X_latent, y, D, config.coef)
        plt.show()
    ######################
    ### Fit Topic Models
    ######################
    ## Split Data into Training and Test
    train_ind = list(range(int(config.N * .8)))
    test_ind = list(range(int(config.N * .8), config.N))
    ## Generate Corpus
    train_corpus = tp.utils.Corpus()
    full_corpus = tp.utils.Corpus()
    ## Add Training Data
    for n in range(X.shape[0]):
        doc_n = doc_to_str(X[n])
        full_corpus.add_doc(doc_n, label=[str(D[n])])
        if n <= train_ind[-1]:
            train_corpus.add_doc(doc_n, label=[str(D[n])])
    assert len(train_corpus) == len(train_ind)
    ## Initialize Models (3 Topics Total)
    lda = tp.LDAModel(k=3,
                      corpus=train_corpus,
                      seed=config.random_state if config.random_state is not None else np.random.randint(1e6))
    plda = tp.PLDAModel(latent_topics=1,
                        topics_per_label=1,
                        corpus=train_corpus,
                        seed=config.random_state if config.random_state is not None else np.random.randint(1e6))
    ## Initialize Sampler
    lda.train(1)
    plda.train(1)
    ## Update Parameters based on Corpus
    V_nn = lda.num_vocabs
    ## MCMC Storage
    n_iter = max(config.n_iter_lda, config.n_iter_plda)
    likelihood = np.zeros((n_iter, 2)) * np.nan
    theta_lda = np.zeros((n_iter, config.N, 3)) * np.nan
    theta_plda = np.zeros((n_iter, config.N, 3)) * np.nan
    phi_lda = np.zeros((n_iter, 3, V_nn)) * np.nan
    phi_plda = np.zeros((n_iter, 3, V_nn)) * np.nan
    ## Word Count
    train_word_n = sum([len(d.words) for d in full_corpus[train_ind]])
    test_word_n = sum([len(d.words) for d in full_corpus[test_ind]])
    ## Train LDA Model
    for epoch in tqdm(range(config.n_iter_lda), desc="LDA Training"):
        lda.train(1)
        train_inf, train_ll = lda.infer(full_corpus[train_ind], iter=config.n_sample)
        test_inf, test_ll = lda.infer(full_corpus[test_ind], iter=config.n_sample)
        likelihood[epoch, 0] = train_ll.sum() / train_word_n
        theta_lda[epoch] = np.vstack(flatten([[d.get_topic_dist() for d in inf] for inf in [train_inf, test_inf]]))
        phi_lda[epoch] = np.vstack([lda.get_topic_word_dist(t) for t in range(lda.k)])
    ## Train PLDA Model
    for epoch in tqdm(range(config.n_iter_plda), desc="PLDA Training"):
        plda.train(1)
        train_inf, train_ll = plda.infer(full_corpus[train_ind], iter=config.n_sample)
        test_inf, test_ll = plda.infer(full_corpus[test_ind], iter=config.n_sample)
        likelihood[epoch, 1] = train_ll.sum() / train_word_n
        theta_plda[epoch] = np.vstack(flatten([[d.get_topic_dist() for d in inf] for inf in [train_inf, test_inf]]))
        phi_plda[epoch] = np.vstack([plda.get_topic_word_dist(t) for t in range(plda.k)])
    ## Plot Likelihood
    if args.make_plots:
        plt.figure(figsize=(10, 5.8))
        plt.plot(likelihood[:, 0], label="LDA")
        plt.plot(likelihood[:, 1], label="PLDA")
plt.xlabel("Training Epoch", fontweight="bold") plt.ylabel("Log Likelihood Per Word", fontweight="bold") plt.legend(loc="lower right") plt.tight_layout() plt.show() ## Plot Traces for Phi if args.make_plots: fig, axes = plt.subplots(phi_lda.shape[1], 2, figsize=(10, 5.8)) for m, (mphi, mdl) in enumerate(zip([phi_lda, phi_plda], ["LDA", "PLDA"])): ax = axes[:, m] for k in range(mphi.shape[1]): ax[k].plot(mphi[:, k, :]) ax[k].set_ylabel("Parameter Value", fontweight="bold") ax[k].spines["top"].set_visible(False) ax[k].spines["right"].set_visible(False) ax[k].set_xlabel("Training Epoch", fontweight="bold") ax[0].set_title(f"{mdl} $\\phi$ Trace", fontweight="bold") fig.tight_layout() plt.show() ## Plot Sample Traces for Theta if args.make_plots: fig, ax = plt.subplots(5, 2, sharex=False, figsize=(10, 5.8)) for d, doc in enumerate( sorted(np.random.choice(config.N, 5, replace=False))): ax[d, 0].plot(theta_lda[:, doc, :]) ax[d, 1].plot(theta_plda[:, doc, :]) for i in range(2): ax[d, i].spines["right"].set_visible(False) ax[d, i].spines["top"].set_visible(False) ax[d, i].set_title(f"Document {doc}", loc="left", fontstyle="italic") ax[d, i].set_ylabel("$\\theta$") for m, mdl in enumerate(["LDA", "PLDA"]): ax[-1, m].set_xlabel(f"{mdl} Training Epoch", fontweight="bold") fig.tight_layout() plt.show() ## Get Final Representations X_latent_lda = np.vstack([ d.get_topic_dist() for d in lda.infer( full_corpus, iter=config.n_sample, together=False)[0] ]) X_latent_plda = np.vstack([ d.get_topic_dist() for d in plda.infer( full_corpus, iter=config.n_sample, together=False)[0] ]) ## Isolate Latent Variables and Normalize X_latent_plda = X_latent_plda[:, -plda.latent_topics:] ## Fit Classifiers source_train_ind = sorted(set(train_ind) & set(np.where(D == 0)[0])) lr_lda = LogisticRegression() lr_lda.fit(X_latent_lda[source_train_ind], y[source_train_ind]) lr_plda = LogisticRegression() lr_plda.fit(X_latent_plda[source_train_ind], y[source_train_ind]) ## Make Test Predictions y_test_lda = lr_lda.predict_proba(X_latent_lda)[:, 1] y_test_plda = lr_plda.predict_proba(X_latent_plda)[:, 1] ## Score Predictions scores = score_model(y, y_test_lda, y_test_plda, D, test_ind, True) if config.output_dir is not None and config.run_id is not None: with open(f"{config.output_dir}/{config.run_id}.scores.json", "w") as the_file: json.dump(scores, the_file)
            ax[i, d].set_xlabel(f"Feature {i}", fontweight="bold")
        ax[i, 0].set_ylabel("Outcome", fontweight="bold")
    for a in ax:
        for b in a:
            b.spines["right"].set_visible(False)
            b.spines["top"].set_visible(False)
            b.axvline(0, color="black", alpha=0.5, linestyle="--")
    for i, t in enumerate(["Source Domain", "Target Domain"]):
        ax[0, i].set_title(t, fontweight="bold")
    ax[0, 0].legend(loc="lower right")
    fig.tight_layout()
    return fig, ax

## Helper Function
doc_to_str = lambda x: flatten([[str(i)] * int(j) for i, j in enumerate(x)])

## Scoring
def score_model(y, y_test_lda, y_test_plda, D, test_ind, verbose=True):
    """
    Score LDA and PLDA test-set predictions, both overall and within the
    source/target domains.
    """
    ## Score Cache
    scores = {"LDA": {}, "PLDA": {}}
    ## Printing
    if verbose:
        print("~~~~~~ Test Set Performance ~~~~~~")
    ## Cycle through Domain Groups
    for d, domain in enumerate(["Source", "Target", "Overall"]):
        ## Domain Indices