def find_keyword_examples(filenames,
                          keywords,
                          n=10,
                          indorgs=None,
                          genders=None,
                          locations=None):
    """

    """
    ## Find Matches
    mp = Pool(NUM_JOBS)
    helper = partial(get_posts,
                     keywords=keywords,
                     indorgs=indorgs,
                     genders=genders,
                     locations=locations)
    mp_matches = list(
        tqdm(mp.imap_unordered(helper, filenames),
             total=len(filenames),
             leave=False,
             file=sys.stdout))
    mp.close()
    ## Sample
    mp_matches = pd.DataFrame(flatten(mp_matches))
    sample = []
    for keyword in keywords:
        mp_keyword_matches = mp_matches.loc[mp_matches["keyword"] == keyword]
        mp_keyword_matches = mp_keyword_matches.drop_duplicates("text")
        mp_keyword_matches = mp_keyword_matches.sample(
            min(n, len(mp_keyword_matches)),
            random_state=42,
            replace=False)
        sample.append(mp_keyword_matches)
    sample = pd.concat(sample).reset_index(drop=True)
    return sample
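
## Hypothetical usage sketch (paths and keywords are illustrative; assumes the
## module-level helpers used above, e.g. get_posts and NUM_JOBS, are available):
# sample = find_keyword_examples(filenames=["./data/processed/2020-03-01.json.gz"],
#                                keywords=["anxiety", "lonely"],
#                                n=25)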

## Example #2

def examine_matches(matches,
                    match_key,
                    match_type,
                    query):
    """

    """
    relevant_matches = []
    for m in flatten(matches):
        if match_key not in m["matches"]:
            continue
        if match_type not in m["matches"][match_key]:
            continue
        m_res = m.get("matches").get(match_key).get(match_type)
        m_res_present = [(x[0], x[-1]) for x in m_res if x[0] == query]
        if len(m_res_present) > 0:
            m_text = m["text"]
            m_text_highlighted = ""
            start = 0
            end = len(m_text)
            for _, (term_start, term_end) in m_res_present:
                m_text_highlighted += m_text[start:term_start] + "<" + m_text[term_start:term_end] + ">"
                start = term_end
            if start != end:
                m_text_highlighted += m_text[start:end]
            m_copy = deepcopy(m)
            _ = m_copy.pop("matches",None)
            m_copy["text"] = m_text_highlighted
            relevant_matches.append(m_copy)
    return relevant_matches
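
## Hypothetical usage sketch: isolate posts whose "terms" matches under a given
## key include the query, with matched spans wrapped in <...> (the match_key
## value is illustrative):
# highlighted = examine_matches(matches, match_key="CLSP", match_type="terms", query="anxiety")
# for post in highlighted[:5]:
#     print(post["text"])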

## Example #3

def tokenize_and_count(filename,
                       min_n=1,
                       max_n=1,
                       min_date=None,
                       max_date=None,
                       remove_retweets=True,
                       pretokenized=False):
    """
    Args:
        filename (str):
        min_n (int)
        max_n (int)
        min_date (None or int)
        max_date (None or int)
        remove_retweets (bool)
        pretokenized (bool)
    
    Returns:
        token_counts (Counter): Count of n-grams
    """
    ## Get Ngrams
    ngrams = load_and_tokenize(filename,
                               min_n=min_n,
                               max_n=max_n,
                               min_date=min_date,
                               max_date=max_date,
                               remove_retweets=remove_retweets,
                               pretokenized=pretokenized,
                               cache_dir=None)
    ## Count
    token_counts = Counter(flatten(ngrams))
    return token_counts
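
## Hypothetical usage sketch (the path is illustrative):
# token_counts = tokenize_and_count("./data/processed/user.json.gz",
#                                   min_n=1,
#                                   max_n=2,
#                                   remove_retweets=True)
# token_counts.most_common(10)
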
def plot_marginal_influence(scores_df,
                            vc,
                            vary_cols,
                            metric,
                            aggfunc=np.mean):
    """

    """
    ## Get Relevant Data Aggregations
    if vc not in JOINT_PARAMS.keys():
        group_cols = [v for v in vary_cols if v != vc and v not in JOINT_PARAMS.keys()]
    else:
        group_cols = [v for v in vary_cols if v != vc and v not in flatten(JOINT_PARAMS.values())]
    grouped_scores = scores_df.groupby(["domain","group"] + group_cols + [vc])[metric].agg([aggfunc, np.std])
    grouped_scores_avg = scores_df.groupby(["domain","group",vc])[metric].agg(bootstrap_ci).to_frame()
    for i in range(3):
        grouped_scores_avg[i] = grouped_scores_avg[metric].map(lambda j: j[i])
    ## Generate Plot
    fig, ax = plt.subplots(2, 2, sharex=True, sharey=True)
    for d, domain in enumerate(["source","target"]):
        for g, group in enumerate(["train","development"]):
            pax = ax[d, g]
            pax_data = grouped_scores.loc[domain, group].reset_index().sort_values(vc)
            for opt, ind in pax_data.groupby(group_cols).groups.items():
                opt_data = pax_data.loc[ind]
                offset = np.random.normal(0,0.01)
                pax.errorbar(np.arange(opt_data.shape[0])+offset,
                             opt_data[aggfunc.__name__].values,
                             yerr=opt_data["std"].values if not np.isnan(opt_data["std"].values).all() else None,
                             color="C0",
                             alpha=0.05,
                             zorder=-1)
            pax.errorbar(np.arange(opt_data.shape[0]),
                         grouped_scores_avg.loc[domain, group][1].values,
                         yerr=np.vstack([(grouped_scores_avg.loc[domain, group][1]-grouped_scores_avg.loc[domain, group][0]).values,
                                         (grouped_scores_avg.loc[domain, group][2]-grouped_scores_avg.loc[domain, group][1]).values]),
                         color="black",
                         linewidth=2,
                         zorder=1,
                         capsize=2)
            pax.set_title(f"{domain.title()} - {group.title()}")
            pax.spines["right"].set_visible(False)
            pax.spines["top"].set_visible(False)
            if pax.get_ylim()[0] < 0:
                pax.set_ylim(bottom=0)
            if g == 0:
                pax.set_ylabel(metric)
            if d == 1:
                pax.set_xlabel(f"{vc} Type")
            pax.xaxis.set_major_locator(MaxNLocator(integer=True))
    fig.tight_layout()
    return fig, ax
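
## Hypothetical usage sketch (column and parameter names are illustrative;
## scores_df is expected to carry "domain", "group", the vary_cols, and the
## metric column):
# fig, ax = plot_marginal_influence(scores_df,
#                                   vc="C",
#                                   vary_cols=["C", "averaging", "norm"],
#                                   metric="f1",
#                                   aggfunc=np.mean)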

## Example #5

def get_unique_terms(matches):
    """

    """
    terms = {}
    for m in flatten(matches):
        for match_key, match_dict in m.get("matches").items():
            if "terms" not in match_dict:
                continue
            if match_key not in terms:
                terms[match_key] = set()
            for (term, _, _) in match_dict.get("terms"):
                terms[match_key].add(term)
    return terms
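
## Minimal sketch of the match structure get_unique_terms expects (keys and
## character spans are illustrative):
# toy_matches = [[{"matches": {"CLSP": {"terms": [("anxiety", 10, 17)]},
#                              "covid": {"name": "COVID-19"}}}]]
# get_unique_terms(toy_matches)  # -> {"CLSP": {"anxiety"}}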

## Example #6

    def tokenize_user_data(self,
                           user_data):
        """
        Tokenize user data into separate sentences

        Args:
            user_data (list of str): Unique posts
        
        Returns:
            sentences (list of str): Posts, tokenized into sentences and words
        """
        ## Sentence Tokenization
        sentences = flatten(list(map(sent_tokenize, user_data)))
        ## Word Tokenization
        sentences = list(map(TOKENIZER.tokenize, sentences))
        sentences = list(filter(lambda x: len(x) > 0, sentences))
        return sentences
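
    ## Hypothetical usage sketch (assumes an instance of the enclosing class):
    # sentences = self.tokenize_user_data(["I can't sleep. Everything feels heavy."])
    # # sent_tokenize yields two sentences; each becomes a list of word-level
    # # tokens from TOKENIZER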

## Example #7

    def load_tokens(self,
                    filename,
                    n_samples=None):
        """
        Load Tokens (Assume date-based filter already completed)
        """
        ## Load Tokens
        file_data = []
        with gzip.open(filename, "r") as the_file:
            for line in the_file:
                file_data.append(json.loads(line))
        ## Post-level Sampling
        if n_samples is not None:
            file_data = self.loader._select_documents_randomly(file_data,
                                                               n_samples)
        ## Flatten Sentences
        sentences = flatten([i["text"] for i in file_data])
        return sentences
#######################
### Reference Keywords
#######################

## Mental Health/Coronavirus Keywords
falconet_keywords = {}
falconet_keyword_dir = "./data/resources/falconet/"
for mhlist, mhfile in [("Crisis (Level 1)", "crisis_level1.keywords"),
                       ("Crisis (Level 2)", "crisis_level2.keywords"),
                       ("Crisis (Level 3)", "crisis_level3.keywords"),
                       ("SMHD", "smhd.keywords"), ("CLSP", "pmi.keywords"),
                       ("Coronavirus", "corona_virus.keywords")]:
    mhkeys = list(
        map(lambda i: i.strip(),
            open(f"{falconet_keyword_dir}{mhfile}", "r").readlines()))
    mhkeys = sorted(set(flatten([[i, i.lower()] for i in mhkeys])))
    mhkeys = flatten([i, f"#{i}"] for i in mhkeys)
    falconet_keywords[mhlist] = mhkeys

## Reverse Mental Health Keyword List
falconet_keywords_reverse = dict()
for mhlist, terms in falconet_keywords.items():
    for t in terms:
        if t not in falconet_keywords_reverse:
            falconet_keywords_reverse[t] = []
        falconet_keywords_reverse[t].append(mhlist)
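
## Example: look up which keyword lists a given term belongs to (the term is
## illustrative and may not appear in every keyword file):
# falconet_keywords_reverse.get("depression", [])  # -> list of keyword list names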

#######################
### Helpers
#######################
    return X, y


def sample_data(X, y, class_ratio=None, sample_size=None, random_seed=42):
    """

    """
    ## Rebalance Data
    X, y = _rebalance(X, y, class_ratio, random_seed)
    ## Downsample Data
    X, y = _downsample(X, y, sample_size, random_seed)
    return X, y
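
## Hypothetical usage sketch (class_ratio/sample_size semantics follow the
## _rebalance and _downsample helpers defined elsewhere in this module):
# X_sub, y_sub = sample_data(X, y, class_ratio=None, sample_size=5000, random_seed=42)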


## Helper Function for Converting Count Data
term_expansion = lambda x, vocab: flatten([[vocab[i]] * int(x[0, i])
                                           for i in x.nonzero()[1]])
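
## Example: expand a single sparse count row back into a token list (assumes
## scipy's csr_matrix and the module's flatten helper):
# from scipy.sparse import csr_matrix
# x = csr_matrix([[2, 0, 1]])      # counts for one toy document
# vocab = ["cat", "dog", "fish"]
# term_expansion(x, vocab)         # -> ["cat", "cat", "fish"]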


def generate_corpus(Xs, Xt, vocab, source=True, target=True, ys=None, yt=None):
    """

    """
    corpus = tp.utils.Corpus()
    missing = {"source": [], "target": []}
    for i, x in tqdm(enumerate(Xs),
                     total=Xs.shape[0],
                     desc="Adding Source Documents",
                     file=sys.stdout):
        if source:
            x_flat = term_expansion(x, vocab)
        else:
def find_matches(filename, level=DATE_RES, include_mentions=False):
    """

    """
    ## Initialize Sampler
    sampler = random.Random(SAMPLE_SEED)
    ## Search For Matches
    matches = []
    timestamps = []
    n = 0
    n_seen = 0
    with gzip.open(filename, "r") as the_file:
        for post in the_file:
            n += 1
            if sampler.uniform(0, 1) >= SAMPLE_RATE:
                continue
            else:
                n_seen += 1
                ## Load Data
                post_data = json.loads(post)
                ## Cache Timestamp
                timestamps.append(pd.to_datetime(post_data["date"]))
                ## Regex Version
                post_regex_matches = match_post(
                    post_data, include_mentions=include_mentions)
                ## Falconet Matches
                falconet_terms = post_data.get("keywords")
                ## Continue if None
                if post_regex_matches is None and falconet_terms is None:
                    continue
                else:
                    if post_regex_matches:
                        if SKIP_COVID and "covid" in post_regex_matches.get(
                                "matches"):
                            continue
                        regex_terms = [
                            list(row)
                            for _, row in pd.DataFrame(
                                flatten([i["terms"] for i in post_regex_matches.get("matches").values()])
                            ).drop_duplicates(subset=[1, 2]).iterrows()
                        ]
                        regex_terms = filter_substrings(regex_terms)
                        regex_terms = sorted([f[0] for f in regex_terms])
                    else:
                        regex_terms = []
                    if not falconet_terms:
                        falconet_terms = []
                    else:
                        falconet_terms = sorted(falconet_terms)
                    matches.append({
                        "tweet_id": post_data.get("tweet_id"),
                        "date": pd.to_datetime(post_data.get("date")),
                        "text": post_data.get("text"),
                        "regex_keywords": regex_terms,
                        "falconet_keywords": falconet_terms
                    })
    ## Format Timestamps
    timestamps = format_timestamps(timestamps, level)
    timestamps = Counter(timestamps)
    return filename, matches, n, n_seen, timestamps
    "covid": {
        "terms": create_regex_dict(COVID_TERMS["covid"]),
        "name": "COVID-19"
    }
}

## Find Processed Files
filenames = sorted(glob(f"{DATA_DIR}*.json.gz"))

## Search For Keyword/Subreddit Matches
filenames, matches, n, n_seen, timestamps = search_files(
    filenames, date_res=DATE_RES, include_mentions=False)

## Disagreement
disagreements = [
    i for i in flatten(matches)
    if i["regex_keywords"] != i["falconet_keywords"]
]
disagreement_rate = len(disagreements) / len(flatten(matches)) * 100
print("Disagreement Rate: {:.3f}%".format(disagreement_rate))

## Count Comparison
regex_counts = pd.Series(
    Counter(flatten([i["regex_keywords"]
                     for i in flatten(matches)]))).to_frame("regex")
falconet_counts = pd.Series(
    Counter(flatten([i["falconet_keywords"]
                     for i in flatten(matches)]))).to_frame("falconet")
merged_counts = pd.concat([regex_counts, falconet_counts], axis=1,
                          sort=True).fillna(0)
outliers = np.log((merged_counts["regex"] + 0.01) /

## Example #12

###################
### Load/Parse Matches
###################

## Sample Cache File
sample_cache_file = f"{CACHE_DIR}{PLATFORM}_keyword_samples_k-{NUM_SAMPLES_PER_TERM}.json"

## Run Sampling If Necessary
if not os.path.exists(sample_cache_file):

    ## Load Matches
    match_cache_dir = f"{CACHE_DIR}{PLATFORM}_{START_DATE}_{END_DATE}_matches/"
    filenames, matches, n, n_seen, timestamps = load_keyword_search_results(match_cache_dir)

    ## Unique Query Terms
    unique_terms = get_unique_terms(matches)
    unique_terms_df = pd.DataFrame(data=sorted(set(flatten(unique_terms.values()))),
                                   columns=["term"])
    unique_terms_df["keyword_group"] = [[]] * len(unique_terms_df)
    for term_group, terms in unique_terms.items():
        unique_terms_df["keyword_group"] = unique_terms_df.apply(lambda row: [term_group] + row["keyword_group"] if row["term"] in terms else row["keyword_group"], axis=1)
        
    ## Get Match Sizes
    match_sizes = {term:0 for term in unique_terms_df["term"]}
    for match_set in matches:
        for post in match_set:
            for match_key, match_values in post.get("matches").items():
                if "terms" not in match_values:
                    continue
                terms_present = [t[0] for t in match_values.get("terms")]
                for t in terms_present:
                    if t not in match_sizes:
def main():
    """

    """
    ######################
    ### Setup
    ######################
    ## Parse Command Line
    args = parse_arguments()
    ## Load Configuration
    config = Config(filepath=args.config)
    ## Output
    if config.output_dir is not None and not os.path.exists(config.output_dir):
        _ = os.makedirs(config.output_dir)
    ## Cache Config
    if config.output_dir is not None and config.run_id is not None:
        _ = os.system(
            f"cp {args.config} {config.output_dir}/{config.run_id}.config.json"
        )
    ## Set Random State
    if config.random_state is not None:
        np.random.seed(config.random_state)
    ######################
    ### Data Generating Process
    ######################
    ## Generate Data
    X_latent, X, y, D, theta, phi = data_generating_process(
        config.N,
        config.sigma_0,
        config.p_domain,
        config.gamma,
        config.V,
        config.theta,
        config.coef,
        beta=config.beta,
        random_state=config.random_state)
    ## Data Distribution Plot
    if args.make_plots:
        fig, ax = fit_latent_regression(X_latent, y, D, config.coef)
        plt.show()
    ######################
    ### Fit Topic Models
    ######################
    ## Split Data into Training and Test
    train_ind = list(range(int(config.N * .8)))
    test_ind = list(range(int(config.N * .8), config.N))
    ## Generate Corpus
    train_corpus = tp.utils.Corpus()
    full_corpus = tp.utils.Corpus()
    ## Add Training Data
    for n in range(X.shape[0]):
        doc_n = doc_to_str(X[n])
        full_corpus.add_doc(doc_n, label=[str(D[n])])
        if n <= train_ind[-1]:
            train_corpus.add_doc(doc_n, label=[str(D[n])])
    assert len(train_corpus) == len(train_ind)
    ## Initialize Models (3 Topics Total)
    lda = tp.LDAModel(k=3,
                      corpus=train_corpus,
                      seed=config.random_state if config.random_state
                      is not None else np.random.randint(1e6))
    plda = tp.PLDAModel(latent_topics=1,
                        topics_per_label=1,
                        corpus=train_corpus,
                        seed=config.random_state if config.random_state
                        is not None else np.random.randint(1e6))
    ## Initialize Sampler
    lda.train(1)
    plda.train(1)
    ## Update Parameters based on Corpus
    V_nn = lda.num_vocabs
    ## MCMC Storage
    n_iter = max(config.n_iter_lda, config.n_iter_plda)
    likelihood = np.zeros((n_iter, 2)) * np.nan
    theta_lda = np.zeros((n_iter, config.N, 3)) * np.nan
    theta_plda = np.zeros((n_iter, config.N, 3)) * np.nan
    phi_lda = np.zeros((n_iter, 3, V_nn)) * np.nan
    phi_plda = np.zeros((n_iter, 3, V_nn)) * np.nan
    ## Word Count
    train_word_n = sum([len(d.words) for d in full_corpus[train_ind]])
    test_word_n = sum([len(d.words) for d in full_corpus[test_ind]])
    ## Train LDA Model
    for epoch in tqdm(range(config.n_iter_lda), desc="LDA Training"):
        lda.train(1)
        train_inf, train_ll = lda.infer(full_corpus[train_ind],
                                        iter=config.n_sample)
        test_inf, test_ll = lda.infer(full_corpus[test_ind],
                                      iter=config.n_sample)
        likelihood[epoch, 0] = train_ll.sum() / train_word_n
        theta_lda[epoch] = np.vstack(
            flatten([[d.get_topic_dist() for d in inf]
                     for inf in [train_inf, test_inf]]))
        phi_lda[epoch] = np.vstack(
            [lda.get_topic_word_dist(t) for t in range(lda.k)])
    ## Train PLDA Model
    for epoch in tqdm(range(config.n_iter_plda), desc="PLDA Training"):
        plda.train(1)
        train_inf, train_ll = plda.infer(full_corpus[train_ind],
                                         iter=config.n_sample)
        test_inf, test_ll = plda.infer(full_corpus[test_ind],
                                       iter=config.n_sample)
        likelihood[epoch, 1] = train_ll.sum() / train_word_n
        theta_plda[epoch] = np.vstack(
            flatten([[d.get_topic_dist() for d in inf]
                     for inf in [train_inf, test_inf]]))
        phi_plda[epoch] = np.vstack(
            [plda.get_topic_word_dist(t) for t in range(plda.k)])
    ## Plot Likelihood
    if args.make_plots:
        plt.figure(figsize=(10, 5.8))
        plt.plot(likelihood[:, 0], label="LDA")
        plt.plot(likelihood[:, 1], label="PLDA")
        plt.xlabel("Training Epoch", fontweight="bold")
        plt.ylabel("Log Likelihood Per Word", fontweight="bold")
        plt.legend(loc="lower right")
        plt.tight_layout()
        plt.show()
    ## Plot Traces for Phi
    if args.make_plots:
        fig, axes = plt.subplots(phi_lda.shape[1], 2, figsize=(10, 5.8))
        for m, (mphi,
                mdl) in enumerate(zip([phi_lda, phi_plda], ["LDA", "PLDA"])):
            ax = axes[:, m]
            for k in range(mphi.shape[1]):
                ax[k].plot(mphi[:, k, :])
                ax[k].set_ylabel("Parameter Value", fontweight="bold")
                ax[k].spines["top"].set_visible(False)
                ax[k].spines["right"].set_visible(False)
            ax[k].set_xlabel("Training Epoch", fontweight="bold")
            ax[0].set_title(f"{mdl} $\\phi$ Trace", fontweight="bold")
        fig.tight_layout()
        plt.show()
    ## Plot Sample Traces for Theta
    if args.make_plots:
        fig, ax = plt.subplots(5, 2, sharex=False, figsize=(10, 5.8))
        for d, doc in enumerate(
                sorted(np.random.choice(config.N, 5, replace=False))):
            ax[d, 0].plot(theta_lda[:, doc, :])
            ax[d, 1].plot(theta_plda[:, doc, :])
            for i in range(2):
                ax[d, i].spines["right"].set_visible(False)
                ax[d, i].spines["top"].set_visible(False)
                ax[d, i].set_title(f"Document {doc}",
                                   loc="left",
                                   fontstyle="italic")
                ax[d, i].set_ylabel("$\\theta$")
        for m, mdl in enumerate(["LDA", "PLDA"]):
            ax[-1, m].set_xlabel(f"{mdl} Training Epoch", fontweight="bold")
        fig.tight_layout()
        plt.show()
    ## Get Final Representations
    X_latent_lda = np.vstack([
        d.get_topic_dist() for d in lda.infer(
            full_corpus, iter=config.n_sample, together=False)[0]
    ])
    X_latent_plda = np.vstack([
        d.get_topic_dist() for d in plda.infer(
            full_corpus, iter=config.n_sample, together=False)[0]
    ])
    ## Isolate Latent Variables and Normalize
    X_latent_plda = X_latent_plda[:, -plda.latent_topics:]
    ## Fit Classifiers
    source_train_ind = sorted(set(train_ind) & set(np.where(D == 0)[0]))
    lr_lda = LogisticRegression()
    lr_lda.fit(X_latent_lda[source_train_ind], y[source_train_ind])
    lr_plda = LogisticRegression()
    lr_plda.fit(X_latent_plda[source_train_ind], y[source_train_ind])
    ## Make Test Predictions
    y_test_lda = lr_lda.predict_proba(X_latent_lda)[:, 1]
    y_test_plda = lr_plda.predict_proba(X_latent_plda)[:, 1]
    ## Score Predictions
    scores = score_model(y, y_test_lda, y_test_plda, D, test_ind, True)
    if config.output_dir is not None and config.run_id is not None:
        with open(f"{config.output_dir}/{config.run_id}.scores.json",
                  "w") as the_file:
            json.dump(scores, the_file)
            ax[i, d].set_xlabel(f"Feature {i}", fontweight="bold")
            ax[i, 0].set_ylabel("Outcome", fontweight="bold")
    for a in ax:
        for b in a:
            b.spines["right"].set_visible(False)
            b.spines["top"].set_visible(False)
            b.axvline(0, color="black", alpha=0.5, linestyle="--")
    for i, t in enumerate(["Source Domain", "Target Domain"]):
        ax[0, i].set_title(t, fontweight="bold")
    ax[0, 0].legend(loc="lower right")
    fig.tight_layout()
    return fig, ax


## Helper Function
doc_to_str = lambda x: flatten([[str(i)] * int(j) for i, j in enumerate(x)])
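## Example: doc_to_str turns a vector of counts into a pseudo-document of
## token-id strings (assumes the module's flatten helper):
# doc_to_str([2, 0, 1])  # -> ["0", "0", "2"]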


## Scoring
def score_model(y, y_test_lda, y_test_plda, D, test_ind, verbose=True):
    """

    """
    ## Score Cache
    scores = {"LDA": {}, "PLDA": {}}
    ## Printing
    if verbose:
        print("~~~~~~ Test Set Performance ~~~~~~")
    ## Cycle through Domain Groups
    for d, domain in enumerate(["Source", "Target", "Overall"]):
        ## Domain Indices