import util
import argparse

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--file",
                        default=None,
                        type=str,
                        required=True,
                        help="")

    args = parser.parse_args()

    texts = util.read_file(args.file)
    sampled_texts = util.get_samples(texts, n_samples=20)

    print("Sampled texts")
    print("--------------------")
    for t in sampled_texts:
        print(t)
Example #2

import numpy as np
import scipy.stats

import util

# `color` (terminal formatting constants such as BOLD and END) is assumed to
# be provided by the surrounding module.
def weighted_ml_opt(X_train,
                    oracles,
                    ground_truth,
                    vae_0,
                    weights_type='dbas',
                    LD=20,
                    iters=20,
                    samples=500,
                    homoscedastic=False,
                    homo_y_var=0.1,
                    quantile=0.95,
                    verbose=False,
                    alpha=1,
                    train_gt_evals=None,
                    cutoff=1e-6,
                    it_epochs=10,
                    enc1_units=50):
    """
    Runs weighted maximum likelihood optimization for one of the algorithms
    'cbas', 'dbas', 'rwr', or 'cem-pi', selected via `weights_type`.
    """

    assert weights_type in ['cbas', 'dbas', 'rwr', 'cem-pi']
    L = X_train.shape[1]
    vae = util.build_vae(latent_dim=LD,
                         n_tokens=20,
                         seq_length=L,
                         enc1_units=enc1_units)

    traj = np.zeros((iters, 7))
    oracle_samples = np.zeros((iters, samples))
    gt_samples = np.zeros((iters, samples))
    oracle_max_seq = None
    oracle_max = -np.inf
    gt_of_oracle_max = -np.inf
    y_star = -np.inf

    for t in range(iters):
        ### Take Samples ###
        zt = np.random.randn(samples, LD)
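        # After the first iteration, candidate sequences are decoded from the
        # current VAE; on iteration 0 the original training data is used.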
        if t > 0:
            Xt_p = vae.decoder_.predict(zt)
            Xt = util.get_samples(Xt_p)
        else:
            Xt = X_train

        ### Evaluate ground truth and oracle ###
        yt, yt_var = util.get_balaji_predictions(oracles, Xt)
        if homoscedastic:
            yt_var = np.ones_like(yt) * homo_y_var
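        # Convert one-hot sequences to integer index arrays (amino-acid
        # indices) for the ground-truth model.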
        Xt_aa = np.argmax(Xt, axis=-1)
        if t == 0 and train_gt_evals is not None:
            yt_gt = train_gt_evals
        else:
            yt_gt = ground_truth.predict(Xt_aa, print_every=1000000)[:, 0]

        ### Calculate weights for different schemes ###
        if t > 0:
            if weights_type == 'cbas':
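                # CbAS: importance ratio between the prior decoder p0(x|z) and
                # the current decoder pt(x|z), times the oracle's probability
                # that the sample scores above the running threshold y_star.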
                log_pxt = np.sum(np.log(Xt_p) * Xt, axis=(1, 2))
                X0_p = vae_0.decoder_.predict(zt)
                log_px0 = np.sum(np.log(X0_p) * Xt, axis=(1, 2))
                w1 = np.exp(log_px0 - log_pxt)
                y_star_1 = np.percentile(yt, quantile * 100)
                if y_star_1 > y_star:
                    y_star = y_star_1
                w2 = scipy.stats.norm.sf(y_star, loc=yt, scale=np.sqrt(yt_var))
                weights = w1 * w2
            elif weights_type == 'cem-pi':
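                # CEM-PI: keep only the samples whose probability of improving
                # on the best training ground-truth value lies above the chosen
                # percentile of the current batch.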
                pi = scipy.stats.norm.sf(max_train_gt,
                                         loc=yt,
                                         scale=np.sqrt(yt_var))
                pi_thresh = np.percentile(pi, quantile * 100)
                weights = (pi > pi_thresh).astype(int)
            elif weights_type == 'dbas':
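                # DbAS: weight each sample by the oracle's probability that it
                # scores above the running threshold y_star.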
                y_star_1 = np.percentile(yt, quantile * 100)
                if y_star_1 > y_star:
                    y_star = y_star_1
                weights = scipy.stats.norm.sf(y_star,
                                              loc=yt,
                                              scale=np.sqrt(yt_var))
            elif weights_type == 'rwr':
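                # RWR: exponentiated oracle scores, normalized to sum to one.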
                weights = np.exp(alpha * yt)
                weights /= np.sum(weights)
        else:
            weights = np.ones(yt.shape[0])
            max_train_gt = np.max(yt_gt)

        yt_max_idx = np.argmax(yt)
        yt_max = yt[yt_max_idx]
        if yt_max > oracle_max:
            oracle_max = yt_max
            try:
                # Record the sequence that achieves the current oracle maximum.
                oracle_max_seq = util.convert_idx_array_to_aas(
                    Xt_aa[yt_max_idx:yt_max_idx + 1])[0]
            except IndexError:
                print(Xt_aa[yt_max_idx:yt_max_idx + 1])
            gt_of_oracle_max = yt_gt[yt_max_idx]

        ### Record and print results ##
        if t == 0:
            rand_idx = np.random.randint(0, len(yt), samples)
            oracle_samples[t, :] = yt[rand_idx]
            gt_samples[t, :] = yt_gt[rand_idx]
        if t > 0:
            oracle_samples[t, :] = yt
            gt_samples[t, :] = yt_gt

        traj[t, 0] = np.max(yt_gt)
        traj[t, 1] = np.mean(yt_gt)
        traj[t, 2] = np.std(yt_gt)
        traj[t, 3] = np.max(yt)
        traj[t, 4] = np.mean(yt)
        traj[t, 5] = np.std(yt)
        traj[t, 6] = np.mean(yt_var)

        if verbose:
            print(weights_type.upper(), t, traj[t, 0],
                  color.BOLD + str(traj[t, 1]) + color.END, traj[t, 2],
                  traj[t, 3], color.BOLD + str(traj[t, 4]) + color.END,
                  traj[t, 5], traj[t, 6])

        ### Train model ###
        if t == 0:
            vae.encoder_.set_weights(vae_0.encoder_.get_weights())
            vae.decoder_.set_weights(vae_0.decoder_.get_weights())
            vae.vae_.set_weights(vae_0.vae_.get_weights())
        else:
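            # Drop samples whose weight falls below the cutoff, then refit the
            # VAE on the remaining samples with per-sample weights.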
            cutoff_idx = np.where(weights < cutoff)
            Xt = np.delete(Xt, cutoff_idx, axis=0)
            yt = np.delete(yt, cutoff_idx, axis=0)
            weights = np.delete(weights, cutoff_idx, axis=0)
            vae.fit([Xt], [Xt, np.zeros(Xt.shape[0])],
                    epochs=it_epochs,
                    batch_size=10,
                    shuffle=False,
                    sample_weight=[weights, weights],
                    verbose=0)

    max_dict = {
        'oracle_max': oracle_max,
        'oracle_max_seq': oracle_max_seq,
        'gt_of_oracle_max': gt_of_oracle_max
    }
    return traj, oracle_samples, gt_samples, max_dict
Example #3
import random
import time

import setting
import util

if __name__ == '__main__':
    # initialize
    timestamp = time.strftime("%Y%m%d%H%M%S")
    answer_dict = {}
    real_image_dict = util.get_image_dict(setting.REAL_IMAGE_DIR)
    fake_image_list = util.get_image_list(setting.FAKE_IMAGE_DIR)[:5]
    random.shuffle(fake_image_list)

    # reset status
    current_index = 0
    current_real_samples = util.get_samples(fake_image_list[current_index],
                                            real_image_dict)
    selected_grid_idx = None
    fake_grid_idx = random.choice(range(setting.NUM_DISPLAY))
    keycode = 255

    while keycode != setting.VALID_KEYS['exit']:

        # grid keys
        if keycode in setting.GRID_KEYS:
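            # Keys '1'-'9' have ASCII codes 49-57, so subtracting 49 gives the
            # 0-based grid index.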
            selected_grid_idx = keycode - 49

        # operation keys
        if (keycode == setting.VALID_KEYS['process']
                and selected_grid_idx is not None):
            answer_dict[current_index] = selected_grid_idx == fake_grid_idx
            current_index += 1
Example #4

import argparse

import numpy as np
import spacy
from scipy import stats

import liwc_util
import util
import vader_negation_util

# `nlp` is assumed to be a spaCy pipeline; the model name below is a
# placeholder. `NEGATE`, `VADER_LEXICON_PATH`, `myprint`, and `datasets` are
# assumed to be defined elsewhere in the original module.
nlp = spacy.load("en_core_web_sm")
def compute_liwc(plot_data, plot_data_negation, data, dataset_name,
                 review_category, required_categories, result, class_id,
                 cluster_result, categories, category_reverse, analysis_types):

    data_filepath = data["data_filepath"]

    n_samples = None
    if "n_samples" in data:
        n_samples = data["n_samples"]

    reviews = util.read_file(data_filepath)
    selected_reviews = util.get_samples(reviews, n_samples)
    all_reviews_data = []
    for rev in selected_reviews:
        doc = nlp(rev)
        token_count = len(doc)
        review_data = {}
        sent_count = 0
        for sent in doc.sents:
            sent_count += 1
        negation_pos = 0
        negation_neg = 0
        for cat in required_categories:
            review_data[cat] = 0

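        # Count LIWC category hits per token: a trailing "*" in a pattern is a
        # prefix wildcard, otherwise the pattern must match the token exactly.
        # Positive/negative emotion hits preceded by a VADER negation word are
        # tallied separately.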
        for idx, token in enumerate(doc):
            for cat in required_categories:
                for pattern in cluster_result[category_reverse[cat]]:
                    if (pattern.endswith("*") and token.text.startswith(
                            pattern[:-1])) or (pattern == token.text):
                        review_data[cat] = review_data.get(cat, 0) + 1
                        if cat == "posemo" and idx > 0:
                            if doc[idx - 1].text in vader_negation_util.NEGATE:
                                negation_pos += 1
                        elif cat == "negemo" and idx > 0:
                            if doc[idx - 1].text in vader_negation_util.NEGATE:
                                negation_neg += 1

        review_data["total_no_of_tokens"] = token_count
        review_data["total_no_of_sents"] = sent_count
        review_data["negation_posemo"] = negation_pos
        review_data["negation_negemo"] = negation_neg

        all_reviews_data.append(review_data)

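    # Normalize counts at three granularities: per token (word_level), per
    # sentence (sent_level), and raw per-review counts (review_level).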
    category_counts = {}
    for cat in required_categories:
        category_counts["word_level"] = list(
            map(lambda x: 1.0 * x[cat] / x["total_no_of_tokens"],
                all_reviews_data))
        category_counts["sent_level"] = list(
            map(lambda x: 1.0 * x[cat] / x["total_no_of_sents"],
                all_reviews_data))
        category_counts["review_level"] = list(
            map(lambda x: x[cat], all_reviews_data))

        for a_type in analysis_types:
            plot_data[a_type].append({
                # "category": "negative - " + review_category + " review ",
                "review category": review_category,
                "liwc_category": cat,
                "name": dataset_name,
                "value": np.mean(category_counts[a_type]),
                "sem_value": stats.sem(category_counts[a_type]),
                "all_samples_data": category_counts[a_type]
            })

    category_counts = {}
    for cat in ["negation_posemo", "negation_negemo"]:
        category_counts["word_level"] = list(
            map(lambda x: 1.0 * x[cat] / x["total_no_of_tokens"],
                all_reviews_data))
        category_counts["sent_level"] = list(
            map(lambda x: 1.0 * x[cat] / x["total_no_of_sents"],
                all_reviews_data))
        category_counts["review_level"] = list(
            map(lambda x: x[cat], all_reviews_data))

        for a_type in analysis_types:
            plot_data_negation[a_type].append({
                # "category": "negative - " + review_category + " review ",
                "review category": review_category,
                "negation_category": cat,
                "name": dataset_name,
                "value": np.mean(category_counts[a_type]),
                "sem_value": stats.sem(category_counts[a_type]),
                "all_samples_data": category_counts[a_type]
            })

    return plot_data, plot_data_negation
        default="/data/LIWC2007/Dictionaries/LIWC2007_English100131.dic",
        type=str,
        help="")
    parser.add_argument("--n_samples", default=None, type=int, help="")
    parser.add_argument("--seed_val", default=23, type=int, help="")
    parser.add_argument("--device_no", default=2, type=int, help="")
    parser.add_argument("--filter_threshold", default=-1, type=int, help="")

    args = parser.parse_args()
    vader_sentiment_scores = vader_negation_util.read_vader_sentiment_dict(
        VADER_LEXICON_PATH)
    texts = util.read_file(args.input_file)
    if args.filter_threshold != -1:
        texts = util.filter_samples(texts, args.filter_threshold)

    samples = util.get_samples(texts, args.n_samples, args.seed_val)
    processed_texts = []
    sample_sentences = []

    count = 0
    neg_words = []
    neg_words.extend(NEGATE)
    if args.liwc:
        liwc_result, liwc_class_id, liwc_cluster_result, liwc_categories, liwc_category_reverse = liwc_util.load_liwc(
            args.liwc_filepath)

    for txt in samples:
        doc = nlp(txt)
        output_text = ""
        for sent in doc.sents:
            doc_sent = nlp(sent.text)

    negation_count_data = {}
    pos_negation_count_data = {}
    neg_negation_count_data = {}

    selected_samples = {}
    for data in datasets:
        myprint(data)
        selected_samples[data["name"]] = {}
        for category in ["positive", "negative"]:
            texts = util.read_file(data[category]["data_filepath"])
            n_samples = None
            if "n_samples" in data[category]:
                n_samples = data[category]["n_samples"]
            selected_texts = util.get_samples(texts, n_samples)
            selected_samples[data["name"]][category] = selected_texts

    plot_data = []
    plot_data_overall_negation = []

    for data in datasets:
        dep_data = {}
        for category in ["positive", "negative"]:
            dep_data[category] = []
            negation_count_data = []
            pos_negation_count_data = []
            neg_negation_count_data = []

            selected_texts = selected_samples[data["name"]][category]
            for text in selected_texts: