import util
import argparse

if __name__ == "__main__":
    # CLI entry point: read a text file (via the project `util` helpers) and
    # print a random sample of its entries.
    parser = argparse.ArgumentParser(
        description="Print a random sample of texts from a file.")
    # `default=None` was redundant alongside `required=True`; dropped.
    parser.add_argument("--file", type=str, required=True,
                        help="Path to the input text file.")
    # Generalized: the sample size was hard-coded to 20; it is now a CLI
    # option whose default preserves the original behavior.
    parser.add_argument("--n_samples", default=20, type=int,
                        help="Number of texts to sample (default: 20).")
    args = parser.parse_args()

    texts = util.read_file(args.file)
    sampled_texts = util.get_samples(texts, n_samples=args.n_samples)

    print("Sampled texts")
    print("--------------------")
    for t in sampled_texts:
        print(t)
def weighted_ml_opt(X_train, oracles, ground_truth, vae_0,
                    weights_type='dbas', LD=20, iters=20, samples=500,
                    homoscedastic=False, homo_y_var=0.1, quantile=0.95,
                    verbose=False, alpha=1, train_gt_evals=None,
                    cutoff=1e-6, it_epochs=10, enc1_units=50):
    """Run a weighted maximum-likelihood design loop ('cbas', 'dbas', 'rwr',
    or 'cem-pi').

    Each iteration samples sequences from the current VAE decoder, scores
    them with the oracle ensemble and the ground-truth model, computes
    scheme-specific sample weights, and refits the VAE on the weighted
    samples.

    Args:
        X_train: one-hot training sequences, shape (N, L, n_tokens);
            only shape[1] (sequence length) is read directly here.
        oracles: ensemble passed to util.get_balaji_predictions, which
            returns per-sample predictive mean and variance.
        ground_truth: model with .predict(idx_array, print_every=...)
            returning at least one column of scores.
        vae_0: pretrained prior VAE; supplies initial weights and, for
            'cbas', the prior decoder probabilities.
        weights_type: one of 'cbas', 'dbas', 'rwr', 'cem-pi'.
        LD: VAE latent dimensionality.
        iters: number of optimization iterations.
        samples: number of sequences sampled per iteration.
        homoscedastic / homo_y_var: if set, replace oracle variance with a
            constant.
        quantile: percentile (fraction) used for the 'cbas'/'dbas'/'cem-pi'
            thresholds.
        verbose: print a per-iteration summary line.
        alpha: RWR temperature.
        train_gt_evals: optional precomputed ground-truth scores for
            X_train (used at t == 0 to skip re-evaluation).
        cutoff: samples with weight below this are dropped before fitting.
        it_epochs: VAE training epochs per iteration.
        enc1_units: encoder hidden-layer size for the new VAE.

    Returns:
        traj: (iters, 7) array of per-iteration stats
            [gt max, gt mean, gt std, oracle max, oracle mean, oracle std,
             mean oracle variance].
        oracle_samples, gt_samples: (iters, samples) score records.
        max_dict: best oracle score, its sequence, and its ground-truth
            score.
    """
    assert weights_type in ['cbas', 'dbas', 'rwr', 'cem-pi']
    L = X_train.shape[1]
    # Fresh VAE to be optimized; initialized from vae_0 at t == 0 below.
    vae = util.build_vae(latent_dim=LD, n_tokens=20, seq_length=L,
                         enc1_units=enc1_units)
    traj = np.zeros((iters, 7))
    oracle_samples = np.zeros((iters, samples))
    gt_samples = np.zeros((iters, samples))
    # Running best (by oracle score) and the threshold y_star, which only
    # ever increases across iterations.
    oracle_max_seq = None
    oracle_max = -np.inf
    gt_of_oracle_max = -np.inf
    y_star = -np.inf
    for t in range(iters):
        ### Take Samples ###
        zt = np.random.randn(samples, LD)
        if t > 0:
            # Decode latent samples to per-position token probabilities,
            # then draw discrete one-hot sequences from them.
            Xt_p = vae.decoder_.predict(zt)
            Xt = util.get_samples(Xt_p)
        else:
            # First iteration scores the provided training set instead.
            Xt = X_train
        ### Evaluate ground truth and oracle ###
        yt, yt_var = util.get_balaji_predictions(oracles, Xt)
        if homoscedastic:
            yt_var = np.ones_like(yt) * homo_y_var
        # Integer (argmax) token indices for ground-truth evaluation.
        Xt_aa = np.argmax(Xt, axis=-1)
        if t == 0 and train_gt_evals is not None:
            # Reuse precomputed ground-truth scores for the training set.
            yt_gt = train_gt_evals
        else:
            yt_gt = ground_truth.predict(Xt_aa, print_every=1000000)[:, 0]
        ### Calculate weights for different schemes ###
        if t > 0:
            if weights_type == 'cbas':
                # Importance ratio p_0(x) / p_t(x) (prior over current
                # decoder), times survival probability above y_star.
                log_pxt = np.sum(np.log(Xt_p) * Xt, axis=(1, 2))
                X0_p = vae_0.decoder_.predict(zt)
                log_px0 = np.sum(np.log(X0_p) * Xt, axis=(1, 2))
                w1 = np.exp(log_px0 - log_pxt)
                y_star_1 = np.percentile(yt, quantile * 100)
                if y_star_1 > y_star:
                    y_star = y_star_1
                w2 = scipy.stats.norm.sf(y_star, loc=yt,
                                         scale=np.sqrt(yt_var))
                weights = w1 * w2
            elif weights_type == 'cem-pi':
                # Probability of improvement over the best ground-truth
                # value seen so far (max_train_gt is set at the end of the
                # previous iteration), hard-thresholded at its quantile.
                pi = scipy.stats.norm.sf(max_train_gt, loc=yt,
                                         scale=np.sqrt(yt_var))
                pi_thresh = np.percentile(pi, quantile * 100)
                weights = (pi > pi_thresh).astype(int)
            elif weights_type == 'dbas':
                # Survival probability above the (monotone) threshold only.
                y_star_1 = np.percentile(yt, quantile * 100)
                if y_star_1 > y_star:
                    y_star = y_star_1
                weights = scipy.stats.norm.sf(y_star, loc=yt,
                                              scale=np.sqrt(yt_var))
            elif weights_type == 'rwr':
                # Reward-weighted regression: softmax of oracle means.
                weights = np.exp(alpha * yt)
                weights /= np.sum(weights)
        else:
            # Uniform weights on the first iteration.
            weights = np.ones(yt.shape[0])
        max_train_gt = np.max(yt_gt)
        yt_max_idx = np.argmax(yt)
        yt_max = yt[yt_max_idx]
        if yt_max > oracle_max:
            oracle_max = yt_max
            try:
                # NOTE(review): this slice reads index yt_max_idx - 1, not
                # yt_max_idx, and yields an empty slice when yt_max_idx == 0
                # (likely what the IndexError guard is for) — looks like an
                # off-by-one; confirm against the sequence actually scored.
                oracle_max_seq = util.convert_idx_array_to_aas(
                    Xt_aa[yt_max_idx - 1:yt_max_idx])[0]
            except IndexError:
                print(Xt_aa[yt_max_idx - 1:yt_max_idx])
            gt_of_oracle_max = yt_gt[yt_max_idx]
        ### Record and print results ##
        if t == 0:
            # Subsample the (possibly larger) training set to keep the
            # record arrays a fixed width of `samples`.
            rand_idx = np.random.randint(0, len(yt), samples)
            oracle_samples[t, :] = yt[rand_idx]
            gt_samples[t, :] = yt_gt[rand_idx]
        if t > 0:
            oracle_samples[t, :] = yt
            gt_samples[t, :] = yt_gt
        traj[t, 0] = np.max(yt_gt)
        traj[t, 1] = np.mean(yt_gt)
        traj[t, 2] = np.std(yt_gt)
        traj[t, 3] = np.max(yt)
        traj[t, 4] = np.mean(yt)
        traj[t, 5] = np.std(yt)
        traj[t, 6] = np.mean(yt_var)
        if verbose:
            print(weights_type.upper(), t, traj[t, 0],
                  color.BOLD + str(traj[t, 1]) + color.END, traj[t, 2],
                  traj[t, 3],
                  color.BOLD + str(traj[t, 4]) + color.END,
                  traj[t, 5], traj[t, 6])
        ### Train model ###
        if t == 0:
            # Start the search VAE from the pretrained prior's weights.
            vae.encoder_.set_weights(vae_0.encoder_.get_weights())
            vae.decoder_.set_weights(vae_0.decoder_.get_weights())
            vae.vae_.set_weights(vae_0.vae_.get_weights())
        else:
            # Drop negligible-weight samples, then refit on the rest with
            # per-sample weights on both VAE outputs.
            cutoff_idx = np.where(weights < cutoff)
            Xt = np.delete(Xt, cutoff_idx, axis=0)
            yt = np.delete(yt, cutoff_idx, axis=0)
            weights = np.delete(weights, cutoff_idx, axis=0)
            vae.fit([Xt], [Xt, np.zeros(Xt.shape[0])],
                    epochs=it_epochs,
                    batch_size=10,
                    shuffle=False,
                    sample_weight=[weights, weights],
                    verbose=0)
    max_dict = {
        'oracle_max': oracle_max,
        'oracle_max_seq': oracle_max_seq,
        'gt_of_oracle_max': gt_of_oracle_max
    }
    return traj, oracle_samples, gt_samples, max_dict
import time import setting import util if __name__ == '__main__': # initialize timestamp = time.strftime("%Y%m%d%H%M%S") answer_dict = {} real_image_dict = util.get_image_dict(setting.REAL_IMAGE_DIR) fake_image_list = util.get_image_list(setting.FAKE_IMAGE_DIR)[:5] random.shuffle(fake_image_list) # reset status current_index = 0 current_real_samples = util.get_samples(fake_image_list[current_index], real_image_dict) selected_grid_idx = None fake_gird_idx = random.choice(range(setting.NUM_DISPLAY)) keycode = 255 while keycode != setting.VALID_KEYS['exit']: # grid keys if keycode in setting.GRID_KEYS: selected_grid_idx = keycode - 49 # operation keys if keycode == setting.VALID_KEYS[ 'process'] and selected_grid_idx is not None: answer_dict[current_index] = selected_grid_idx == fake_gird_idx current_index += 1
def compute_liwc(plot_data, plot_data_negation, data, dataset_name,
                 review_category, required_categories, result, class_id,
                 cluster_result, categories, category_reverse,
                 analysis_types):
    """Count LIWC-category hits and negated emotion words in sampled reviews.

    Reads reviews from ``data["data_filepath"]``, samples up to
    ``data["n_samples"]`` of them (all if absent), and for each review counts
    token matches against the LIWC patterns in ``cluster_result`` for every
    category in ``required_categories``.  A "posemo"/"negemo" hit whose
    preceding token is a VADER negation word is additionally counted as a
    negated emotion.  For each category and each analysis type, mean and
    standard error of the per-review counts — normalized per token, per
    sentence, or raw — are appended to ``plot_data`` (LIWC categories) and
    ``plot_data_negation`` (negation counts).

    Note: ``result``, ``class_id`` and ``categories`` are accepted for
    interface compatibility but are not used here.

    Returns:
        The (mutated) ``plot_data`` and ``plot_data_negation``.
    """
    data_filepath = data["data_filepath"]
    n_samples = data.get("n_samples")  # None -> util.get_samples default
    reviews = util.read_file(data_filepath)
    selected_reviews = util.get_samples(reviews, n_samples)

    all_reviews_data = []
    for rev in selected_reviews:
        doc = nlp(rev)
        token_count = len(doc)
        review_data = {cat: 0 for cat in required_categories}
        # Fix: initialize the negation counters once, before any loop.  They
        # were previously (re)set inside the sentence loop but consumed after
        # it, so a review segmented into zero sentences raised NameError.
        negation_pos = 0
        negation_neg = 0
        sent_count = 0
        for _sent in doc.sents:
            sent_count += 1
        for idx, token in enumerate(doc):
            for cat in required_categories:
                for pattern in cluster_result[category_reverse[cat]]:
                    # LIWC patterns ending in '*' are prefix wildcards;
                    # anything else must match the token exactly.
                    if (pattern.endswith("*")
                            and token.text.startswith(pattern[:-1])) \
                            or (pattern == token.text):
                        review_data[cat] = review_data.get(cat, 0) + 1
                        # An emotion hit directly preceded by a negation
                        # word counts as a negated emotion occurrence.
                        if cat == "posemo" and idx > 0:
                            if doc[idx - 1].text in vader_negation_util.NEGATE:
                                negation_pos += 1
                        elif cat == "negemo" and idx > 0:
                            if doc[idx - 1].text in vader_negation_util.NEGATE:
                                negation_neg += 1
        # NOTE(review): assumes every sampled review has at least one token
        # and one sentence; empty reviews would divide by zero below.
        review_data["total_no_of_tokens"] = token_count
        review_data["total_no_of_sents"] = sent_count
        review_data["negation_posemo"] = negation_pos
        review_data["negation_negemo"] = negation_neg
        all_reviews_data.append(review_data)

    # Per-category normalized counts -> plot_data.
    category_counts = {}
    for cat in required_categories:
        category_counts["word_level"] = [
            1.0 * x[cat] / x["total_no_of_tokens"] for x in all_reviews_data
        ]
        category_counts["sent_level"] = [
            1.0 * x[cat] / x["total_no_of_sents"] for x in all_reviews_data
        ]
        category_counts["review_level"] = [x[cat] for x in all_reviews_data]
        for a_type in analysis_types:
            plot_data[a_type].append({
                "review category": review_category,
                "liwc_category": cat,
                "name": dataset_name,
                "value": np.mean(category_counts[a_type]),
                "sem_value": stats.sem(category_counts[a_type]),
                "all_samples_data": category_counts[a_type]
            })

    # Negated-emotion normalized counts -> plot_data_negation.
    category_counts = {}
    for cat in ["negation_posemo", "negation_negemo"]:
        category_counts["word_level"] = [
            1.0 * x[cat] / x["total_no_of_tokens"] for x in all_reviews_data
        ]
        category_counts["sent_level"] = [
            1.0 * x[cat] / x["total_no_of_sents"] for x in all_reviews_data
        ]
        category_counts["review_level"] = [x[cat] for x in all_reviews_data]
        for a_type in analysis_types:
            plot_data_negation[a_type].append({
                "review category": review_category,
                "negation_category": cat,
                "name": dataset_name,
                "value": np.mean(category_counts[a_type]),
                "sem_value": stats.sem(category_counts[a_type]),
                "all_samples_data": category_counts[a_type]
            })
    return plot_data, plot_data_negation
default="/data/LIWC2007/Dictionaries/LIWC2007_English100131.dic", type=str, help="") parser.add_argument("--n_samples", default=None, type=int, help="") parser.add_argument("--seed_val", default=23, type=int, help="") parser.add_argument("--device_no", default=2, type=int, help="") parser.add_argument("--filter_threshold", default=-1, type=int, help="") args = parser.parse_args() vader_sentiment_scores = vader_negation_util.read_vader_sentiment_dict( VADER_LEXICON_PATH) texts = util.read_file(args.input_file) if args.filter_threshold != -1: texts = util.filter_samples(texts, args.filter_threshold) samples = util.get_samples(texts, args.n_samples, args.seed_val) processed_texts = [] sample_sentences = [] count = 0 neg_words = [] neg_words.extend(NEGATE) if args.liwc: liwc_result, liwc_class_id, liwc_cluster_result, liwc_categories, liwc_category_reverse = liwc_util.load_liwc( args.liwc_filepath) for txt in samples: doc = nlp(txt) output_text = "" for sent in doc.sents: doc_sent = nlp(sent.text)
VADER_LEXICON_PATH) negation_count_data = {} pos_negation_count_data = {} neg_negation_count_data = {} selected_samples = {} for data in datasets: myprint(data) selected_samples[data["name"]] = {} for category in ["positive", "negative"]: texts = util.read_file(data[category]["data_filepath"]) n_samples = None if "n_samples" in data[category]: n_samples = data[category]["n_samples"] selected_texts = util.get_samples(texts, n_samples) selected_samples[data["name"]][category] = selected_texts plot_data = [] plot_data_overall_negation = [] for data in datasets: dep_data = {} for category in ["positive", "negative"]: dep_data[category] = [] negation_count_data = [] pos_negation_count_data = [] neg_negation_count_data = [] selected_texts = selected_samples[data["name"]][category] for text in selected_texts: