def valid_roberta_model(arch):
    print(">>", arch)
    model_dir = join(MODELS_DIR, _DATASET_NAME, "holdout_source", arch)
    save_preds_dir = join(model_dir, f"{_VALID_OR_TEST}_preds")
    makedirs(save_preds_dir, exist_ok=True)

    for holdout_source in _DATADEF.domain_names:
        print(">>>>", holdout_source)
        save_preds_path = join(save_preds_dir, f"{holdout_source}.json")
        if exists(save_preds_path):
            continue

        # evaluate on all samples from the held-out source
        valid_samples = _DATADEF.load_splits_func([holdout_source], [_LOAD_SPLIT_NAME])[
            _LOAD_SPLIT_NAME
        ]
        valid_dataset = RobertaDataset(
            valid_samples,
            n_classes=_DATADEF.n_classes,
            domain_names=_DATADEF.domain_names,
            source2labelprops=_DATADEF.load_labelprops_func(_LOAD_SPLIT_NAME),
        )
        valid_loader = DataLoader(
            valid_dataset,
            batch_size=150,
            shuffle=True,
            num_workers=6,
        )

        checkpoint_path = join(model_dir, holdout_source, "checkpoint.pth")
        model = torch.load(checkpoint_path).to(AUTO_DEVICE)
        model.eval()

        id2results = {}
        with torch.no_grad():
            for batch in tqdm(valid_loader):
                outputs = model(batch)
                logits = outputs["logits"].detach().cpu().numpy()
                preds = np.argmax(logits, axis=1)
                labels = outputs["labels"].detach().cpu().numpy()
                for sample_id, pred, label in zip(batch["id"], preds, labels):
                    id2results[sample_id] = {
                        "pred": int(pred),
                        "label": int(label),
                        "correct": bool(pred == label),
                    }
        save_json(id2results, save_preds_path)
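# A minimal sketch (not part of the original scripts) of how the saved
# per-sample results could be aggregated into an accuracy score. It assumes
# only the {"pred", "label", "correct"} schema written by the two
# valid_*_model functions, plus modapt.utils.load_json as the counterpart
# to save_json.
from modapt.utils import load_json

def accuracy_from_saved_preds(preds_path):
    id2results = load_json(preds_path)
    if not id2results:
        return 0.0
    n_correct = sum(1 for r in id2results.values() if r["correct"])
    return n_correct / len(id2results)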
def valid_logreg_model(arch):
    print(">>", arch)
    config = load_logreg_model_config_all_archs(
        _DATADEF.n_classes, _DATADEF.n_sources
    )[arch]
    model_dir = join(LEXICON_DIR, _DATASET_NAME, "holdout_source", arch)
    save_preds_dir = join(model_dir, f"{_VALID_OR_TEST}_preds")
    makedirs(save_preds_dir, exist_ok=True)

    for holdout_source in _DATADEF.domain_names:
        print(">>>>", holdout_source)
        save_preds_path = join(save_preds_dir, f"{holdout_source}.json")
        if exists(save_preds_path):
            continue

        # evaluate on all samples from the held-out source
        valid_samples = _DATADEF.load_splits_func([holdout_source], [_LOAD_SPLIT_NAME])[
            _LOAD_SPLIT_NAME
        ]
        model = torch.load(join(model_dir, holdout_source, "model.pth"))
        batch = build_bow_full_batch(
            valid_samples,
            _DATADEF,
            get_all_tokens(valid_samples),
            read_txt_as_str_list(join(model_dir, holdout_source, "vocab.txt")),
            use_source_individual_norm=config["use_source_individual_norm"],
            labelprop_split=_LOAD_SPLIT_NAME,
        )

        model.eval()
        with torch.no_grad():
            outputs = model(batch)
        logits = outputs["logits"].detach().cpu().numpy()
        preds = np.argmax(logits, axis=1)
        labels = outputs["labels"].detach().cpu().numpy()
        ids = [s.id for s in valid_samples]

        id2results = {}
        for sample_id, pred, label in zip(ids, preds, labels):
            id2results[sample_id] = {
                "pred": int(pred),
                "label": int(label),
                "correct": bool(pred == label),
            }
        save_json(id2results, save_preds_path)
def process_category(p):
    category_name = basename(p).split("_5")[0]
    # print(category_name)

    with gzip.open(p, "r") as g:
        lines = [l for l in g]
    n_lines = len(lines)
    n_samples_to_keep = int(n_lines * _SUBSAMPLE_PROP)

    # rejection-sample until we have enough unique, fully-populated samples
    samples = {}
    while len(samples) < n_samples_to_keep:
        # randint is inclusive on both ends, so the upper bound must be n_lines - 1
        idx = randint(0, n_lines - 1)
        l = lines[idx]
        s = json.loads(l)
        if not all(k in s for k in _KEEP_KEYS):
            continue
        sample = {k: s[k] for k in _KEEP_KEYS}
        sample_id = f"{s['asin']}.{s['reviewerID']}"
        samples[sample_id] = sample
    save_json(samples, join(_DST_DATA_DIR, f"{category_name}.json"))

    all_sample_ids = list(samples.keys())
    shuffle(all_sample_ids)
    n_train = int(n_samples_to_keep * _TRAIN_PROP)
    n_valid = int(n_samples_to_keep * _VALID_PROP)
    n_test = n_samples_to_keep - n_train - n_valid
    save_json(all_sample_ids[:n_train], join(_SPLITS_DIR, f"{category_name}.train.json"))
    save_json(
        all_sample_ids[n_train:n_train + n_valid],
        join(_SPLITS_DIR, f"{category_name}.valid.json"),
    )
    save_json(all_sample_ids[-n_test:], join(_SPLITS_DIR, f"{category_name}.test.json"))
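# A minimal sketch (a hypothetical helper, not in the original scripts) of
# reading a category back together with one of its split id lists. It relies
# only on the {category}.json / {category}.{split}.json layout written by
# process_category above, plus modapt.utils.load_json.
from modapt.utils import load_json

def load_category_split(category_name, split):
    id2sample = load_json(join(_DST_DATA_DIR, f"{category_name}.json"))
    split_ids = load_json(join(_SPLITS_DIR, f"{category_name}.{split}.json"))
    return {sample_id: id2sample[sample_id] for sample_id in split_ids}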
    load_all_framing_samples,
)
from modapt.dataset.common import calculate_labelprops
from modapt.utils import save_json

makedirs(_LABELPROPS_DIR, exist_ok=True)

# primary frame
for split in ["train", "test"]:
    samples = load_all_framing_samples(ISSUES, split, "primary_frame")
    source2labelprops = calculate_labelprops(samples, len(PRIMARY_FRAME_NAMES), ISSUES)
    save_json(
        {issue: labelprops.tolist() for issue, labelprops in source2labelprops.items()},
        join(_LABELPROPS_DIR, f"primary_frame.{split}.json"),
    )

# primary tone
for split in ["train", "test"]:
    samples = load_all_framing_samples(ISSUES, split, "primary_tone")
    source2labelprops = calculate_labelprops(samples, len(PRIMARY_TONE_NAMES), ISSUES)
    save_json(
        {issue: labelprops.tolist() for issue, labelprops in source2labelprops.items()},
        join(_LABELPROPS_DIR, f"primary_tone.{split}.json"),
    )
for holdout_source in _DATADEF.domain_names:
    print(">>", holdout_source)
    logdir = join(_SAVE_DIR, holdout_source)
    makedirs(logdir, exist_ok=True)

    # evaluate on all "train" samples from the held-out source
    valid_samples = _DATADEF.load_splits_func([holdout_source], ["train"])["train"]

    num_correct = 0
    for s in tqdm(valid_samples):
        text = " ".join(get_tokens(s.text))
        score = _ANALYZER.polarity_scores(text)["compound"]
        is_correct = (
            (score > 0 and s.y_idx == 1)
            or (score < 0 and s.y_idx == 0)
            or (score == 0 and RNG.uniform(0, 1) > 0.5)  # break ties randomly
        )
        if is_correct:
            num_correct += 1

    acc = num_correct / len(valid_samples)
    # store accuracy under the same keys the trained models write to leaf_metrics.json
    metrics = {
        "valid_f1": acc,
        "valid_precision": acc,
        "valid_recall": acc,
    }
    save_json(metrics, join(logdir, "leaf_metrics.json"))

reduce_and_save_metrics(dirname(_SAVE_DIR))
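# For reference, a self-contained sketch of the scorer assumed above
# (_ANALYZER is taken to be vaderSentiment's SentimentIntensityAnalyzer):
# the compound score lies in [-1, 1], and the loop above reads positive
# values as positive sentiment.
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

analyzer = SentimentIntensityAnalyzer()
print(analyzer.polarity_scores("a surprisingly good movie")["compound"])  # > 0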
_SUBSAMPLE_SIZE = 10000
_POLARITY_TO_LABEL = {
    "positive": "pos",
    # "neutral": "pos",  # calling neutral positive would balance the classes
    "negative": "neg",
}

df = pd.read_csv(join(DATA_DIR, "sentiment", "raw", "airline", "Tweets.csv"))

idxs = RNG.sample(range(len(df)), _SUBSAMPLE_SIZE)
dataset_dict = {}
for idx in tqdm(idxs):
    row = df.iloc[idx]
    text = row[10]  # positional column: tweet text
    polarity = row[1]  # positional column: airline_sentiment
    if polarity not in _POLARITY_TO_LABEL:
        continue
    polarity = _POLARITY_TO_LABEL[polarity]
    tweet_id = row[0]  # positional column: tweet_id
    print(tweet_id, polarity, text)
    new_id = f"airline.{tweet_id}"
    dataset_dict[new_id] = {"id": new_id, "text": text, "polarity": polarity}

save_json(dataset_dict, join(DATA_DIR, "sentiment", "airline.json"))
use_lemmatize = config["use_lemmatize"]

metrics = {}

# run validation set
valid_metrics = eval_lexicon_model(
    model=model,
    datadef=_DATADEF,
    valid_samples=valid_samples,
    vocab=vocab,
    use_source_individual_norm=use_source_individual_norm,
    use_lemmatize=use_lemmatize,
    labelprop_split="train",
)
metrics.update(valid_metrics)

save_json(metrics, join(logdir, "leaf_metrics.json"))
write_str_list_as_txt(vocab, join(logdir, "vocab.txt"))
torch.save(model, join(logdir, "model.pth"))

# run test set
test_samples = _DATADEF.load_splits_func([holdout_source], ["test"])["test"]
test_metrics = eval_lexicon_model(
    model,
    _DATADEF,
    test_samples,
    vocab,
    use_source_individual_norm=config["use_source_individual_norm"],
    use_lemmatize=False,
    labelprop_split="test",
)
RNG = Random()
RNG.seed(RANDOM_SEED)

_SUBSAMPLE_SIZE = 10000
_POLARITY_TO_LABEL = {"positive": "pos", "negative": "neg"}

df = pd.read_csv(join(DATA_DIR, "sentiment", "raw", "imdb", "IMDB Dataset.csv"))

idxs = RNG.sample(range(len(df)), _SUBSAMPLE_SIZE)
dataset_dict = {}
for idx in tqdm(idxs):
    row = df.iloc[idx]
    text = row[0]  # positional column: review text
    polarity = row[1]  # positional column: sentiment label
    polarity = _POLARITY_TO_LABEL[polarity]
    # the raw reviews carry no id, so derive a short stable one from the text
    hasher = hashlib.sha1(text.encode())
    review_id = base64.urlsafe_b64encode(hasher.digest()[:6]).decode()
    print(review_id, polarity, text)
    new_id = f"imdb.{review_id}"
    dataset_dict[new_id] = {"id": new_id, "text": text, "polarity": polarity}

save_json(dataset_dict, join(DATA_DIR, "sentiment", "imdb.json"))
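# A quick check (illustrative only) of the id scheme above: sha1 of the text,
# first 6 digest bytes, URL-safe base64 -> a deterministic 8-character id.
import base64
import hashlib

def short_text_id(text):
    digest = hashlib.sha1(text.encode()).digest()
    return base64.urlsafe_b64encode(digest[:6]).decode()

assert short_text_id("same text") == short_text_id("same text")
assert len(short_text_id("same text")) == 8  # 6 bytes -> 8 base64 chars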
import pandas as pd
from config import DATA_DIR, RANDOM_SEED
from modapt.utils import ParallelHandler, load_json, save_json
from tqdm import tqdm

_TRAIN_PROP, _VALID_PROP, _TEST_PROP = [0.8, 0.1, 0.1]

_SRC_DATA_DIR = join(DATA_DIR, "sentiment")
_SPLITS_DIR = join(_SRC_DATA_DIR, "splits")

RNG = Random()
RNG.seed(RANDOM_SEED)

makedirs(_SPLITS_DIR, exist_ok=True)

raw_data_paths = sorted(glob.glob(join(_SRC_DATA_DIR, "*.json")))
for p in raw_data_paths:
    name = splitext(basename(p))[0]
    samples = load_json(p)
    ids = list(samples.keys())
    RNG.shuffle(ids)

    nsample = len(ids)
    n_train = int(nsample * _TRAIN_PROP)
    n_valid = int(nsample * _VALID_PROP)
    n_test = nsample - n_train - n_valid

    save_json(ids[:n_train], join(_SPLITS_DIR, f"{name}.train.json"))
    save_json(ids[n_train:n_train + n_valid], join(_SPLITS_DIR, f"{name}.valid.json"))
    save_json(ids[-n_test:], join(_SPLITS_DIR, f"{name}.test.json"))
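# An optional sanity check (illustrative, not in the original script): the
# three id lists written above should exactly partition each dataset's id
# set, since n_test is computed as the remainder after flooring the train
# and valid counts.
for p in raw_data_paths:
    name = splitext(basename(p))[0]
    all_ids = set(load_json(p).keys())
    train = load_json(join(_SPLITS_DIR, f"{name}.train.json"))
    valid = load_json(join(_SPLITS_DIR, f"{name}.valid.json"))
    test = load_json(join(_SPLITS_DIR, f"{name}.test.json"))
    assert set(train) | set(valid) | set(test) == all_ids
    assert len(train) + len(valid) + len(test) == len(all_ids)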
    # 3: "neg",
    4: "pos",
    5: "pos",
}

idxs = set(RNG.sample(range(_TOTAL_SAMPLES), _SUBSAMPLE_SIZE))

samples = []
g = gzip.open(_PATH, "r")
for i, l in enumerate(tqdm(g)):
    # sampled indices come from range(_TOTAL_SAMPLES), so stop once i reaches it
    if i >= _TOTAL_SAMPLES:
        break
    if i in idxs:
        # each line holds one review record as a python literal
        samples.append(eval(l))

dataset_dict = {}
for sample in tqdm(samples):
    text = sample["reviewText"]
    rating = int(sample["overall"])
    if rating not in _RATING_TO_LABEL:  # drops ratings excluded from the map above
        continue
    polarity = _RATING_TO_LABEL[rating]
    new_id = f"amazon.{sample['asin']}-{sample['reviewerID']}"
    dataset_dict[new_id] = {"id": new_id, "text": text, "polarity": polarity}

save_json(dataset_dict, join(DATA_DIR, "sentiment", "amazon.json"))
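# If eval on untrusted lines is a concern, a drop-in alternative (illustrative,
# not in the original script) is ast.literal_eval, which parses python-literal
# records like these without executing arbitrary code:
import ast

def parse_review_line(line):
    # gzip opened in binary mode yields bytes; decode before parsing
    return ast.literal_eval(line.decode() if isinstance(line, bytes) else line)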
    model,
    _DATADEF,
    train_samples,
    vocab,
    use_source_individual_norm,
    use_lemmatize,
    labelprop_split="train",
)
metrics.update(train_metrics)

# run validation set
valid_metrics = eval_lexicon_model(
    model=model,
    datadef=_DATADEF,
    valid_samples=valid_samples,
    vocab=vocab,
    use_source_individual_norm=use_source_individual_norm,
    use_lemmatize=use_lemmatize,
    labelprop_split="train",
)
metrics.update(valid_metrics)

save_json(metrics, join(trial_logdir, "leaf_metrics.json"))
write_str_list_as_txt(vocab, join(trial_logdir, "vocab.txt"))
torch.save(model, join(trial_logdir, "model.pth"))

save_json(config, join(_SAVE_DIR, "config.json"))

reduce_and_save_metrics(dirname(_SAVE_DIR))
reduce_and_save_metrics(dirname(_SAVE_DIR), "leaf_test.json", "mean_test.json")
}

# count per-year sample distribution per category
cat2years = defaultdict(list)
for cat, id2sample in cat2id2sample.items():
    for sample in id2sample.values():
        cat2years[cat].append(sample["year"])

fig, axs = plt.subplots(nrows=1, ncols=len(cat2years), figsize=(5 * len(cat2years), 5))
if len(cat2years) == 1:
    axs = [axs]  # subplots returns a bare Axes when ncols == 1
for ax, cat in zip(axs, cat2years):
    ax.hist(cat2years[cat])
    ax.set_title(cat)
plt.savefig(join(_DST_DATA_DIR, "years.png"))

makedirs(_SPLITS_DIR, exist_ok=True)
for cat, id2sample in cat2id2sample.items():
    save_json(id2sample, join(_DST_DATA_DIR, f"{cat}.json"))

    all_ids = list(id2sample.keys())
    shuffle(all_ids)
    n_train = int(len(all_ids) * _TRAIN_PROP)
    n_valid = int(len(all_ids) * _VALID_PROP)
    n_test = len(all_ids) - n_train - n_valid
    save_json(all_ids[:n_train], join(_SPLITS_DIR, f"{cat}.train.json"))
    save_json(
        all_ids[n_train:n_train + n_valid],
        join(_SPLITS_DIR, f"{cat}.valid.json"),
    )
    save_json(all_ids[-n_test:], join(_SPLITS_DIR, f"{cat}.test.json"))
reduce_and_save_metrics(_SAVE_DIR)
for e in range(_N_TRAIN_EPOCH):
    reduce_and_save_metrics(_SAVE_DIR, f"leaf_epoch_{e}.json", f"mean_epoch_{e}.json")

# setup and run test set
for holdout_source in _DATADEF.domain_names:
    save_metric_path = join(_SAVE_DIR, holdout_source, "leaf_test.json")
    if exists(save_metric_path):
        print(">> skip test", holdout_source)
        continue
    print(">> test", holdout_source)

    test_samples = _DATADEF.load_splits_func([holdout_source], ["test"])["test"]
    test_dataset = RobertaDataset(
        test_samples,
        n_classes=_DATADEF.n_classes,
        domain_names=_DATADEF.domain_names,
        source2labelprops=_DATADEF.load_labelprops_func("test"),
    )

    checkpointpath = join(_SAVE_DIR, holdout_source, "checkpoint.pth")
    model = torch.load(checkpointpath).to(AUTO_DEVICE)
    test_metrics = do_valid(model, test_dataset)
    save_json(test_metrics, save_metric_path)

reduce_and_save_metrics(_SAVE_DIR, "leaf_test.json", "mean_test.json")
# primary frame trainset: any sample not in the primary frame testset
# that has a non-null primary frame
trainsets["primary_frame"] = list({
    sample_id
    for sample_id, item in data.items()
    if (
        sample_id in ids
        and sample_id not in testsets["primary_frame"]
        and item["primary_frame"] != 0
        and item["primary_frame"] is not None
    )
})

# primary tone trainset: any sample not in the primary tone testset
# that has a non-null primary tone
trainsets["primary_tone"] = list({
    sample_id
    for sample_id, item in data.items()
    if (
        sample_id in ids
        and sample_id not in testsets["primary_tone"]
        and item["primary_tone"] != 0
        and item["primary_tone"] is not None
    )
})

save_json(trainsets, join(FRAMING_DATA_DIR, f"{issue}_train_sets.json"))

stat = {
    "raw": len(data),
}
stat.update({f"train_{setname}": len(set_ids) for setname, set_ids in trainsets.items()})
stat.update({f"test_{setname}": len(set_ids) for setname, set_ids in testsets.items()})
stats.append(stat)
for k, v in stat.items():
    print("--", k, v)
for holdout_source in _DATADEF.domain_names:
    table = holdout_source_to_table[holdout_source]
    result = mcnemar(table.T)
    results[holdout_source] = {
        "pvalue": result.pvalue,
        "statistic": result.statistic,
    }

all_result = mcnemar(fulltable.T)
results["all"] = {
    "pvalue": all_result.pvalue,
    "statistic": all_result.statistic,
}
results["fulltable"] = fulltable.tolist()

save_json(
    results,
    join(_OUTPUT_SAVE_DIR, f"mcnemars_{_DATASET_NAME}.{_ARCH1}@{_ARCH2}.json"),
)
print("mcnemars p", results["all"]["pvalue"])


# card, power analysis
def compute_power(prob_table, dataset_size, alpha=0.05, r=5000):
    """
    Dallas Card et al. "With Little Power Comes Great Responsibility"
    https://colab.research.google.com/drive/1anaS-9ElouZhUgCAYQt8jy8qBiaXnnK1?usp=sharing#scrollTo=OCz-VAm_ifqZ
    """
    if prob_table[0, 1] == prob_table[1, 0]:
        raise RuntimeError("Power is undefined when the true effect is zero.")
    pvals = []
    diffs = []
from os import makedirs
from os.path import join

from experiments.datadef.definitions.amazon import (
    CATEGORIES,
    LABELPROPS_DIR,
    RATING_N_CLASSES,
)
from modapt.dataset.amazon.samples import load_all_amazon_review_samples
from modapt.dataset.common import calculate_labelprops
from modapt.utils import save_json

makedirs(LABELPROPS_DIR, exist_ok=True)

for split in ["train", "valid", "test"]:
    samples = load_all_amazon_review_samples(CATEGORIES, split)
    source2labelprops = calculate_labelprops(samples, RATING_N_CLASSES, CATEGORIES)
    save_json(
        {source: labelprops.tolist() for source, labelprops in source2labelprops.items()},
        join(LABELPROPS_DIR, f"{split}.json"),
    )
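# For intuition, a minimal sketch of a per-source label-proportion computation.
# This is an assumption inferred from how calculate_labelprops is called and
# saved here (the real implementation lives in modapt.dataset.common); the
# sample attributes y_idx and domain_name are likewise assumed, the former
# following its use elsewhere in these scripts.
import numpy as np

def sketch_calculate_labelprops(samples, n_classes, source_names):
    counts = {source: np.zeros(n_classes) for source in source_names}
    for s in samples:
        counts[s.domain_name][s.y_idx] += 1
    # normalize counts to proportions, guarding against empty sources
    return {source: c / max(c.sum(), 1) for source, c in counts.items()}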
    config,
    _DATADEF,
    train_samples=train_samples,
    valid_samples=valid_samples,
    vocab_size=config["vocab_size"],
    logdir=join(savedir, train_source),
    train_labelprop_split="train",
    valid_labelprop_split="train",
)

model = torch.load(join(savedir, train_source, "model.pth"))
vocab = read_txt_as_str_list(join(savedir, train_source, "vocab.txt"))

test_samples = _DATADEF.load_splits_func(holdout_sources, ["test"])["test"]
test_metrics = eval_lexicon_model(
    model,
    _DATADEF,
    test_samples,
    vocab,
    use_lemmatize=False,
    use_source_individual_norm=config["use_source_individual_norm"],
    labelprop_split="test",
)
save_json(test_metrics, join(savedir, train_source, "leaf_test.json"))

save_json(config, join(savedir, "config.json"))

reduce_and_save_metrics(_SAVE_ROOT)
reduce_and_save_metrics(_SAVE_ROOT, "leaf_test.json", "mean_test.json")
        )
        return metrics["valid_f1"]

    for ti in range(_N_TRIALS):
        selected_sample = all_samples[ti * nsample : (ti + 1) * nsample]
        for label_est_samples, valid_samples in _2fold(selected_sample):
            acc = _eval_lex_model(label_est_samples, valid_samples)
            source2type2accs[source]["selected"].append(acc)
        fullacc = _eval_lex_model(selected_sample, all_samples)
        source2type2accs[source]["full"].append(fullacc)

    lexicon_model_perf[str(nsample)] = dict(source2type2accs)
    save_json(lexicon_model_perf, _LEXICON_MODEL_PERFORMANCE_SAVE_PATH)
else:
    lexicon_model_perf = load_json(_LEXICON_MODEL_PERFORMANCE_SAVE_PATH)

lexicon_model_stats = {}
for nsample in _LABELPROPS_ESTIMATE_NSAMPLES:
    accs = []
    deltas = []
    for source in _DATADEF.domain_names:
        source_technique_mean = np.array(
            lexicon_model_perf[str(nsample)][source]["full"]
        ).mean()
        source_base_mean = lexicon_model_perf["no_technique"][source]
        accs.append(source_technique_mean)
        deltas.append(source_technique_mean - source_base_mean)
    lexicon_model_stats[str(nsample)] = {