def load_labelprops(split):
    if split == "valid":
        split = "train"  # kfold valid and train are the same set
    return {
        issue: np.array(labelprops)
        for issue, labelprops in load_json(
            join(_LABELPROPS_DIR, f"{split}.json")
        ).items()
    }
def load_all_arxiv_abstract_samples(
    categories: List[str], split: str
) -> List[DataSample]:
    assert split in ["train", "valid", "test"]

    samples = []
    for c in tqdm(categories):
        ids = load_json(join(DATA_DIR, "arxiv", "splits", f"{c}.{split}.json"))
        raw_data = load_json(join(DATA_DIR, "arxiv", f"{c}.json"))
        for id in ids:
            samples.append(
                DataSample(
                    id=id,
                    text=raw_data[id]["abstract"],
                    y_idx=year2yidx(raw_data[id]["year"]),
                    domain_name=c,
                    domain_idx=ARXIV_CATEGORY2IDX[c],
                )
            )
    return samples
def load_all_amazon_review_samples(
    categories: List[str], split: str
) -> List[DataSample]:
    assert split in ["train", "valid", "test"]

    samples = []
    for c in tqdm(categories):
        ids = load_json(
            join(DATA_DIR, "amazon_subsampled", "splits", f"{c}.{split}.json")
        )
        raw_data = load_json(join(DATA_DIR, "amazon_subsampled", f"{c}.json"))
        for id in ids:
            samples.append(
                DataSample(
                    id=id,
                    text=raw_data[id]["reviewText"],
                    # rating=raw_data[id]["overall"],
                    y_idx=rating_to_ridx(raw_data[id]["overall"]),
                    domain_name=c,
                    domain_idx=CATEGORY2CIDX[c],
                )
            )
    return samples
def load_metrics(model_root):
    nsample2acc = {}
    nsample2accs = {}
    metrics = load_json(join(model_root, "mean_metrics.json"))
    for nsample in ROBERTA_ADAPT_N_SAMPLES:
        accs = []
        for source in _DATADEF.domain_names:
            accs.append(
                metrics[f"{nsample:04}_samples"][source]["mean"]["valid_f1.best"]
            )
        nsample2accs[nsample] = np.array(accs)
        nsample2acc[nsample] = np.array(accs).mean()
    return nsample2acc, nsample2accs
def load_all_framing_samples(
    issues: List[str], split: str, task: str
) -> List[DataSample]:
    assert split in ["train", "test"]

    samples = []
    for issue in tqdm(issues):
        ids = load_json(
            join(DATA_DIR, "framing_labeled", f"{issue}_{split}_sets.json")
        )[task]
        raw_data = load_json(
            join(DATA_DIR, "framing_labeled", f"{issue}_labeled.json")
        )
        for id in ids:
            samples.append(
                DataSample(
                    id=id,
                    text=remove_framing_text_headings(raw_data[id]["text"]),
                    y_idx=code_to_yidx(raw_data[id][task], task),
                    domain_name=issue,
                    domain_idx=ISSUE2IIDX[issue],
                )
            )
    return samples
def load_kfold_framing_samples(
    issues: List[str], task: str
) -> List[Dict[str, List[DataSample]]]:
    kidx2split2samples = [{"train": [], "valid": []} for _ in range(KFOLD)]

    samples = load_all_framing_samples(issues, split="train", task=task)
    for issue in tqdm(issues):
        kfold_data = load_json(
            join(DATA_DIR, "framing_labeled", f"{KFOLD}fold", f"{issue}.json")
        )
        for kidx, fold in enumerate(kfold_data[task]):
            for split in ["train", "valid"]:
                ids = set(fold[split])
                selected_samples = [s for s in samples if s.id in ids]
                kidx2split2samples[kidx][split].extend(selected_samples)
    return kidx2split2samples
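# Usage sketch (not in the original file; assumes ISSUES holds the framing
# issue names from the datadef, as in the split-building script below):
# iterate the returned folds and report per-split sample counts.
if __name__ == "__main__":
    fold_data = load_kfold_framing_samples(ISSUES, task="primary_frame")
    for kidx, split2samples in enumerate(fold_data):
        print(
            f"fold {kidx}: {len(split2samples['train'])} train, "
            f"{len(split2samples['valid'])} valid"
        )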
holdout_adapt_nsample2acc, holdout_adapt_nsample2accs = load_metrics(
    _HOLDOUT_ADAPT_MODEL_ROOT
)
from_scratch_nsample2acc, from_scratch_nsample2accs = load_metrics(
    _FROM_SCRATCH_MODEL_ROOT
)

holdout_source_metrics = load_json(
    join(_HOLDOUT_SOUCE_MODEL_ROOT, "mean_metrics.json")
)
holdout_source_acc = np.array(
    [
        holdout_source_metrics[source]["mean"]["valid_f1.best"]
        for source in _DATADEF.domain_names
    ]
).mean()

_PLOT_SAVE_PATH = join(_SAVE_DIR, f"{_ROBERTA_ARCH}.png")

plt.clf()
plt.figure(figsize=(7, 5))
plt.plot(
    ROBERTA_ADAPT_N_SAMPLES,
    [holdout_adapt_nsample2acc[nsample] for nsample in ROBERTA_ADAPT_N_SAMPLES],
    marker="D",
import glob
from os import makedirs
from os.path import basename, join, splitext
from random import Random

import pandas as pd
from config import DATA_DIR, RANDOM_SEED
from modapt.utils import ParallelHandler, load_json, save_json
from tqdm import tqdm

_TRAIN_PROP, _VALID_PROP, _TEST_PROP = [0.8, 0.1, 0.1]

_SRC_DATA_DIR = join(DATA_DIR, "sentiment")
_SPLITS_DIR = join(_SRC_DATA_DIR, "splits")

RNG = Random()
RNG.seed(RANDOM_SEED)

makedirs(_SPLITS_DIR, exist_ok=True)

raw_data_paths = sorted(glob.glob(join(_SRC_DATA_DIR, "*.json")))
for p in raw_data_paths:
    name = splitext(basename(p))[0]
    samples = load_json(p)
    ids = list(samples.keys())
    RNG.shuffle(ids)

    nsample = len(ids)
    n_train = int(nsample * _TRAIN_PROP)
    n_valid = int(nsample * _VALID_PROP)
    n_test = nsample - n_train - n_valid

    save_json(ids[:n_train], join(_SPLITS_DIR, f"{name}.train.json"))
    save_json(ids[n_train:n_train + n_valid], join(_SPLITS_DIR, f"{name}.valid.json"))
    save_json(ids[-n_test:], join(_SPLITS_DIR, f"{name}.test.json"))
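# Sanity-check sketch (not in the original script): confirm the three saved
# splits for each dataset are pairwise disjoint and together cover every id.
for p in raw_data_paths:
    name = splitext(basename(p))[0]
    train = set(load_json(join(_SPLITS_DIR, f"{name}.train.json")))
    valid = set(load_json(join(_SPLITS_DIR, f"{name}.valid.json")))
    test = set(load_json(join(_SPLITS_DIR, f"{name}.test.json")))
    assert not (train & valid) and not (train & test) and not (valid & test)
    assert len(train | valid | test) == len(load_json(p))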
from os.path import join

import pandas as pd
from config import DATA_DIR
from experiments.datadef import zoo
from modapt.utils import load_json, save_json

FRAMING_DATA_DIR = join(DATA_DIR, "framing_labeled")
ISSUES = zoo.get_datadef("framing").domain_names

if __name__ == "__main__":
    stats = []
    for issue in ISSUES:
        print(">>", issue)
        data = load_json(join(FRAMING_DATA_DIR, f"{issue}_labeled.json"))
        ids = list(data.keys())

        testsets = load_json(join(FRAMING_DATA_DIR, f"{issue}_test_sets.json"))
        testsets = {setname: set(ids) for setname, ids in testsets.items()}

        trainsets = {}

        # relevance train set: any sample not in the relevance test set
        trainsets["relevance"] = list({
            id
            for id in data
            if (id in ids and id not in testsets["relevance"])
        })

        # primary frame train set: any sample not in the primary frame test
        # set that has a non-null primary frame
        trainsets["primary_frame"] = list({
            id
if arch.startswith("roberta"): valid_roberta_model(arch) if arch.startswith("logreg"): valid_logreg_model(arch) _OUTPUT_SAVE_DIR = join(OUTPUT_DIR, "power_analysis") makedirs(_OUTPUT_SAVE_DIR, exist_ok=True) # build tables holdout_source_to_table = {} fulltable = np.zeros((2, 2)) for holdout_source in _DATADEF.domain_names: table = np.zeros((2, 2)) arch1_preds = load_json( join( _get_model_dir(_ARCH1), f"{_VALID_OR_TEST}_preds", f"{holdout_source}.json", )) arch2_preds = load_json( join( _get_model_dir(_ARCH2), f"{_VALID_OR_TEST}_preds", f"{holdout_source}.json", )) ids = list(arch1_preds.keys()) for id in ids: arch1_correct = arch1_preds[id]["correct"] arch2_correct = arch2_preds[id]["correct"] if arch1_correct and arch2_correct: table[0][0] += 1 fulltable[0][0] += 1
def load_labelprops(split):
    return {
        issue: np.array(labelprops)
        for issue, labelprops in load_json(
            join(_LABELPROPS_DIR, f"{split}.json")
        ).items()
    }
    )
    for source in _DATADEF.domain_names
])

for datasetname in _DATASETS:
    rows = {}
    _DATADEF = get_datadef(datasetname)
    source2labelprops = _DATADEF.load_labelprops_func(
        "train" if "test" not in _METRIC_FILENAME else "test"
    )

    # accuracy if we naively predict each source's most common label
    most_common_acc = np.array(
        [a.max() for a in source2labelprops.values()]
    ).mean()
    rows["most_common"] = {"acc": round(most_common_acc, 3), "delta_std": "-"}

    lexicon_metrics = load_json(
        join(LEXICON_DIR, datasetname, _EXP_NAME, _METRIC_FILENAME)
    )
    lexicon_base_accs = np.array([
        lexicon_metrics[_LEXICON_BASE_ARCH][source]["mean"]["valid_f1"]
        for source in _DATADEF.domain_names
    ])
    rows[_LEXICON_BASE_ARCH] = {
        "acc": round(lexicon_base_accs.mean(), 3),
        "delta_std": "-",
    }

    for arch in _LEXICON_ARCHS:
        accs = np.array([
            lexicon_metrics[arch][source]["mean"]["valid_f1"]
            for source in _DATADEF.domain_names
        ])
        delta = accs - lexicon_base_accs
        rows[arch] = {
    return [[firsthalf, secondhalf], [secondhalf, firsthalf]]

# load samples, shuffle once, use this seeded shuffle order for all evals
source2samples = {}
for source in _DATADEF.domain_names:
    source2samples[source] = _DATADEF.load_splits_func([source], ["train"])["train"]
    _RNG.shuffle(source2samples[source])

# lexicon model predicting with gt & estimated labelprops
_LEXICON_MODEL_PERFORMANCE_SAVE_PATH = join(_SAVE_DIR, f"{_LEXICON_ARCH}.json")

if not exists(_LEXICON_MODEL_PERFORMANCE_SAVE_PATH):
    orig_metrics = load_json(join(_LEXICON_MODEL_ROOT, "mean_metrics.json"))
    gt_source2acc = {
        source: orig_metrics[source]["mean"]["valid_f1"]
        for source in _DATADEF.domain_names
    }
    # compute aggregate stats before adding them to the dict, so the std
    # does not include the just-added mean entry
    _accs = np.array(list(gt_source2acc.values()))
    gt_source2acc["mean"] = _accs.mean()
    gt_source2acc["std"] = _accs.std()

    notechnique_metrics = load_json(
        join(
            LEXICON_DIR, _DATASET_NAME, "holdout_source", "logreg", "mean_metrics.json"
        )
    )
    notechnique_source2acc = {
        source: notechnique_metrics[source]["mean"]["valid_f1"]
        for source in _DATADEF.domain_names