import pandas as pd
from pandas.core.reshape import melt
from eval_cleanups import load_data, decrange, RemoveDeviantRatings, combine_measures, BaselineCleaner
from standard_cleanup import aggregate_ratings
from noisify import *
from scipy.stats import spearmanr, norm
from progress import ProgressBar

REPEATS = 100

heads, mods, whole, assoc = load_data()
concatted = pd.concat([heads, mods], ignore_index=True)
agg_concat_orig = combine_measures(aggregate_ratings(concatted))['mean']
agg_whole_orig = aggregate_ratings(whole)['mean']

# ------ RANDOMIZE BY ZSCORES TEST --------

def mass_outside(zstar):
    # two-tailed probability mass outside +/-zstar under a standard normal
    return 2 * norm().cdf(-abs(zstar))

output = []

# first zscores
ZSCORES = [None] + decrange(4.0, 1.0, -0.25)
NOISES = [0.0, 0.01, 0.05, 0.1, 0.25, 0.5]

pb = ProgressBar(len(ZSCORES) * REPEATS * len(NOISES))
pb.errput()

for zscore in ZSCORES:
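# Illustration (a standalone aside, not part of the script above): mass_outside(z)
# is the two-tailed tail probability of a standard normal, i.e. the expected
# fraction of ratings flagged when filtering at |z-score| > z. Restated here so
# the example runs on its own.
from scipy.stats import norm

def _mass_outside(zstar):
    return 2 * norm().cdf(-abs(zstar))

assert abs(_mass_outside(1.96) - 0.05) < 1e-3    # ~5% of mass lies outside +/-1.96
assert abs(_mass_outside(3.00) - 0.0027) < 1e-4  # ~0.27% outside +/-3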
    # keep only compounds that occur in all four judgement/association sets
    good_compounds = reduce(
        set.intersection,
        [set(x.compound) for x in [heads, mods, whole, assoc]])
    heads, mods, whole, assoc = [
        d[d.compound.map(good_compounds.__contains__)]
        for d in [heads, mods, whole, assoc]
    ]
    return heads, mods, whole, assoc


if __name__ == '__main__':
    # go ahead and sort the whole judgements and assoc measures
    heads, mods, whole_orig, assoc = load_data()
    whole = aggregate_ratings(whole_orig)
    concatted = pd.concat([heads, mods], ignore_index=True)
    concatted_uncleaned_together = combine_measures(
        aggregate_ratings(concatted), 'prod').sort('compound')

    # cleaning setups to evaluate
    setups = []
    setups += [BaselineCleaner()]
    setups += [RemoveDeviantSubjectCleaner(r) for r in decrange(0.10, 0.6, 0.05)]
    setups += [RemoveDeviantRatings(z) for z in decrange(1.0, 4.0, 0.25)]
    #setups += [RebinCleaner(b) for b in ["1144477","1444447","1114777","1122233","1222223","1112333"]]
    setups += create_svd_cleaners(20)
    #setups += [FillCleaner(0), FillCleaner(1), FillCleaner(7)]

    results = []
    parameters = set()
    CONCAT_BEFORE = True
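# A minimal sketch of a decimal-stepped range helper, consistent with how
# decrange is called above (inclusive endpoints, fractional or negative steps);
# the actual decrange lives in eval_cleanups and may differ.
from decimal import Decimal

def decrange_sketch(start, stop, step):
    """Return floats from start to stop (inclusive) in increments of step."""
    start, stop, step = (Decimal(str(v)) for v in (start, stop, step))
    out = []
    x = start
    while (step > 0 and x <= stop) or (step < 0 and x >= stop):
        out.append(float(x))
        x += step
    return out

# e.g. decrange_sketch(1.0, 4.0, 0.25)  -> [1.0, 1.25, ..., 4.0]
#      decrange_sketch(4.0, 1.0, -0.25) -> [4.0, 3.75, ..., 1.0]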