def model_training_diff(model='glove', benchmark='Pereira2018-encoding'):
    trained = score(benchmark=benchmark, model=model)
    untrained = score(benchmark=benchmark, model=f'{model}-untrained')
    # get per-subject scores. No need to worry about the ceiling because it is identical (same benchmark)
    trained, untrained = trained.raw.raw, untrained.raw.raw
    trained, untrained = trained.groupby('subject').median(), untrained.groupby('subject').median()
    # test the trained-vs-untrained difference
    t, p = ttest_ind(trained, untrained)
    logger.info(f"{model} difference on {benchmark}: t={t}, p={p}")

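# Usage sketch (illustrative, not part of the original analysis code): how one might run
# model_training_diff over several models. Assumes the same module-level `score` and `logger`
# used above; the model identifiers are only examples.
def run_training_diffs(models=('glove', 'bert'), benchmark='Pereira2018-encoding'):
    for model in models:
        # each call logs the t-statistic and p-value of trained vs. untrained per-subject medians
        model_training_diff(model=model, benchmark=benchmark)
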
def test_dummy_model(self, num_layers):
    model_layers = [f'dummylayer{i}' for i in range(num_layers)]

    class DummyModel:
        def __call__(self, stimuli, layers):
            # the candidate model is queried one layer at a time
            assert len(layers) == 1 and layers[0] in model_layers
            assembly = NeuroidAssembly(
                np.ones([len(stimuli), 1]),
                coords={
                    'stimulus_sentence': ('presentation', stimuli['sentence']),
                    'dummy': ('presentation', ['dummy'] * len(stimuli)),
                    'neuroid_id': ('neuroid', [model_layers.index(layers[0])]),
                    'layer': ('neuroid', layers),
                },
                dims=['presentation', 'neuroid'])
            assembly = attach_stimulus_set_meta(assembly, stimuli)
            return assembly

    s = score(model=f'dummy-{num_layers}layers', model_impl=DummyModel(), layers=model_layers,
              benchmark='Pereira2018-encoding-min')
    # constant activations should not predict neural data beyond chance
    assert s.sel(aggregation='center') == approx(0, abs=.5)

def scores_per_region_and_model(models=('bert', 'bert-untrained'), benchmark='Pereira2018-encoding'):
    for model in models:
        model_scores = score(benchmark=benchmark, model=model)
        model_scores = model_scores.sel(aggregation='center').mean('experiment')
        render_mpl_table(model_scores, header_columns=0, col_width=2.0)
        pyplot.tight_layout()
        pyplot.savefig(Path(__file__).parent / f"table-regions-{model}.png")

def print_Pereira2018():
    benchmark_identifier = 'Pereira2018-encoding'
    score_object = score(model='bert-base-uncased', benchmark=benchmark_identifier)
    score_object.attrs['raw'] = score_object.raw.raw  # language only
    print(f"## {benchmark_identifier} ##")
    print(f"  score\n"
          f"  - {len(score_object.raw['neuroid'])} neuroids")
    _print_assembly_info(benchmark_identifier=benchmark_identifier)
    _print_stimulus_info(benchmark_identifier=benchmark_identifier)
    # number of voxels per atlas
    benchmark = benchmark_pool[benchmark_identifier]
    assembly = benchmark._target_assembly
    for atlas in ['MD', 'DMN']:
        atlas_assembly = assembly.sel(atlas=atlas)
        # some subjects have nans for either experiment; the sum gets rid of those
        subject_assembly = atlas_assembly.sum('presentation')
        subject_assembly = subject_assembly.groupby('subject').count('neuroid')
        mean, std = subject_assembly.mean().values, subject_assembly.std().values
        print(f"  {atlas}: {len(atlas_assembly['neuroid'])} voxels ({mean:.0f}+-{std:.1f})")

def collect_scores(benchmark, models, normalize=True, score_hook=None):
    store_file = Path(__file__).parent / f"scores-{benchmark}-{'normalized' if normalize else 'raw'}" \
                                         f"{'-hook' if score_hook else ''}.csv"
    stored = False
    if store_file.is_file():
        data = pd.read_csv(store_file)
        data = data[data['model'].isin(models)]
        stored = True
    if not stored and benchmark.startswith('overall'):
        metric = ('-' + benchmark.split('-')[-1]) if '-' in benchmark else ''
        data = [collect_scores(benchmark=f"{part}{metric}", models=models, normalize=normalize)
                for part in (overall_neural_benchmarks if benchmark.startswith('overall_neural')
                             else glue_benchmarks if benchmark == "overall_glue"
                             else overall_benchmarks)]
        data = reduce(lambda left, right: pd.concat([left, right]), data)
        data = average_adjacent(data)
        data = choose_best_scores(data).dropna()  # need to choose best layer per benchmark here before averaging
        data.loc[data['score'] >= 1, 'score'] = 1  # more than 100% makes no sense and might skew averaging
        data = data.groupby(['model']).mean().reset_index()  # mean across benchmarks per model-layer
        data['layer'] = 'combined'
        data['benchmark'] = benchmark
    elif not stored:
        data, missing_models = [], []
        previous_resultcaching_cachedonly = os.getenv('RESULTCACHING_CACHEDONLY', '0')
        os.environ['RESULTCACHING_CACHEDONLY'] = '1'
        for model in tqdm(models, desc='model scores'):
            try:
                model_scores = score(benchmark=benchmark, model=model)
                if score_hook:
                    model_scores = score_hook(model_scores)
            except NotCachedError:
                missing_models.append(model)
                continue
            if not normalize:
                model_scores = model_scores.raw
            adjunct_columns = list(set(model_scores.dims) - {'aggregation'})
            for adjunct_values in itertools.product(*[model_scores[column].values for column in adjunct_columns]):
                adjunct_values = dict(zip(adjunct_columns, adjunct_values))
                current_score = model_scores.sel(**adjunct_values)
                center, error = get_score_center_err(current_score)
                data.append({**adjunct_values,
                             **{'benchmark': benchmark, 'model': model, 'score': center, 'error': error}})
        if missing_models:
            logger.warning(f"No score cached for {len(missing_models)} models {missing_models} "
                           f"on benchmark {benchmark}")
        data = pd.DataFrame(data)
        if any(benchmark.startswith(performance_benchmark) for performance_benchmark in ['wikitext', 'glue']):
            data['layer'] = -1
            data['error'] = 0  # nans will otherwise be dropped later on
        if benchmark.startswith('wikitext'):
            data = data[data['measure'] == 'test_loss']
        if benchmark == 'glue-mnli':  # only consider mnli (not mnli-mm)
            data = data[data['eval_task'] == 'mnli']
        os.environ['RESULTCACHING_CACHEDONLY'] = previous_resultcaching_cachedonly
    non_overlap = list(set(data['model']).symmetric_difference(set(models)))
    if len(non_overlap) > 0:
        logger.warning(f"Non-overlapping identifiers in {benchmark}: {sorted(non_overlap)}")
    if not stored:
        data.to_csv(store_file, index=False)
    return data

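# Usage sketch (assumption: scores for these model identifiers are already cached, since
# collect_scores sets RESULTCACHING_CACHEDONLY='1'; the identifiers follow the ones used above):
def example_collect_scores():
    data = collect_scores(benchmark='Pereira2018-encoding', models=['bert', 'bert-untrained'])
    # `data` is a pandas DataFrame with one row per model and adjunct dimension value (e.g. layer),
    # holding 'score' (center) and 'error' columns; results are also written next to this file as
    # scores-Pereira2018-encoding-normalized.csv and reused on subsequent calls.
    return data
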
def test_story_model(self, model, stimulus_set):
    scores = score(model=model, stimulus_set='naturalistic-neural-reduced.{}'.format(stimulus_set))
    assert 'region' in scores.aggregation
    assert scores.aggregation.sel(aggregation='center').shape[0] == 44

def _print_score_info(benchmark_identifier, neuroid_dim='neuroid'):
    score_object = score(model='bert-base-uncased', benchmark=benchmark_identifier)
    print(f"  score\n"
          f"  - {len(score_object.raw[neuroid_dim])} neuroids")