Example No. 1
def model_training_diff(model='glove', benchmark='Pereira2018-encoding'):
    trained = score(benchmark=benchmark, model=model)
    untrained = score(benchmark=benchmark, model=f'{model}-untrained')
    # get per-subject scores; the ceiling can be ignored since both models use the same benchmark
    trained, untrained = trained.raw.raw, untrained.raw.raw
    trained = trained.groupby('subject').median()
    untrained = untrained.groupby('subject').median()
    # test difference
    t, p = ttest_ind(trained, untrained)
    logger.info(f"{model} difference on {benchmark}: t={t}, p={p}")
Example No. 2
    def test_dummy_model(self, num_layers):
        model_layers = [f'dummylayer{i}' for i in range(num_layers)]

        class DummyModel:
            def __call__(self, stimuli, layers):
                assert len(layers) == 1 and layers[0] in model_layers
                assembly = NeuroidAssembly(
                    np.ones([len(stimuli), 1]),
                    coords={
                        'stimulus_sentence': ('presentation', stimuli['sentence']),
                        'dummy': ('presentation', ['dummy'] * len(stimuli)),
                        'neuroid_id': ('neuroid', [model_layers.index(layers[0])]),
                        'layer': ('neuroid', layers),
                    },
                    dims=['presentation', 'neuroid'])
                assembly = attach_stimulus_set_meta(assembly, stimuli)
                return assembly

        s = score(model=f'dummy-{num_layers}layers',
                  model_impl=DummyModel(),
                  layers=model_layers,
                  benchmark='Pereira2018-encoding-min')
        assert s.sel(aggregation='center') == approx(0, abs=.5)
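NeuroidAssembly here follows the xarray DataArray pattern; below is a rough standalone sketch of the same presentation-by-neuroid layout built with plain xarray (the sentences are made up):

import numpy as np
import xarray as xr

sentences = ['The dog ran.', 'A bird sang.']  # hypothetical stimuli
assembly = xr.DataArray(
    np.ones([len(sentences), 1]),
    coords={
        'stimulus_sentence': ('presentation', sentences),
        'neuroid_id': ('neuroid', [0]),
        'layer': ('neuroid', ['dummylayer0']),
    },
    dims=['presentation', 'neuroid'])
print(assembly)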
Example No. 3
def scores_per_region_and_model(
        models=('bert', 'bert-untrained'), benchmark='Pereira2018-encoding'):
    for model in models:
        model_scores = score(benchmark=benchmark, model=model)
        model_scores = model_scores.sel(aggregation='center').mean('experiment')

        render_mpl_table(model_scores, header_columns=0, col_width=2.0)
        pyplot.tight_layout()
        pyplot.savefig(Path(__file__).parent / f"table-regions-{model}.png")
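A hedged usage sketch, assuming cached scores exist for the default model identifiers shown in the signature above:

# renders one region table per model and saves it next to this script
scores_per_region_and_model(models=('bert', 'bert-untrained'), benchmark='Pereira2018-encoding')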
Example No. 4
def print_Pereira2018():
    benchmark_identifier = 'Pereira2018-encoding'
    score_object = score(model='bert-base-uncased', benchmark=benchmark_identifier)
    score_object.attrs['raw'] = score_object.raw.raw  # language only
    print(f"## {benchmark_identifier} ##")
    print(
        f"  score\n"
        f"    - {len(score_object.raw['neuroid'])} neuroids"
    )
    _print_assembly_info(benchmark_identifier=benchmark_identifier)
    _print_stimulus_info(benchmark_identifier=benchmark_identifier)

    # num voxels
    for atlas in ['MD', 'DMN']:
        benchmark = benchmark_pool[benchmark_identifier]
        assembly = benchmark._target_assembly
        atlas_assembly = assembly.sel(atlas=atlas)
        # some subjects have NaNs for one of the experiments; summing over presentations removes those
        subject_assembly = atlas_assembly.sum('presentation')
        subject_assembly = subject_assembly.groupby('subject').count('neuroid')
        mean, std = subject_assembly.mean().values, subject_assembly.std().values
        print(
            f"  {atlas}: {len(atlas_assembly['neuroid'])} voxels ({mean:.0f}+-{std:.1f})"
        )
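The per-subject voxel count is essentially a group-by over a subject coordinate; a small pandas sketch of the same aggregation with made-up subject labels:

import pandas as pd

voxel_subjects = pd.Series(['sub-1', 'sub-1', 'sub-2', 'sub-3', 'sub-3', 'sub-3'])  # hypothetical voxel-to-subject assignment
voxels_per_subject = voxel_subjects.value_counts()
print(f"{voxels_per_subject.mean():.0f}+-{voxels_per_subject.std():.1f} voxels per subject")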
Example No. 5
def collect_scores(benchmark, models, normalize=True, score_hook=None):
    store_file = Path(__file__).parent / f"scores-{benchmark}-{'normalized' if normalize else 'raw'}" \
                                         f"{'-hook' if score_hook else ''}.csv"
    stored = False
    if store_file.is_file():
        data = pd.read_csv(store_file)
        data = data[data['model'].isin(models)]
        stored = True
    if not stored and benchmark.startswith('overall'):
        metric = ('-' + benchmark.split('-')[-1]) if '-' in benchmark else ''
        if benchmark.startswith('overall_neural'):
            parts = overall_neural_benchmarks
        elif benchmark == 'overall_glue':
            parts = glue_benchmarks
        else:
            parts = overall_benchmarks
        data = [collect_scores(benchmark=f"{part}{metric}", models=models, normalize=normalize)
                for part in parts]
        data = reduce(lambda left, right: pd.concat([left, right]), data)
        data = average_adjacent(data)
        # choose the best layer per benchmark before averaging across benchmarks
        data = choose_best_scores(data).dropna()
        # scores above 100% make no sense and would skew the averaging
        data.loc[data['score'] >= 1, 'score'] = 1
        data = data.groupby(['model']).mean().reset_index()  # mean across benchmarks per model
        data['layer'] = 'combined'
        data['benchmark'] = benchmark
    elif not stored:
        data, missing_models = [], []
        previous_resultcaching_cachedonly = os.getenv('RESULTCACHING_CACHEDONLY', '0')
        os.environ['RESULTCACHING_CACHEDONLY'] = '1'
        for model in tqdm(models, desc='model scores'):
            try:
                model_scores = score(benchmark=benchmark, model=model)
                if score_hook:
                    model_scores = score_hook(model_scores)
            except NotCachedError:
                missing_models.append(model)
                continue
            if not normalize:
                model_scores = model_scores.raw
            adjunct_columns = list(set(model_scores.dims) - {'aggregation'})
            for adjunct_values in itertools.product(
                    *[model_scores[column].values for column in adjunct_columns]):
                adjunct_values = dict(zip(adjunct_columns, adjunct_values))
                current_score = model_scores.sel(**adjunct_values)
                center, error = get_score_center_err(current_score)
                data.append({
                    **adjunct_values,
                    'benchmark': benchmark,
                    'model': model,
                    'score': center,
                    'error': error,
                })
        if missing_models:
            logger.warning(
                f"No score cached for {len(missing_models)} models {missing_models} on benchmark {benchmark}"
            )
        data = pd.DataFrame(data)
        if benchmark.startswith(('wikitext', 'glue')):
            data['layer'] = -1
            data['error'] = 0  # nans will otherwise be dropped later on
        if benchmark.startswith('wikitext'):
            data = data[data['measure'] == 'test_loss']
        if benchmark == 'glue-mnli':  # only consider mnli (not mnli-mm)
            data = data[data['eval_task'] == 'mnli']
        os.environ['RESULTCACHING_CACHEDONLY'] = previous_resultcaching_cachedonly
    non_overlap = list(set(data['model']).symmetric_difference(set(models)))
    if len(non_overlap) > 0:
        logger.warning(
            f"Non-overlapping identifiers in {benchmark}: {sorted(non_overlap)}"
        )
    if not stored:
        data.to_csv(store_file, index=False)
    return data
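A hedged usage sketch, assuming scores for the listed model identifiers are already in the result cache (the identifiers are examples only):

scores = collect_scores(benchmark='Pereira2018-encoding', models=['bert', 'bert-untrained'])
print(scores[['model', 'benchmark', 'score', 'error']].head())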
Example No. 6
    def test_story_model(self, model, stimulus_set):
        scores = score(
            model=model,
            stimulus_set='naturalistic-neural-reduced.{}'.format(stimulus_set))
        assert 'region' in scores.aggregation
        assert scores.aggregation.sel(aggregation='center').shape[0] == 44
Example No. 7
def _print_score_info(benchmark_identifier, neuroid_dim='neuroid'):
    score_object = score(model='bert-base-uncased', benchmark=benchmark_identifier)
    print(
        f"  score\n"
        f"    - {len(score_object.raw[neuroid_dim])} neuroids"
    )