Пример #1
0
def read_words(candidate, stimulus_set, reset_column='sentence_id', copy_columns=(), average_sentence=False):
    """
    Pass a `stimulus_set` through a model `candidate`.
    In contrast to the `listen_to` function, this function operates on a word-based `stimulus_set`.

    :param candidate: callable invoked as `candidate(stimuli=..., average_sentence=...)` once per group of words.
    :param stimulus_set: table with one row per word; needs a `word` column, the `reset_column`,
        every column listed in `copy_columns`, and a `name` attribute.
    :param reset_column: column whose values delimit the groups of words; the words of each group are joined
        with spaces into one sentence that is passed to `candidate` on its own.
    :param copy_columns: columns of `stimulus_set` that are attached to each group's activations
        along the `presentation` dimension.
    :param average_sentence: forwarded unchanged to `candidate`.
    :return: the merged activations of all groups, re-ordered along `presentation` to follow the order
        in which the groups were processed.
    """
    activations = []
    for reset_id in ordered_set(stimulus_set[reset_column].values):
        part_stimuli = stimulus_set[stimulus_set[reset_column] == reset_id]
        sentence_stimuli = StimulusSet({'sentence': ' '.join(part_stimuli['word']),
                                        reset_column: list(set(part_stimuli[reset_column]))})
        sentence_stimuli.name = f"{stimulus_set.name}-{reset_id}"
        sentence_activations = candidate(stimuli=sentence_stimuli, average_sentence=average_sentence)
        for column in copy_columns:
            sentence_activations[column] = ('presentation', part_stimuli[column])
        activations.append(sentence_activations)
    model_activations = merge_data_arrays(activations)
    # merging does not maintain stimulus order. the following orders again.
    # The merged id list is materialized once rather than once per stimulus.
    merged_stimulus_ids = model_activations['stimulus_id'].values.tolist()
    idx = [merged_stimulus_ids.index(stimulus_id) for stimulus_id in
           itertools.chain.from_iterable(s['stimulus_id'].values for s in activations)]
    assert len(set(idx)) == len(idx), "Found duplicate indices to order activations"
    model_activations = model_activations[{'presentation': idx}]

    return model_activations
Пример #2
0
 def from_paths(self, *args, **kwargs):
     """
     Run the parent extractor and turn its time-suffixed layers into an explicit `time_step` dimension.

     Layer names are parsed as `<layer>-t<timestep>`. Each (region, timestep) slice is wrapped into its own
     assembly with a length-one `time_step` dimension, all slices are merged, and `neuroid_id` is rebuilt
     from `model`, `region` and `neuroid_num` so that it no longer carries the timestep.
     """
     raw_activations = super(TemporalExtractor, self).from_paths(*args, **kwargs)
     # introduce time dimension: first group the time-suffixed layers by region
     layer_pattern = r'(([^-]*)\..*|logits|avgpool)-t([0-9]+)'
     regions = defaultdict(list)
     for layer in set(raw_activations['layer'].values):
         match = re.match(layer_pattern, layer)
         stripped_layer, prefix, timestep = match.group(1), match.group(2), match.group(3)
         # layers without a dotted prefix (logits, avgpool) act as their own region
         region = prefix if prefix else stripped_layer
         regions[region].append((layer, stripped_layer, timestep))
     activations = {}
     for region, time_layers in regions.items():
         for full_layer, stripped_layer, timestep in time_layers:
             layer_activations = raw_activations.sel(layer=full_layer)
             layer_labels = [stripped_layer] * len(layer_activations['neuroid'])
             layer_activations['layer'] = 'neuroid', layer_labels
             activations[(region, timestep)] = layer_activations
     for (region, timestep), key_activations in activations.items():
         region_labels = [region] * len(key_activations['neuroid'])
         key_activations['region'] = 'neuroid', region_labels
         # neuroid_id is left out here: otherwise, neuroid dim will be as large as before with nans
         coords = {coord: (dims, values) for coord, dims, values in walk_coords(key_activations)
                   if coord != 'neuroid_id'}
         coords['time_step'] = [int(timestep)]
         time_dims = ['time_step'] + list(key_activations.dims)
         activations[(region, timestep)] = NeuroidAssembly([key_activations.values], coords=coords, dims=time_dims)
     merged = merge_data_arrays(list(activations.values()))
     # rebuild neuroid_id without timestep
     id_columns = [merged[coord].values for coord in ['model', 'region', 'neuroid_num']]
     neuroid_id = [".".join([f"{part}" for part in parts]) for parts in zip(*id_columns)]
     merged['neuroid_id'] = 'neuroid', neuroid_id
     return merged
Пример #3
0
def load_time_meta():
    """
    Load the word timings of every TextGrid file in the stories directory.

    For each file, the `words` tier is read into a `DataArray` of words along a `time_bin` dimension,
    annotated with the file name, the story looked up in `stories_meta` by the numeric file stem,
    and the start/end time of each word. The per-file arrays are merged into one.
    """
    data_dir = neural_data_dir / 'StoriesData_Dec2018' / 'stories_textgridsbyJeanne'
    time_to_words = []
    for file in data_dir.glob("*TextGrid*"):
        textgrid = TextGrid.load(file)
        word_tier = [tier for tier in textgrid.tiers if tier.nameid == 'words'][0]
        starts, ends, tokens = [], [], []
        for time_start, time_end, word in word_tier.simple_transcript:
            starts.append(float(time_start))
            ends.append(float(time_end))
            tokens.append(word)
        story_index = int(file.stem)
        story_names = stories_meta.sel(number=story_index).values
        story = next(iter(set(story_names)))  # Boar was read twice
        num_words = len(tokens)
        coords = {
            'filepath': ('time_bin', [file.name] * num_words),
            'story': ('time_bin', [story] * num_words),
            'time_start': ('time_bin', starts),
            'time_end': ('time_bin', ends),
        }
        file_words = DataArray(tokens, coords=coords, dims=['time_bin'])
        gather_indexes(file_words)
        time_to_words.append(file_words)
    return merge_data_arrays(time_to_words)
Пример #4
0
 def merge(cls, *scores):
     """
     Merge the score assemblies and, on a best-effort basis, their raw values.

     The raw values are read from each score's `attrs` under `cls.RAW_VALUES_KEY`. If merging them fails,
     a warning is issued and the merged scores are returned without the raw attribute.
     """
     result = merge_data_arrays(scores)
     raws = []
     for score in scores:
         if cls.RAW_VALUES_KEY in score.attrs:
             raws.append(score.attrs[cls.RAW_VALUES_KEY])
     if not raws:
         return result
     try:
         merged_raw = merge_data_arrays(raws)
         result.attrs[cls.RAW_VALUES_KEY] = merged_raw
     except Exception as e:
         warnings.warn("failed to merge raw values: " + str(e))
     return result
Пример #5
0
def get_activations_for_sentence(model_name, layers, sentences):
    """
    Load the model `model_name` and collect its activations at `layers` for each sentence.

    Every sentence is passed to the model on its own (as a single-element list);
    the per-sentence activations are then merged into one assembly.
    """
    model = load_model(model_name)
    per_sentence = [model.get_activations([sentence], layers=layers) for sentence in sentences]
    return merge_data_arrays(per_sentence)
Пример #6
0
 def look_at_cached(self, model_identifier, stimuli_identifier, stimuli):
     """
     Retrieve model responses for `stimuli` and align the model's time steps to the recording time bins.

     `model_identifier` and `stimuli_identifier` are not used inside this body
     (presumably they serve as a cache key for a decorator that is not visible here -- TODO confirm).

     :param stimuli: passed on to `self.activations_model` together with `self.recording_layers`.
     :return: the merged responses with one entry per bin in `self.recording_time_bins`.
     :raises NotImplementedError: if the responses span more than one region.
     """
     responses = self.activations_model(stimuli,
                                        layers=self.recording_layers)
     # map time
     regions = set(responses['region'].values)
     if len(regions) > 1:
         raise NotImplementedError(
             "cannot handle more than one simultaneous region")
     region = list(regions)[0]
     # look up the (start, end) bin of every model time step; unmapped steps get (None, None)
     time_bins = [
         self.time_mapping[region][timestep]
         if timestep in self.time_mapping[region] else (None, None)
         for timestep in responses['time_step'].values
     ]
     responses['time_bin_start'] = 'time_step', [
         time_bin[0] for time_bin in time_bins
     ]
     responses['time_bin_end'] = 'time_step', [
         time_bin[1] for time_bin in time_bins
     ]
     responses = NeuroidAssembly(responses.rename({'time_step':
                                                   'time_bin'}))
     # keep only the time steps whose bin start is not NaN
     responses = responses[{
         'time_bin': [
             not np.isnan(time_start)
             for time_start in responses['time_bin_start']
         ]
     }]
     # select time
     time_responses = []
     for time_bin in tqdm(self.recording_time_bins,
                          desc='CORnet-time to recording time'):
         time_bin = time_bin if not isinstance(
             time_bin, np.ndarray) else time_bin.tolist()
         time_bin_start, time_bin_end = time_bin
         # pick the model responses whose bin start is the one returned by `find_nearest`
         # (presumably the closest value to the requested start -- TODO confirm against the helper)
         nearest_start = find_nearest(responses['time_bin_start'].values,
                                      time_bin_start)
         bin_responses = responses.sel(time_bin_start=nearest_start)
         # rebuild the assembly so that it is labelled with the recording bin rather than the model bin
         bin_responses = NeuroidAssembly(
             bin_responses.values,
             coords={
                 **{
                     coord: (dims, values)
                     for coord, dims, values in walk_coords(bin_responses) if coord not in [
                         'time_bin_level_0', 'time_bin_end'
                     ]
                 },
                 **{
                     'time_bin_start': ('time_bin', [time_bin_start]),
                     'time_bin_end': ('time_bin', [time_bin_end])
                 }
             },
             dims=bin_responses.dims)
         time_responses.append(bin_responses)
     responses = merge_data_arrays(time_responses)
     return responses
Пример #7
0
    def extrapolate_neuroid(self, ceilings):
        """
        Bootstrap an extrapolation of the ceiling from scores at increasing numbers of subjects.

        In each of `self.num_bootstraps` rounds, the scores at every subject count are resampled with
        replacement and averaged, and `self.fit` is applied to the resulting curve. Rounds where the fit
        raises a `RuntimeError` are recorded as NaN parameters. The returned score holds the median of the
        first fitted parameter as center, together with the error bounds from `ci_error` evaluated at the
        first x where the median curve increases by less than the asymptote threshold.
        """
        # figure out how many extrapolation x points we have. E.g. for Pereira, not all combinations are possible
        subject_subsamples = sorted(set(ceilings['num_subjects'].values))
        rng = RandomState(0)
        bootstrap_params = []
        for bootstrap in range(self.num_bootstraps):
            bootstrapped_scores = []
            for num_subjects in subject_subsamples:
                sub_subject_dim = f'sub_{self.subject_column}'
                # the sub_subjects dimension creates nans, get rid of those
                num_scores = ceilings.sel(num_subjects=num_subjects).dropna(sub_subject_dim)
                assert set(num_scores.dims) in ({sub_subject_dim, 'split'}, {sub_subject_dim})
                # choose from subject subsets and the splits therein, with replacement for variance
                candidates = num_scores.values.flatten()
                resampled = rng.choice(candidates, size=len(candidates), replace=True)
                bootstrapped_scores.append(np.mean(resampled))

            try:
                params = self.fit(subject_subsamples, bootstrapped_scores)
            except RuntimeError:  # optimal parameters not found
                params = [np.nan, np.nan]
            bootstrap_params.append(DataAssembly(
                [params],
                coords={'bootstrap': [bootstrap], 'param': ['v0', 'tau0']},
                dims=['bootstrap', 'param']))
        bootstrap_params = merge_data_arrays(bootstrap_params)
        # find endpoint and error
        asymptote_threshold = .0005
        interpolation_xs = np.arange(1000)
        fitted_params = [params for params in bootstrap_params.values if not np.isnan(params).any()]
        ys = np.array([v(interpolation_xs, *params) for params in fitted_params])
        median_ys = np.median(ys, axis=0)
        increases = np.diff(median_ys)
        # first x where increase smaller than threshold
        end_x = np.where(increases < asymptote_threshold)[0].min()
        # put together
        first_params = np.array(bootstrap_params)[:, 0]
        center = np.median(first_params)
        error = ci_error(ys[:, end_x], center=center)
        score = Score(
            [center] + list(error),
            coords={'aggregation': ['center', 'error_low', 'error_high']},
            dims=['aggregation'])
        score.attrs['raw'] = ceilings
        score.attrs['bootstrapped_params'] = bootstrap_params
        score.attrs['endpoint_x'] = DataAssembly(end_x)
        return score
Пример #8
0
 def predict(self, source):
     """
     Apply the wrapped regression separately to every time bin of `source`.

     Each per-bin prediction is given a length-one `time_bin` index made of
     (`time_bin_start`, `time_bin_end`), and all predictions are merged.
     """
     predictions = []
     for time_bin_start, time_bin_end in source['time_bin'].values:
         time_source = source.sel(time_bin=(time_bin_start, time_bin_end))
         prediction = self._regression.predict(time_source)
         # add the two bin coordinates as singleton dimensions, then stack them into `time_bin`
         prediction = prediction.expand_dims('time_bin_start')
         prediction = prediction.expand_dims('time_bin_end')
         prediction['time_bin_start'] = [time_bin_start]
         prediction['time_bin_end'] = [time_bin_end]
         stacked = prediction.stack(time_bin=['time_bin_start', 'time_bin_end'])
         predictions.append(stacked)
     return merge_data_arrays(predictions)
Пример #9
0
 def look_at(self, stimuli, number_of_trials=1):
     """
     Obtain the layer model's responses to `stimuli` and repeat them for every configured time bin.

     `number_of_trials` is accepted but not used in this body.
     If exactly one time bin is configured, the `time_bin` dimension is squeezed out of the result.
     """
     layer_responses = self._layer_model.look_at(stimuli)
     self._logger.debug(f'Repeating single assembly across time bins {self._time_bins}')
     time_responses = []
     for time_bin in self._time_bins:
         if isinstance(time_bin, np.ndarray):
             time_bin = time_bin.tolist()
         time_bin_start, time_bin_end = time_bin
         # the same responses are labelled with this bin's start and end
         bin_responses = layer_responses.expand_dims('time_bin_start')
         bin_responses = bin_responses.expand_dims('time_bin_end')
         bin_responses['time_bin_start'] = [time_bin_start]
         bin_responses['time_bin_end'] = [time_bin_end]
         time_responses.append(bin_responses.stack(time_bin=['time_bin_start', 'time_bin_end']))
     merged = merge_data_arrays(time_responses)
     if len(self._time_bins) == 1:
         merged = merged.squeeze('time_bin')
     return merged
Пример #10
0
def cross_correlation(prediction, target, cross, correlation):
    """
    Apply `correlation` separately for every value of the `cross` coordinate and average the results.

    `prediction` and `target` must agree on `cross`. Each per-value score is re-labelled with the
    coordinates that make up `cross`, the scores are merged, and the mean over `cross` is taken
    through `apply_aggregate`.
    """
    assert (prediction[cross] == target[cross]).all()
    coords = [coord for coord, _, _ in walk_coords(target[cross])]
    scores = []
    for cross_value in target[cross].values:
        selector = {cross: cross_value}
        _prediction = prediction.sel(**selector)
        _target = target.sel(**selector)
        cross_score = correlation(_prediction, _target)
        # re-attach the cross coordinates, which were dropped by the selection
        for coord, coord_value in zip(coords, cross_value):
            cross_score = cross_score.expand_dims(coord)
            cross_score[coord] = [coord_value]
        cross_score = cross_score.stack(**{cross: coords})
        scores.append(cross_score)
    merged = merge_data_arrays(scores)

    def mean_over_cross(score):
        return score.mean(cross)

    return apply_aggregate(mean_over_cross, merged)
Пример #11
0
def listen_to(candidate, stimulus_set, reset_column='story', average_sentence=True):
    """
    Pass a `stimulus_set` through a model `candidate`.
    Operates on a sentence-based `stimulus_set`.

    :param candidate: callable invoked as `candidate(stimuli=..., average_sentence=...)` once per group.
    :param stimulus_set: table of stimuli; needs the `reset_column` and a `name` attribute.
    :param reset_column: column whose values delimit the groups of stimuli that are passed
        to `candidate` separately.
    :param average_sentence: forwarded unchanged to `candidate`.
    :return: the merged activations of all groups, re-ordered along `presentation` to follow the order
        in which the groups were processed.
    """
    activations = []
    for story in ordered_set(stimulus_set[reset_column].values):
        story_stimuli = stimulus_set[stimulus_set[reset_column] == story]
        story_stimuli.name = f"{stimulus_set.name}-{story}"
        story_activations = candidate(stimuli=story_stimuli, average_sentence=average_sentence)
        activations.append(story_activations)
    model_activations = merge_data_arrays(activations)
    # merging does not maintain stimulus order. the following orders again.
    # The merged id list is materialized once rather than once per stimulus.
    merged_stimulus_ids = model_activations['stimulus_id'].values.tolist()
    idx = [merged_stimulus_ids.index(stimulus_id) for stimulus_id in
           itertools.chain.from_iterable(s['stimulus_id'].values for s in activations)]
    assert len(set(idx)) == len(idx), "Found duplicate indices to order activations"
    model_activations = model_activations[{'presentation': idx}]
    return model_activations
Пример #12
0
 def look_at(self, stimuli):
     """
     Obtain the layer model's responses to `stimuli` and repeat them for every configured time bin.

     If exactly one time bin is configured, the `time_bin` dimension is squeezed out of the result.
     """
     layer_responses = self._layer_model.look_at(stimuli)
     time_responses = []
     for time_bin in self._time_bins:
         if isinstance(time_bin, np.ndarray):
             time_bin = time_bin.tolist()
         time_bin_start, time_bin_end = time_bin
         # the same responses are labelled with this bin's start and end
         bin_responses = layer_responses.expand_dims('time_bin_start')
         bin_responses = bin_responses.expand_dims('time_bin_end')
         bin_responses['time_bin_start'] = [time_bin_start]
         bin_responses['time_bin_end'] = [time_bin_end]
         time_responses.append(bin_responses.stack(time_bin=['time_bin_start', 'time_bin_end']))
     merged = merge_data_arrays(time_responses)
     if len(self._time_bins) == 1:
         merged = merged.squeeze('time_bin')
     return merged
Пример #13
0
 def merge(cls, *scores, ignore_exceptions=False):
     """
     Merge the score assemblies together with their raw values.

     The raw values (stored in each score's `attrs` under `cls.RAW_VALUES_KEY`) are merged recursively
     through `Score.merge` with exceptions ignored. If anything fails while merging, the exception is
     re-raised unless `ignore_exceptions` is set, in which case a warning is issued and `None` is returned.
     """
     try:
         result = merge_data_arrays(scores)
         raws = []
         for score in scores:
             if cls.RAW_VALUES_KEY in score.attrs:
                 raws.append(score.attrs[cls.RAW_VALUES_KEY])
         if len(raws) > 0:
             result.attrs[cls.RAW_VALUES_KEY] = Score.merge(*raws, ignore_exceptions=True)
     except Exception as e:
         if not ignore_exceptions:
             raise e
         warnings.warn("failed to merge raw values: " + str(e))
         return None
     return result
Пример #14
0
def read_words(candidate, stimulus_set):
    """
    Pass a word-based `stimulus_set` through a model `candidate`, one sentence at a time.
    This is a new version of the listen_to_stories function.

    :param candidate: callable invoked as `candidate(stimuli=...)` once per sentence.
    :param stimulus_set: table with one row per word; needs a `sentence_id` column, a `word` column
        and a `name` attribute.
    :return: the merged activations of all sentences, re-ordered along `presentation` to follow the order
        in which the sentences were processed.

    NOTE: `stimulus_id` and `sentence_id` are assigned with a hard-coded length of 8, i.e. the activations of
    every sentence are expected to have exactly 8 entries along `presentation`.
    """
    activations = []
    for i, sentence_id in enumerate(ordered_set(stimulus_set['sentence_id'].values)):
        sentence_stimuli = stimulus_set[stimulus_set['sentence_id'] == sentence_id]
        sentence_stimuli = StimulusSet({'sentence': ' '.join(sentence_stimuli['word']),
                                        'sentence_id': list(set(sentence_stimuli['sentence_id']))})
        sentence_stimuli.name = f"{stimulus_set.name}-{sentence_id}"
        sentence_activations = candidate(stimuli=sentence_stimuli)
        # consecutive ids: sentence i covers 8 * i ... 8 * i + 7
        sentence_activations['stimulus_id'] = ('presentation', 8 * i + np.arange(0, 8))
        sentence_activations['sentence_id'] = ('presentation', [sentence_id] * 8)
        activations.append(sentence_activations)
    model_activations = merge_data_arrays(activations)
    # merging does not maintain stimulus order. the following orders again.
    # The merged id list is materialized once rather than once per stimulus.
    merged_stimulus_ids = model_activations['stimulus_id'].values.tolist()
    idx = [merged_stimulus_ids.index(stimulus_id) for stimulus_id in
           itertools.chain.from_iterable(s['stimulus_id'].values for s in activations)]
    assert len(set(idx)) == len(idx), "Found duplicate indices to order activations"
    model_activations = model_activations[{'presentation': idx}]

    return model_activations
Пример #15
0
def load_Pereira2018():
    """
    Load the Pereira2018 recordings of all subject directories into a single assembly.

    For every subject directory and each of the two experiment files, the `.mat` file is read and turned into
    a `presentation` x `neuroid` assembly; subjects without a given experiment file are skipped.
    All assemblies are merged and a `StimulusSet` named "Pereira2018", built from the first file seen
    for each experiment, is attached under `attrs['stimulus_set']`.

    Note that the stimuli of both experiments are concatenated at the end, so at least one file
    per experiment has to be present.
    """
    data_dir = neural_data_dir / "Pereira2018"
    experiment2, experiment3 = "243sentences.mat", "384sentences.mat"
    stimuli = {}  # experiment -> stimuli
    assemblies = []
    subject_directories = [d for d in data_dir.iterdir() if d.is_dir()]
    for subject_directory in tqdm(subject_directories, desc="subjects"):
        for experiment_filename in [experiment2, experiment3]:
            data_file = subject_directory / f"examples_{experiment_filename}"
            if not data_file.is_file():
                _logger.debug(
                    f"{subject_directory} does not contain {experiment_filename}"
                )
                continue
            data = scipy.io.loadmat(str(data_file))

            # assembly
            assembly = data['examples']
            meta = data['meta']
            # rows of `examples` become presentations, columns become neuroids.
            # the `index - 1` lookups below presumably convert 1-based labels to 0-based indices -- TODO confirm
            assembly = NeuroidAssembly(
                assembly,
                coords={
                    'experiment': ('presentation',
                                   [os.path.splitext(experiment_filename)[0]] *
                                   assembly.shape[0]),
                    'stimulus_num':
                    ('presentation', list(range(assembly.shape[0]))),
                    'passage_index':
                    ('presentation', data['labelsPassageForEachSentence'][:,
                                                                          0]),
                    'passage_label': ('presentation', [
                        data['keyPassages'][index - 1, 0][0]
                        for index in data['labelsPassageForEachSentence'][:, 0]
                    ]),
                    'passage_category': ('presentation', [
                        data['keyPassageCategory']
                        [0,
                         data['labelsPassageCategory'][index - 1, 0] - 1][0][0]
                        for index in data['labelsPassageForEachSentence']
                    ]),
                    'subject':
                    ('neuroid', [subject_directory.name] * assembly.shape[1]),
                    'voxel_num': ('neuroid', list(range(assembly.shape[1]))),
                    'AAL_roi_index':
                    ('neuroid', meta[0][0]['roiMultimaskAAL'][:, 0]),
                },
                dims=['presentation', 'neuroid'])
            stimulus_id = _build_id(assembly, ['experiment', 'stimulus_num'])
            assembly['stimulus_id'] = 'presentation', stimulus_id
            # set story for compatibility
            assembly['story'] = 'presentation', _build_id(
                assembly, ['experiment', 'passage_category'])
            assembly['neuroid_id'] = 'neuroid', _build_id(
                assembly, ['subject', 'voxel_num'])
            assemblies.append(assembly)

            # stimuli
            # only collected once per experiment, from the first subject that has the file
            if experiment_filename not in stimuli:
                sentences = data['keySentences']
                sentences = [sentence[0] for sentence in sentences[:, 0]]
                stimuli[experiment_filename] = {
                    'sentence': sentences,
                    'sentence_num': list(range(len(sentences))),
                    'stimulus_id': stimulus_id,
                    'experiment': assembly['experiment'].values,
                    'story': assembly['story'].values,
                }
                # 'experiment' and 'story' are already set above and are simply re-assigned here
                for copy_coord in [
                        'experiment', 'story', 'passage_index',
                        'passage_label', 'passage_category'
                ]:
                    stimuli[experiment_filename][copy_coord] = assembly[
                        copy_coord].values

    _logger.debug(f"Merging {len(assemblies)} assemblies")
    assembly = merge_data_arrays(assemblies)

    _logger.debug("Creating StimulusSet")
    # concatenate the per-experiment stimuli column by column (experiment2 first, then experiment3)
    combined_stimuli = {}
    for key in stimuli[experiment2]:
        combined_stimuli[key] = np.concatenate(
            (stimuli[experiment2][key], stimuli[experiment3][key]))
    stimuli = StimulusSet(combined_stimuli)
    stimuli.name = "Pereira2018"
    assembly.attrs['stimulus_set'] = stimuli
    return assembly
Пример #16
0
def load_Pereira2018_Blank_languageresiduals():
    """
    Build an assembly of residuals: what remains of the DMN/MD/language neuroids after subtracting
    the prediction of a linear regression fit on the visual/auditory neuroids.

    The target assembly and the cross-application helper are taken from the `PereiraEncoding` benchmark.
    Instead of a real correlation, the metric is given a function that stores `target - prediction`
    for every cross-validation split and returns a dummy score. The stored residuals are checked
    for the expected count and id coverage, merged, and returned with the original stimulus set attached.
    """
    # hijack the corresponding encoding benchmark to regress, but then store residuals instead of correlate
    from neural_nlp.benchmarks.neural import PereiraEncoding
    benchmark = PereiraEncoding()
    assembly, cross = benchmark._target_assembly, benchmark._cross
    residuals = []

    def store_residuals(nonlanguage_prediction, language_target):
        # stands in for the correlation: records the residual as a side effect
        residual = language_target - nonlanguage_prediction
        residuals.append(residual)
        return Score([0],
                     coords={'neuroid_id': ('neuroid', [0])},
                     dims=['neuroid'])  # dummy score

    pseudo_metric = CrossRegressedCorrelation(regression=linear_regression(
        xarray_kwargs=dict(stimulus_coord='stimulus_id')),
                                              correlation=store_residuals,
                                              crossvalidation_kwargs=dict(
                                                  splits=5,
                                                  kfold=True,
                                                  split_coord='stimulus_id',
                                                  stratification_coord=None))

    # separate language from non-language networks
    language_assembly = assembly[{
        'neuroid': [
            atlas in ['DMN', 'MD', 'language']
            for atlas in assembly['atlas'].values
        ]
    }]
    nonlanguage_assembly = assembly[{
        'neuroid': [
            atlas in ['visual', 'auditory']
            for atlas in assembly['atlas'].values
        ]
    }]

    # run
    def apply_cross(source_assembly, target_assembly):
        # filter experiment
        source_assembly = source_assembly[{
            'presentation': [
                stimulus_id in target_assembly['stimulus_id'].values
                for stimulus_id in source_assembly['stimulus_id'].values
            ]
        }]
        assert all(source_assembly['stimulus_id'].values ==
                   target_assembly['stimulus_id'].values)
        # filter subjects that have not done this experiment
        source_assembly = source_assembly.dropna('neuroid')
        # for the target assembly, it's going to become awkward if we just drop those neuroids.
        # instead, we set them to zero which makes for simple zero regression weights.
        target_assembly = target_assembly.fillna(0)
        # this will regress from joint visual+auditory neural space to one of the language networks
        return pseudo_metric(source_assembly, target_assembly)

    # the return value is discarded; the call is made for its side effect of filling `residuals`
    cross(language_assembly,
          apply=lambda cross_assembly: apply_cross(nonlanguage_assembly,
                                                   cross_assembly))

    # combine residuals
    assert len(
        residuals
    ) == 5 * 2 * 3  # 5-fold CV, 2 experiments, 3 language brain networks
    # ensure uniqueness
    neuroid_ids, stimulus_ids = [], []
    for residual in residuals:
        neuroid_ids += residual['neuroid_id'].values.tolist()
        stimulus_ids += residual['stimulus_id'].values.tolist()
    assert len(neuroid_ids) == len(language_assembly['neuroid']) * 5 * 2
    assert len(set(neuroid_ids)) == len(
        set(language_assembly['neuroid_id'].values))
    assert len(stimulus_ids) == len(language_assembly['presentation']) * 3
    assert len(set(stimulus_ids)) == len(
        set(language_assembly['stimulus_id'].values))
    residuals = merge_data_arrays(residuals)
    # re-wrap the merged result in the same assembly class as the input
    residuals = type(language_assembly)(residuals)
    residuals.attrs['stimulus_set'] = assembly.stimulus_set
    return residuals
Пример #17
0
def _merge_voxel_meta(data, meta, bold_shift_seconds):
    """
    Annotate the recorded timepoints of each story with the words that ended since the previous timepoint.

    Stories present in only one of `data` and `meta` trigger a warning; stories missing from `meta` are skipped.
    For every remaining story, each timepoint (shifted back by `bold_shift_seconds`) is assigned the words whose
    `time_end` falls after the last consumed word and up to the timepoint. Timepoints without any words are
    dropped, the timepoint dimension is re-labelled as `presentation`, and the text is stored
    in a `stimulus_sentence` coordinate.

    :param data: recordings with a `story` coordinate and a `timepoint_value` coordinate.
    :param meta: word annotations with `story` and `time_end` coordinates along `time_bin`.
    :param bold_shift_seconds: amount subtracted from every timepoint before matching it against word end times.
    :return: the merged per-story annotated data.
    """
    data_missing = set(meta['story'].values) - set(data['story'].values)
    if data_missing:
        warnings.warn(f"Stories missing from the data: {data_missing}")
    meta_missing = set(data['story'].values) - set(meta['story'].values)
    if meta_missing:
        warnings.warn(f"Stories missing from the meta: {meta_missing}")

    # tokens that are never included in the assembled sentences
    ignored_words = [None, '', '<s>', '</s>', '<s']
    annotated_data = []
    for story in tqdm(ordered_set(data['story'].values), desc='merge meta'):
        if story not in meta['story'].values:
            continue
        story_meta = meta.sel(story=story)
        story_meta = story_meta.sortby('time_end')

        story_data = data.sel(story=story).stack(timepoint=['timepoint_value'])
        story_data = story_data.sortby('timepoint_value')
        timepoints = story_data['timepoint_value'].values.tolist()
        assert is_sorted(timepoints)
        timepoints = [
            timepoint - bold_shift_seconds for timepoint in timepoints
        ]
        sentences = []
        last_timepoint = -np.inf
        for timepoint in timepoints:
            # stop once all annotated words have been consumed
            if last_timepoint >= max(story_meta['time_end'].values):
                break
            if timepoint <= 0:
                sentences.append(None)
                continue  # ignore fixation period
            # words whose end time lies in (last_timepoint, timepoint]
            timebin_meta = [
                last_timepoint < end <= timepoint
                for end in story_meta['time_end'].values
            ]
            timebin_meta = story_meta[{'time_bin': timebin_meta}]
            sentence = ' '.join(word.strip() for word in timebin_meta.values
                                if word not in ignored_words)
            sentence = sentence.lower().strip()
            # quick-fixes
            if story == 'Boar' and sentence == 'interactions the the':  # Boar duplicate
                sentence = 'interactions the'
            if story == 'KingOfBirds' and sentence == 'the fact that the larger':  # missing word in TextGrid
                sentence = 'earth ' + sentence
            if story == 'MrSticky' and sentence == 'worry don\'t worry i went extra slowly since it\'s':
                sentence = 'don\'t worry i went extra slowly since it\'s'
            sentences.append(sentence)
            # NOTE(review): this indexes the last selected word, so it relies on at least one word
            # falling into every positive timepoint's window -- confirm this always holds
            last_timepoint = timebin_meta['time_end'].values[-1]
        # keep only the timepoints that received a non-empty sentence
        sentence_index = [
            i for i, sentence in enumerate(sentences) if sentence
        ]
        sentences = np.array(sentences)[sentence_index]
        if story not in ['Boar', 'KingOfBirds',
                         'MrSticky']:  # ignore quick-fixes
            # sanity check: the concatenated sentences must reproduce the full annotated story text
            annotated_sentence = ' '.join(sentences)
            meta_sentence = ' '.join(word.strip() for word in story_meta.values if word not in ignored_words) \
                .lower().strip()
            assert annotated_sentence == meta_sentence
        # re-interpret timepoints as stimuli
        coords = {}
        for coord_name, dims, coord_value in walk_coords(story_data):
            dims = [
                dim if not dim.startswith('timepoint') else 'presentation'
                for dim in dims
            ]
            # discard the timepoints for which the stimulus did not change (empty word)
            coord_value = coord_value if not array_is_element(
                dims, 'presentation') else coord_value[sentence_index]
            coords[coord_name] = dims, coord_value
        coords = {
            **coords,
            **{
                'stimulus_sentence': ('presentation', sentences)
            }
        }
        # apply the same selection to the values themselves
        story_data = story_data[{
            dim: slice(None) if dim != 'timepoint' else sentence_index
            for dim in story_data.dims
        }]
        dims = [
            dim if not dim.startswith('timepoint') else 'presentation'
            for dim in story_data.dims
        ]
        story_data = xr.DataArray(story_data.values, coords=coords, dims=dims)
        story_data['story'] = 'presentation', [story] * len(
            story_data['presentation'])
        gather_indexes(story_data)
        annotated_data.append(story_data)
    annotated_data = merge_data_arrays(annotated_data)
    return annotated_data
Пример #18
0
def load_Pereira2018_Blank(version='base'):
    """
    Load the Pereira2018_Blank recordings for a fixed list of subjects into a single assembly.

    The presentation coordinates are copied from `load_Pereira2018()`. For every subject and experiment,
    the `.mat` file is read (missing files are skipped), and every voxel is repeated once for each atlas
    in which it has a non-NaN multimask entry, so that each neuroid carries exactly one atlas/ROI annotation.
    The per-experiment assemblies are merged per subject and then across subjects.

    :param version: 'base' reads from the "Pereira2018_Blank" directory and the `data` field; any other value
        reads "ICA_"-prefixed files from "Pereira2018_Blank_langonly" and the field `data{version}`.
    :return: the merged assembly with `attrs['version']` set and the reference stimulus set attached.
    """
    reference_data = load_Pereira2018()

    data_dir = neural_data_dir / ("Pereira2018_Blank" +
                                  ("_langonly" if version != 'base' else ""))
    experiments = {'n72': "243sentences", 'n96': "384sentences"}
    assemblies = []
    subjects = [
        '018', '199', '215', '288', '289', '296', '343', '366', '407', '426'
    ]
    for subject in tqdm(subjects, desc="subjects"):
        subject_assemblies = []
        for experiment_filepart, experiment_name in experiments.items():
            filepath = data_dir / f"{'ICA_' if version != 'base' else ''}" \
                                  f"{subject}_complang_passages_{experiment_filepart}_persent.mat"
            if not filepath.is_file():
                _logger.debug(
                    f"Subject {subject} did not run {experiment_name}: {filepath} does not exist"
                )
                continue
            data = scipy.io.loadmat(str(filepath))
            if version != 'base':
                # non-base files keep their content inside a struct named 'x'
                data = data['x'][0, 0]

            # construct assembly
            assembly = data['data' if version == 'base' else f'data{version}']
            neuroid_meta = data['meta']

            # expand voxels: one output column per (voxel, atlas) pair with a non-NaN multimask entry
            expanded_assembly = []
            voxel_nums, atlases, filter_strategies, atlas_selections, atlas_filter_lower, rois = [], [], [], [], [], []
            for voxel_num in range(assembly.shape[1]):
                for atlas_iter, atlas_selection in enumerate(
                        neuroid_meta['atlases'][0, 0][:, 0]):
                    multimask = neuroid_meta['roiMultimask'][0,
                                                             0][atlas_iter,
                                                                0][voxel_num,
                                                                   0]
                    if np.isnan(multimask):
                        continue
                    # atlas names are split on '_': first part is the atlas, last part the selection,
                    # and for three-part names the middle part is the filter strategy
                    atlas_selection = atlas_selection[0].split('_')
                    filter_strategy = None if len(
                        atlas_selection) != 3 else atlas_selection[1]
                    # NOTE(review): assumes the selection always matches this pattern;
                    # a non-matching name would make `filter_lower` None and fail on `.group`
                    filter_lower = re.match(r'from([0-9]{2})to100prcnt',
                                            atlas_selection[-1])
                    atlas_filter_lower.append(int(filter_lower.group(1)))
                    atlas, selection = atlas_selection[0], atlas_selection[-1]
                    atlases.append(atlas)
                    filter_strategies.append(filter_strategy)
                    atlas_selections.append(selection)
                    multimask = int(
                        multimask
                    ) - 1  # Matlab 1-based to Python 0-based indexing
                    rois.append(neuroid_meta['rois'][0, 0][atlas_iter,
                                                           0][multimask, 0][0])
                    voxel_nums.append(voxel_num)
                    expanded_assembly.append(assembly[:, voxel_num])
            # ensure all are mapped
            assert set(voxel_nums) == set(range(
                assembly.shape[1])), "not all voxels mapped"
            # add indices
            # stack the collected voxel columns and transpose back to presentation x neuroid
            assembly = np.stack(expanded_assembly).T
            assert assembly.shape[1] == len(atlases) == len(
                atlas_selections) == len(rois)
            indices_in_3d = neuroid_meta['indicesIn3D'][0, 0][:, 0]
            indices_in_3d = [
                indices_in_3d[voxel_num] for voxel_num in voxel_nums
            ]
            # add coords
            col_to_coords = np.array([
                neuroid_meta['colToCoord'][0, 0][voxel_num]
                for voxel_num in voxel_nums
            ])

            # put it all together
            # presentation coordinates come from the reference data of the same experiment
            assembly = NeuroidAssembly(
                assembly,
                coords={
                    **{
                        coord: (dims, value)
                        for coord, dims, value in walk_coords(
                            reference_data.sel(experiment=experiment_name)['presentation'])
                    },
                    **{
                        'experiment': ('presentation', [experiment_name] * assembly.shape[0]),
                        'subject': ('neuroid', [subject] * assembly.shape[1]),
                        'voxel_num': ('neuroid', voxel_nums),
                        'atlas': ('neuroid', atlases),
                        'filter_strategy': ('neuroid', filter_strategies),
                        'atlas_selection': ('neuroid', atlas_selections),
                        'atlas_selection_lower': ('neuroid', atlas_filter_lower),
                        'roi': ('neuroid', rois),
                        'indices_in_3d': ('neuroid', indices_in_3d),
                        'col_to_coord_1': ('neuroid', col_to_coords[:, 0]),
                        'col_to_coord_2': ('neuroid', col_to_coords[:, 1]),
                        'col_to_coord_3': ('neuroid', col_to_coords[:, 2]),
                    }
                },
                dims=['presentation', 'neuroid'])
            assembly['neuroid_id'] = 'neuroid', _build_id(
                assembly, ['subject', 'voxel_num'])
            subject_assemblies.append(assembly)
        # merge this subject's experiments before merging across subjects
        assembly = merge_data_arrays(subject_assemblies)
        assemblies.append(assembly)

    _logger.debug(f"Merging {len(assemblies)} assemblies")
    assembly = merge_data_arrays(assemblies)
    assembly.attrs['version'] = version

    _logger.debug("Creating StimulusSet")
    assembly.attrs['stimulus_set'] = reference_data.stimulus_set
    return assembly