def load_time_meta():
    """
    Load the word-level timing annotations of the stories.

    Every file matching ``*TextGrid*`` in the stories TextGrid directory is loaded and
    its `words` tier is converted into an array of words along a `time_bin` dimension.
    Each word is annotated with the name of the file it came from, the story, and the
    start and end time given in the transcript.

    :return: the per-file word arrays combined with `merge_data_arrays`
    """
    textgrid_dir = neural_data_dir / 'StoriesData_Dec2018' / 'stories_textgridsbyJeanne'
    story_words = []
    for textgrid_path in textgrid_dir.glob("*TextGrid*"):
        textgrid = TextGrid.load(textgrid_path)
        word_tier = [tier for tier in textgrid.tiers if tier.nameid == 'words'][0]

        # collect the transcript column-wise
        columns = defaultdict(list)
        for time_start, time_end, word in word_tier.simple_transcript:
            columns['time_start'].append(float(time_start))
            columns['time_end'].append(float(time_end))
            columns['word'].append(word)

        # the file stem is interpreted as the story number
        story_number = int(textgrid_path.stem)
        story_names = stories_meta.sel(number=story_number).values
        story = next(iter(set(story_names)))  # Boar was read twice

        num_words = len(columns['word'])
        word_array = DataArray(
            columns['word'],
            coords={
                'filepath': ('time_bin', [textgrid_path.name] * num_words),
                'story': ('time_bin', [story] * num_words),
                'time_start': ('time_bin', columns['time_start']),
                'time_end': ('time_bin', columns['time_end']),
            },
            dims=['time_bin'])
        gather_indexes(word_array)
        story_words.append(word_array)
    return merge_data_arrays(story_words)
def listen_to(candidate, stimulus_set, reset_column='story', average_sentence=True):
    """
    Pass a `stimulus_set` through a model `candidate`.
    Operates on a sentence-based `stimulus_set`.

    The stimuli are split by the unique values of `reset_column` (in order of first
    appearance) and the candidate is called once per split. The per-split activations
    are merged and then re-ordered so that the `presentation` dimension follows the
    order in which the activations were produced.

    :param candidate: callable invoked as `candidate(stimuli=..., average_sentence=...)`;
        its result must carry a `stimulus_id` coordinate.
    :param stimulus_set: stimuli to present; must have a `name` and a `reset_column` column.
    :param reset_column: column whose values define the groups that are presented together.
    :param average_sentence: forwarded to the candidate unchanged.
    :return: the merged activations, ordered like the concatenated per-group activations.
    :raises ValueError: if a stimulus_id of the per-group activations is missing after merging.
    """
    activations = []
    for story in ordered_set(stimulus_set[reset_column].values):
        story_stimuli = stimulus_set[stimulus_set[reset_column] == story]
        story_stimuli.name = f"{stimulus_set.name}-{story}"
        story_activations = candidate(stimuli=story_stimuli, average_sentence=average_sentence)
        activations.append(story_activations)
    model_activations = merge_data_arrays(activations)

    # merging does not maintain stimulus order. the following orders again.
    # Build the id -> position table once instead of converting and scanning the
    # merged ids for every single stimulus; `setdefault` keeps the first
    # occurrence, which is what `list.index` would return.
    merged_position = {}
    for position, merged_id in enumerate(model_activations['stimulus_id'].values.tolist()):
        merged_position.setdefault(merged_id, position)
    expected_ids = list(itertools.chain.from_iterable(
        s['stimulus_id'].values for s in activations))
    try:
        idx = [merged_position[stimulus_id] for stimulus_id in expected_ids]
    except KeyError as error:
        # keep the exception type that the previous `list.index` lookup raised
        raise ValueError(f"{error.args[0]!r} is not in the merged activations") from error
    assert len(set(idx)) == len(idx), "Found duplicate indices to order activations"
    model_activations = model_activations[{'presentation': idx}]
    return model_activations
def from_paths(self, *args, **kwargs):
    """
    Extract activations and turn the per-timestep layers into a `time_step` dimension.

    The parent extractor returns layers whose names end in ``-t<timestep>``. Those
    names are parsed into a region, a layer name without the timestep suffix and the
    timestep itself. Each (region, timestep) slice is wrapped into an assembly with a
    leading `time_step` dimension of length one, the slices are merged, and
    `neuroid_id` is rebuilt from `model`, `region` and `neuroid_num`.

    :param args: forwarded to the parent `from_paths`.
    :param kwargs: forwarded to the parent `from_paths`.
    :return: the merged activations with an additional `time_step` dimension.
    """
    raw_activations = super(TemporalExtractor, self).from_paths(*args, **kwargs)

    # introduce time dimension
    layers_by_region = defaultdict(list)
    for layer in set(raw_activations['layer'].values):
        match = re.match(r'(([^-]*)\..*|logits|avgpool)-t([0-9]+)', layer)
        stripped_layer, prefix, timestep = match.group(1), match.group(2), match.group(3)
        # `logits`/`avgpool` have no dotted prefix and act as their own region
        region = prefix if prefix else stripped_layer
        layers_by_region[region].append((layer, stripped_layer, timestep))

    per_time = {}
    for region, time_layers in layers_by_region.items():
        for full_layer, stripped_layer, timestep in time_layers:
            layer_activations = raw_activations.sel(layer=full_layer)
            num_neuroids = len(layer_activations['neuroid'])
            layer_activations['layer'] = 'neuroid', [stripped_layer] * num_neuroids
            per_time[(region, timestep)] = layer_activations

    time_assemblies = []
    for (region, timestep), key_activations in per_time.items():
        key_activations['region'] = 'neuroid', [region] * len(key_activations['neuroid'])
        coords = {
            coord: (dims, values)
            for coord, dims, values in walk_coords(key_activations)
            # otherwise, neuroid dim will be as large as before with nans
            if coord != 'neuroid_id'
        }
        coords['time_step'] = [int(timestep)]
        time_assemblies.append(NeuroidAssembly(
            [key_activations.values],
            coords=coords,
            dims=['time_step'] + list(key_activations.dims)))
    merged = merge_data_arrays(time_assemblies)

    # rebuild neuroid_id without timestep
    id_parts = [merged[coord].values for coord in ['model', 'region', 'neuroid_num']]
    neuroid_id = [".".join([f"{value}" for value in values]) for values in zip(*id_parts)]
    merged['neuroid_id'] = 'neuroid', neuroid_id
    return merged
def get_activations_for_sentence(model_name, layers, sentences):
    """
    Retrieve the activations of a model for each sentence individually.

    The model is loaded once; every sentence is then passed on its own (as a
    single-element list) and the results are merged.

    :param model_name: identifier passed to `load_model`.
    :param layers: layers to record, forwarded to `model.get_activations`.
    :param sentences: iterable of sentences.
    :return: the merged per-sentence activations.
    """
    model = load_model(model_name)
    per_sentence = [model.get_activations([sentence], layers=layers) for sentence in sentences]
    return merge_data_arrays(per_sentence)
def extrapolate_neuroid(self, ceilings):
    """
    Extrapolate the ceiling of a single neuroid from ceilings at increasing subject counts.

    For each of `self.num_bootstraps` bootstraps, the scores available at every
    `num_subjects` value are resampled with replacement and averaged, and `self.fit`
    is run on the resulting (num_subjects, mean score) points. The fitted curves are
    evaluated on x = 0..999 to find the first x at which the median curve increases by
    less than a fixed threshold; the spread of the curves at that x gives the error.

    :param ceilings: scores with a `num_subjects` coordinate and a
        `sub_<subject_column>` dimension (optionally also a `split` dimension).
    :return: a Score with `aggregation` values `center`, `error_low`, `error_high`.
        The attrs hold the raw ceilings (`raw`), the fitted parameters per bootstrap
        (`bootstrapped_params`) and the endpoint (`endpoint_x`).
    """
    # figure out how many extrapolation x points we have. E.g. for Pereira, not all combinations are possible
    subject_subsamples = list(sorted(set(ceilings['num_subjects'].values)))
    rng = RandomState(0)  # fixed seed so that the bootstraps are reproducible
    bootstrap_params = []
    for bootstrap in range(self.num_bootstraps):
        bootstrapped_scores = []
        for num_subjects in subject_subsamples:
            num_scores = ceilings.sel(num_subjects=num_subjects)
            # the sub_subjects dimension creates nans, get rid of those
            num_scores = num_scores.dropna(f'sub_{self.subject_column}')
            assert set(num_scores.dims) == {f'sub_{self.subject_column}', 'split'} or \
                set(num_scores.dims) == {f'sub_{self.subject_column}'}
            # choose from subject subsets and the splits therein, with replacement for variance
            choices = num_scores.values.flatten()
            bootstrapped_score = rng.choice(choices, size=len(choices), replace=True)
            bootstrapped_scores.append(np.mean(bootstrapped_score))

        try:
            params = self.fit(subject_subsamples, bootstrapped_scores)
        except RuntimeError:  # optimal parameters not found
            params = [np.nan, np.nan]
        params = DataAssembly([params],
                              coords={'bootstrap': [bootstrap], 'param': ['v0', 'tau0']},
                              dims=['bootstrap', 'param'])
        bootstrap_params.append(params)
    bootstrap_params = merge_data_arrays(bootstrap_params)

    # find endpoint and error
    asymptote_threshold = .0005
    interpolation_xs = np.arange(1000)
    # bootstraps whose fit failed carry NaN parameters and are left out of the curves
    ys = np.array([v(interpolation_xs, *params) for params in bootstrap_params.values
                   if not np.isnan(params).any()])
    median_ys = np.median(ys, axis=0)
    diffs = np.diff(median_ys)
    end_x = np.where(diffs < asymptote_threshold)[0].min()  # first x where increase smaller than threshold

    # put together
    # Failed fits are NaN rows; ignore them here as well, consistent with `ys` above.
    # A plain median would turn the whole score into NaN after a single failed fit.
    center = np.nanmedian(np.array(bootstrap_params)[:, 0])
    error = ci_error(ys[:, end_x], center=center)
    score = Score([center] + list(error),
                  coords={'aggregation': ['center', 'error_low', 'error_high']},
                  dims=['aggregation'])
    score.attrs['raw'] = ceilings
    score.attrs['bootstrapped_params'] = bootstrap_params
    score.attrs['endpoint_x'] = DataAssembly(end_x)
    return score
def predict(self, source):
    """
    Apply the wrapped regression separately to every time bin of `source`.

    :param source: assembly with a `time_bin` index made of (start, end) pairs.
    :return: the per-bin predictions merged along a stacked `time_bin` dimension.
    """

    def predict_bin(time_bin_start, time_bin_end):
        bin_source = source.sel(time_bin=(time_bin_start, time_bin_end))
        bin_prediction = self._regression.predict(bin_source)
        # label the prediction with the time bin it was computed for
        bin_prediction = bin_prediction.expand_dims('time_bin_start').expand_dims('time_bin_end')
        bin_prediction['time_bin_start'] = [time_bin_start]
        bin_prediction['time_bin_end'] = [time_bin_end]
        return bin_prediction.stack(time_bin=['time_bin_start', 'time_bin_end'])

    bin_predictions = [predict_bin(start, end) for start, end in source['time_bin'].values]
    return merge_data_arrays(bin_predictions)
def cross_correlation(prediction, target, cross, correlation):
    """
    Correlate `prediction` and `target` separately for every value of `cross`, then average.

    :param prediction: predicted assembly; its `cross` coordinate must equal the target's.
    :param target: target assembly.
    :param cross: name of the (multi-)index to iterate over.
    :param correlation: callable `correlation(prediction, target)` returning a score.
    :return: the per-value scores merged and averaged over `cross` via `apply_aggregate`.
    """
    assert (prediction[cross] == target[cross]).all()
    cross_coords = [coord for coord, _, _ in walk_coords(target[cross])]
    scores = []
    for cross_value in target[cross].values:
        selector = {cross: cross_value}
        score = correlation(prediction.sel(**selector), target.sel(**selector))
        # re-attach the selected value as a stacked `cross` dimension of length one
        for coord, coord_value in zip(cross_coords, cross_value):
            score = score.expand_dims(coord)
            score[coord] = [coord_value]
        scores.append(score.stack(**{cross: cross_coords}))
    merged = merge_data_arrays(scores)
    return apply_aggregate(lambda values: values.mean(cross), merged)
def look_at(self, stimuli, number_of_trials=1):
    """
    Record from the wrapped layer model and repeat its responses for every time bin.

    :param stimuli: stimuli forwarded to the layer model.
    :param number_of_trials: forwarded to the layer model.
    :return: the responses with a `time_bin` dimension holding one copy per entry of
        `self._time_bins`; with exactly one time bin, that dimension is squeezed away.
    """
    layer_responses = self._layer_model.look_at(stimuli, number_of_trials=number_of_trials)
    self._logger.debug(f'Repeating single assembly across time bins {self._time_bins}')
    time_responses = []
    for time_bin in self._time_bins:
        if isinstance(time_bin, np.ndarray):
            time_bin = time_bin.tolist()
        time_bin_start, time_bin_end = time_bin
        bin_responses = layer_responses.expand_dims('time_bin_start').expand_dims('time_bin_end')
        bin_responses['time_bin_start'] = [time_bin_start]
        bin_responses['time_bin_end'] = [time_bin_end]
        time_responses.append(bin_responses.stack(time_bin=['time_bin_start', 'time_bin_end']))
    merged = merge_data_arrays(time_responses)
    if len(self._time_bins) == 1:
        merged = merged.squeeze('time_bin')
    return fix_timebin_naming(merged)
def merge(cls, *scores, ignore_exceptions=False):
    """
    Merge scores, including the raw values stored in their attributes.

    :param scores: the scores to merge.
    :param ignore_exceptions: if True, any exception during merging is turned into a
        warning and None is returned; otherwise the exception is raised.
    :return: the merged score. The raw values found under `cls.RAW_VALUES_KEY` are
        merged separately, ignoring failures, and attached to the result.
    """
    try:
        merged = merge_data_arrays(scores)
        raw_values = [score.attrs[cls.RAW_VALUES_KEY] for score in scores
                      if cls.RAW_VALUES_KEY in score.attrs]
        if raw_values:
            merged.attrs[cls.RAW_VALUES_KEY] = Score.merge(*raw_values, ignore_exceptions=True)
    except Exception as error:
        if not ignore_exceptions:
            raise error
        warnings.warn("failed to merge raw values: " + str(error))
        return None
    return merged
def read_words(candidate, stimulus_set, reset_column='sentence_id', copy_columns=(), average_sentence=False):
    """
    Pass a `stimulus_set` through a model `candidate`.
    In contrast to the `listen_to` function, this function operates on a word-based `stimulus_set`.

    The words are grouped by `reset_column` (in order of first appearance), the words
    of each group are joined with spaces into one sentence, and the candidate is
    called once per sentence. The per-sentence activations are merged and re-ordered
    so that the `presentation` dimension follows the order in which they were produced.

    :param candidate: callable invoked as `candidate(stimuli=..., average_sentence=...)`;
        its result must carry a `stimulus_id` coordinate.
    :param stimulus_set: word-based stimuli with a `word` column, a `reset_column`
        column and a `name`.
    :param reset_column: column identifying which words belong together.
    :param copy_columns: columns of `stimulus_set` copied onto the `presentation`
        dimension of each group's activations.
    :param average_sentence: forwarded to the candidate unchanged.
    :return: the merged activations, ordered like the concatenated per-group activations.
    :raises ValueError: if a stimulus_id of the per-group activations is missing after merging.
    """
    activations = []
    for reset_id in ordered_set(stimulus_set[reset_column].values):
        part_stimuli = stimulus_set[stimulus_set[reset_column] == reset_id]
        sentence_stimuli = StimulusSet({
            'sentence': ' '.join(part_stimuli['word']),
            reset_column: list(set(part_stimuli[reset_column]))
        })
        sentence_stimuli.name = f"{stimulus_set.name}-{reset_id}"
        sentence_activations = candidate(stimuli=sentence_stimuli, average_sentence=average_sentence)
        for column in copy_columns:
            sentence_activations[column] = ('presentation', part_stimuli[column])
        activations.append(sentence_activations)
    model_activations = merge_data_arrays(activations)

    # merging does not maintain stimulus order. the following orders again.
    # Build the id -> position table once instead of converting and scanning the
    # merged ids for every single stimulus; `setdefault` keeps the first
    # occurrence, which is what `list.index` would return.
    merged_position = {}
    for position, merged_id in enumerate(model_activations['stimulus_id'].values.tolist()):
        merged_position.setdefault(merged_id, position)
    expected_ids = list(itertools.chain.from_iterable(
        s['stimulus_id'].values for s in activations))
    try:
        idx = [merged_position[stimulus_id] for stimulus_id in expected_ids]
    except KeyError as error:
        # keep the exception type that the previous `list.index` lookup raised
        raise ValueError(f"{error.args[0]!r} is not in the merged activations") from error
    assert len(set(idx)) == len(idx), "Found duplicate indices to order activations"
    model_activations = model_activations[{'presentation': idx}]
    return model_activations
def _merge_voxel_meta(data, meta, bold_shift_seconds):
    """
    Annotate the recorded timepoints of each story with the words that ended in them.

    For every story present in both `data` and `meta`, the timepoints are shifted back
    by `bold_shift_seconds` and each shifted timepoint is assigned the words whose
    `time_end` falls between the previously annotated word end and the timepoint.
    Timepoints without any words are discarded and the remaining ones are
    re-interpreted as stimulus presentations.

    :param data: recordings with a `story` coordinate and a `timepoint_value` coordinate.
    :param meta: words with `story` and `time_end` coordinates along `time_bin`.
    :param bold_shift_seconds: number of seconds subtracted from every timepoint.
    :return: the per-story annotated data combined with `merge_data_arrays`; it has a
        `presentation` dimension with `stimulus_sentence` and `story` coordinates.
    """
    # warn about stories that are present on only one of the two sides
    data_missing = set(meta['story'].values) - set(data['story'].values)
    if data_missing:
        warnings.warn(f"Stories missing from the data: {data_missing}")
    meta_missing = set(data['story'].values) - set(meta['story'].values)
    if meta_missing:
        warnings.warn(f"Stories missing from the meta: {meta_missing}")
    # tokens that do not count as words when building a sentence
    ignored_words = [None, '', '<s>', '</s>', '<s']
    annotated_data = []
    for story in tqdm(ordered_set(data['story'].values), desc='merge meta'):
        if story not in meta['story'].values:
            continue
        story_meta = meta.sel(story=story)
        story_meta = story_meta.sortby('time_end')
        story_data = data.sel(story=story).stack(timepoint=['timepoint_value'])
        story_data = story_data.sortby('timepoint_value')
        timepoints = story_data['timepoint_value'].values.tolist()
        assert is_sorted(timepoints)
        # shift the timepoints back by the given number of seconds
        timepoints = [
            timepoint - bold_shift_seconds for timepoint in timepoints
        ]
        # one entry per visited timepoint: None, a (possibly empty) string of words
        sentences = []
        last_timepoint = -np.inf
        for timepoint in timepoints:
            # stop once the last word of the story has been assigned
            if last_timepoint >= max(story_meta['time_end'].values):
                break
            if timepoint <= 0:
                sentences.append(None)
                continue  # ignore fixation period
            # select the words that ended after the last annotated word and up to this timepoint
            timebin_meta = [
                last_timepoint < end <= timepoint
                for end in story_meta['time_end'].values
            ]
            timebin_meta = story_meta[{'time_bin': timebin_meta}]
            sentence = ' '.join(word.strip() for word in timebin_meta.values if word not in ignored_words)
            sentence = sentence.lower().strip()
            # quick-fixes
            if story == 'Boar' and sentence == 'interactions the the':  # Boar duplicate
                sentence = 'interactions the'
            if story == 'KingOfBirds' and sentence == 'the fact that the larger':  # missing word in TextGrid
                sentence = 'earth ' + sentence
            if story == 'MrSticky' and sentence == 'worry don\'t worry i went extra slowly since it\'s':
                sentence = 'don\'t worry i went extra slowly since it\'s'
            sentences.append(sentence)
            # NOTE(review): indexing [-1] assumes at least one word ended within this
            # time bin; an empty selection would raise an IndexError -- confirm with the data
            last_timepoint = timebin_meta['time_end'].values[-1]
        # keep only the timepoints that received a non-empty sentence
        sentence_index = [
            i for i, sentence in enumerate(sentences) if sentence
        ]
        sentences = np.array(sentences)[sentence_index]
        if story not in ['Boar', 'KingOfBirds', 'MrSticky']:  # ignore quick-fixes
            # sanity check: the annotated sentences must reproduce the full story text
            annotated_sentence = ' '.join(sentences)
            meta_sentence = ' '.join(word.strip() for word in story_meta.values if word not in ignored_words) \
                .lower().strip()
            assert annotated_sentence == meta_sentence
        # re-interpret timepoints as stimuli
        coords = {}
        for coord_name, dims, coord_value in walk_coords(story_data):
            # rename every dimension starting with `timepoint` to `presentation`
            dims = [
                dim if not dim.startswith('timepoint') else 'presentation'
                for dim in dims
            ]
            # discard the timepoints for which the stimulus did not change (empty word)
            coord_value = coord_value if not array_is_element(
                dims, 'presentation') else coord_value[sentence_index]
            coords[coord_name] = dims, coord_value
        coords = {
            **coords,
            **{
                'stimulus_sentence': ('presentation', sentences)
            }
        }
        # apply the same timepoint selection to the data values
        story_data = story_data[{
            dim: slice(None) if dim != 'timepoint' else sentence_index
            for dim in story_data.dims
        }]
        dims = [
            dim if not dim.startswith('timepoint') else 'presentation'
            for dim in story_data.dims
        ]
        story_data = xr.DataArray(story_data.values,
                                  coords=coords,
                                  dims=dims)
        story_data['story'] = 'presentation', [story] * len(
            story_data['presentation'])
        gather_indexes(story_data)
        annotated_data.append(story_data)
    annotated_data = merge_data_arrays(annotated_data)
    return annotated_data
def load_Pereira2018_Blank_languageresiduals():
    """
    Compute what remains of the language-related recordings after regressing from the
    non-language recordings.

    The `PereiraEncoding` benchmark supplies the target assembly and its cross-validation
    scheme. A cross-validated linear regression is run from the `visual` and `auditory`
    neuroids to the `DMN`, `MD` and `language` neuroids. Instead of correlating
    prediction and target, the difference between the two is stored for every split.

    :return: the merged residuals, of the same type as the language assembly, with the
        benchmark's `stimulus_set` attached.
    """
    # hijack the corresponding encoding benchmark to regress, but then store residuals instead of correlate
    from neural_nlp.benchmarks.neural import PereiraEncoding
    benchmark = PereiraEncoding()
    assembly, cross = benchmark._target_assembly, benchmark._cross
    residuals = []

    def store_residuals(nonlanguage_prediction, language_target):
        residuals.append(language_target - nonlanguage_prediction)
        # dummy score
        return Score([0], coords={'neuroid_id': ('neuroid', [0])}, dims=['neuroid'])

    regression = linear_regression(xarray_kwargs={'stimulus_coord': 'stimulus_id'})
    pseudo_metric = CrossRegressedCorrelation(
        regression=regression,
        correlation=store_residuals,
        crossvalidation_kwargs={'splits': 5, 'kfold': True,
                                'split_coord': 'stimulus_id', 'stratification_coord': None})

    # separate language from non-language networks
    atlases = assembly['atlas'].values
    is_language = [atlas in ['DMN', 'MD', 'language'] for atlas in atlases]
    is_nonlanguage = [atlas in ['visual', 'auditory'] for atlas in atlases]
    language_assembly = assembly[{'neuroid': is_language}]
    nonlanguage_assembly = assembly[{'neuroid': is_nonlanguage}]

    # run
    def apply_cross(source_assembly, target_assembly):
        # filter experiment
        target_stimulus_ids = target_assembly['stimulus_id'].values
        in_target = [stimulus_id in target_stimulus_ids
                     for stimulus_id in source_assembly['stimulus_id'].values]
        source_assembly = source_assembly[{'presentation': in_target}]
        assert all(source_assembly['stimulus_id'].values == target_assembly['stimulus_id'].values)
        # filter subjects that have not done this experiment
        source_assembly = source_assembly.dropna('neuroid')
        # for the target assembly, it's going to become awkward if we just drop those neuroids.
        # instead, we set them to zero which makes for simple zero regression weights.
        target_assembly = target_assembly.fillna(0)
        # this will regress from joint visual+auditory neural space to one of the language networks
        return pseudo_metric(source_assembly, target_assembly)

    def regress_from_nonlanguage(cross_assembly):
        return apply_cross(nonlanguage_assembly, cross_assembly)

    cross(language_assembly, apply=regress_from_nonlanguage)

    # combine residuals
    assert len(residuals) == 5 * 2 * 3  # 5-fold CV, 2 experiments, 3 language brain networks
    # ensure uniqueness
    neuroid_ids, stimulus_ids = [], []
    for residual in residuals:
        neuroid_ids.extend(residual['neuroid_id'].values.tolist())
        stimulus_ids.extend(residual['stimulus_id'].values.tolist())
    assert len(neuroid_ids) == len(language_assembly['neuroid']) * 5 * 2
    assert len(set(neuroid_ids)) == len(set(language_assembly['neuroid_id'].values))
    assert len(stimulus_ids) == len(language_assembly['presentation']) * 3
    assert len(set(stimulus_ids)) == len(set(language_assembly['stimulus_id'].values))

    merged = merge_data_arrays(residuals)
    merged = type(language_assembly)(merged)
    merged.attrs['stimulus_set'] = assembly.stimulus_set
    return merged
def load_Pereira2018():
    """
    Load the Pereira2018 recordings of all subjects together with their stimuli.

    Every sub-directory of the data directory is treated as one subject. For each
    subject, the `.mat` files of the two experiments (243 and 384 sentences) are read
    if present and turned into a presentation x neuroid assembly. The sentences are
    collected once per experiment, from the first file encountered for it.

    :return: the merged assembly of all subjects and experiments, with the combined
        stimuli of both experiments attached as its `stimulus_set` attribute.
    """
    data_dir = neural_data_dir / "Pereira2018"
    experiment2, experiment3 = "243sentences.mat", "384sentences.mat"
    stimuli = {}  # experiment -> stimuli
    assemblies = []
    subject_directories = [entry for entry in data_dir.iterdir() if entry.is_dir()]
    for subject_directory in tqdm(subject_directories, desc="subjects"):
        for experiment_filename in [experiment2, experiment3]:
            data_file = subject_directory / f"examples_{experiment_filename}"
            if not data_file.is_file():
                _logger.debug(f"{subject_directory} does not contain {experiment_filename}")
                continue
            data = scipy.io.loadmat(str(data_file))

            # assembly
            examples = data['examples']
            meta = data['meta']
            num_presentations, num_neuroids = examples.shape[0], examples.shape[1]
            experiment = os.path.splitext(experiment_filename)[0]
            sentence_passages = data['labelsPassageForEachSentence']
            passage_labels = [data['keyPassages'][index - 1, 0][0]
                              for index in sentence_passages[:, 0]]
            # here, `index` is a whole row of the label array rather than a scalar
            passage_categories = [
                data['keyPassageCategory'][0, data['labelsPassageCategory'][index - 1, 0] - 1][0][0]
                for index in sentence_passages]
            assembly = NeuroidAssembly(
                examples,
                coords={
                    'experiment': ('presentation', [experiment] * num_presentations),
                    'stimulus_num': ('presentation', list(range(num_presentations))),
                    'passage_index': ('presentation', sentence_passages[:, 0]),
                    'passage_label': ('presentation', passage_labels),
                    'passage_category': ('presentation', passage_categories),
                    'subject': ('neuroid', [subject_directory.name] * num_neuroids),
                    'voxel_num': ('neuroid', list(range(num_neuroids))),
                    'AAL_roi_index': ('neuroid', meta[0][0]['roiMultimaskAAL'][:, 0]),
                },
                dims=['presentation', 'neuroid'])
            stimulus_id = _build_id(assembly, ['experiment', 'stimulus_num'])
            assembly['stimulus_id'] = 'presentation', stimulus_id
            # set story for compatibility
            assembly['story'] = 'presentation', _build_id(assembly, ['experiment', 'passage_category'])
            assembly['neuroid_id'] = 'neuroid', _build_id(assembly, ['subject', 'voxel_num'])
            assemblies.append(assembly)

            # stimuli: only collected from the first file of each experiment
            if experiment_filename in stimuli:
                continue
            sentences = [sentence[0] for sentence in data['keySentences'][:, 0]]
            experiment_stimuli = {
                'sentence': sentences,
                'sentence_num': list(range(len(sentences))),
                'stimulus_id': stimulus_id,
            }
            for copy_coord in ['experiment', 'story', 'passage_index', 'passage_label', 'passage_category']:
                experiment_stimuli[copy_coord] = assembly[copy_coord].values
            stimuli[experiment_filename] = experiment_stimuli

    _logger.debug(f"Merging {len(assemblies)} assemblies")
    assembly = merge_data_arrays(assemblies)

    _logger.debug("Creating StimulusSet")
    combined_stimuli = {
        key: np.concatenate((stimuli[experiment2][key], stimuli[experiment3][key]))
        for key in stimuli[experiment2]
    }
    stimulus_set = StimulusSet(combined_stimuli)
    stimulus_set.name = "Pereira2018"
    assembly.attrs['stimulus_set'] = stimulus_set
    return assembly
def load_Pereira2018_Blank(version='base'):
    """
    Load the Pereira2018 recordings with per-voxel atlas annotations for a fixed list of subjects.

    For every subject and experiment, the `.mat` file is read if it exists and every
    voxel is repeated once per atlas for which it has a non-NaN entry in the ROI
    multimask. The presentation coordinates are copied from `load_Pereira2018`.

    :param version: 'base' reads key `data` from the `Pereira2018_Blank` directory; any
        other value reads key `data<version>` from the `x` struct of `ICA_`-prefixed
        files in the `Pereira2018_Blank_langonly` directory.
    :return: the merged assembly over all subjects and experiments, with the `version`
        and the reference `stimulus_set` stored in its attributes.
    """
    # the reference data provides the presentation coordinates and the stimulus set
    reference_data = load_Pereira2018()
    data_dir = neural_data_dir / ("Pereira2018_Blank" + ("_langonly" if version != 'base' else ""))
    # part of the filename -> name of the experiment in the reference data
    experiments = {'n72': "243sentences", 'n96': "384sentences"}
    assemblies = []
    subjects = [
        '018', '199', '215', '288', '289', '296', '343', '366', '407', '426'
    ]
    for subject in tqdm(subjects, desc="subjects"):
        subject_assemblies = []
        for experiment_filepart, experiment_name in experiments.items():
            filepath = data_dir / f"{'ICA_' if version != 'base' else ''}" \
                                  f"{subject}_complang_passages_{experiment_filepart}_persent.mat"
            if not filepath.is_file():
                _logger.debug(
                    f"Subject {subject} did not run {experiment_name}: {filepath} does not exist"
                )
                continue
            data = scipy.io.loadmat(str(filepath))
            if version != 'base':
                # non-base files wrap their contents in a struct `x`
                data = data['x'][0, 0]
            # construct assembly
            assembly = data['data' if version == 'base' else f'data{version}']
            neuroid_meta = data['meta']
            # one entry per (voxel, atlas) pair with a non-NaN multimask value; the lists below stay in lockstep
            expanded_assembly = []
            voxel_nums, atlases, filter_strategies, atlas_selections, atlas_filter_lower, rois = [], [], [], [], [], []
            for voxel_num in range(assembly.shape[1]):
                for atlas_iter, atlas_selection in enumerate(
                        neuroid_meta['atlases'][0, 0][:, 0]):
                    multimask = neuroid_meta['roiMultimask'][0, 0][atlas_iter, 0][voxel_num, 0]
                    if np.isnan(multimask):
                        # no ROI entry for this voxel in this atlas
                        continue
                    # the atlas name is split on `_`: the first part is the atlas, the last
                    # part the selection, and with three parts the middle one is the filter strategy
                    atlas_selection = atlas_selection[0].split('_')
                    filter_strategy = None if len(
                        atlas_selection) != 3 else atlas_selection[1]
                    # NOTE(review): assumes the selection always matches this pattern;
                    # `filter_lower` would be None otherwise and the next line would fail
                    filter_lower = re.match(r'from([0-9]{2})to100prcnt',
                                            atlas_selection[-1])
                    atlas_filter_lower.append(int(filter_lower.group(1)))
                    atlas, selection = atlas_selection[0], atlas_selection[-1]
                    atlases.append(atlas)
                    filter_strategies.append(filter_strategy)
                    atlas_selections.append(selection)
                    multimask = int(
                        multimask
                    ) - 1  # Matlab 1-based to Python 0-based indexing
                    rois.append(neuroid_meta['rois'][0, 0][atlas_iter, 0][multimask, 0][0])
                    voxel_nums.append(voxel_num)
                    expanded_assembly.append(assembly[:, voxel_num])
            # ensure all are mapped
            assert set(voxel_nums) == set(range(
                assembly.shape[1])), "not all voxels mapped"
            # add indices
            # stack the selected voxel columns and transpose them back into columns
            assembly = np.stack(expanded_assembly).T
            assert assembly.shape[1] == len(atlases) == len(
                atlas_selections) == len(rois)
            indices_in_3d = neuroid_meta['indicesIn3D'][0, 0][:, 0]
            indices_in_3d = [
                indices_in_3d[voxel_num] for voxel_num in voxel_nums
            ]
            # add coords
            col_to_coords = np.array([
                neuroid_meta['colToCoord'][0, 0][voxel_num]
                for voxel_num in voxel_nums
            ])
            # put it all together
            assembly = NeuroidAssembly(
                assembly,
                coords={
                    # presentation coordinates are taken over from the reference data of this experiment
                    **{
                        coord: (dims, value)
                        for coord, dims, value in walk_coords(
                            reference_data.sel(experiment=experiment_name)['presentation'])
                    },
                    **{
                        'experiment': ('presentation', [experiment_name] * assembly.shape[0]),
                        'subject': ('neuroid', [subject] * assembly.shape[1]),
                        'voxel_num': ('neuroid', voxel_nums),
                        'atlas': ('neuroid', atlases),
                        'filter_strategy': ('neuroid', filter_strategies),
                        'atlas_selection': ('neuroid', atlas_selections),
                        'atlas_selection_lower': ('neuroid', atlas_filter_lower),
                        'roi': ('neuroid', rois),
                        'indices_in_3d': ('neuroid', indices_in_3d),
                        'col_to_coord_1': ('neuroid', col_to_coords[:, 0]),
                        'col_to_coord_2': ('neuroid', col_to_coords[:, 1]),
                        'col_to_coord_3': ('neuroid', col_to_coords[:, 2]),
                    }
                },
                dims=['presentation', 'neuroid'])
            # NOTE(review): a voxel that appears in several atlases is repeated above, so
            # ids built from subject and voxel_num alone are presumably not unique -- confirm
            assembly['neuroid_id'] = 'neuroid', _build_id(
                assembly, ['subject', 'voxel_num'])
            subject_assemblies.append(assembly)
        # first merge the experiments of this subject, then all subjects below
        assembly = merge_data_arrays(subject_assemblies)
        assemblies.append(assembly)
    _logger.debug(f"Merging {len(assemblies)} assemblies")
    assembly = merge_data_arrays(assemblies)
    assembly.attrs['version'] = version
    _logger.debug("Creating StimulusSet")
    assembly.attrs['stimulus_set'] = reference_data.stimulus_set
    return assembly
def look_at_temporal(self, stimuli):
    """
    Record model activations over time and map the model timesteps onto the recording time bins.

    The timesteps of the activations are translated into (start, end) time bins via
    `self.time_mapping` for the single recorded region. For every entry of
    `self.recording_time_bins`, the activations whose bin start is nearest to the
    requested start are selected and labelled with the requested time bin.

    :param stimuli: stimuli passed to the activations model.
    :return: the merged responses with one `time_bin` entry per recording time bin.
    :raises NotImplementedError: if the activations cover more than one region.
    """
    responses = self.activations_model(stimuli, layers=self.recording_layers)

    # map time
    if hasattr(self, 'recording_target'):
        regions = {self.recording_target}
    else:
        regions = set(responses['region'].values)
    if len(regions) > 1:
        raise NotImplementedError("cannot handle more than one simultaneous region")
    region = list(regions)[0]
    bin_starts, bin_ends = [], []
    for timestep in responses['time_step'].values:
        # timesteps without a mapping receive an empty bin and are filtered out below
        mapped_bin = self.time_mapping[region][timestep] \
            if timestep in self.time_mapping[region] else (None, None)
        bin_starts.append(mapped_bin[0])
        bin_ends.append(mapped_bin[1])
    responses['time_bin_start'] = 'time_step', bin_starts
    responses['time_bin_end'] = 'time_step', bin_ends
    responses = NeuroidAssembly(responses.rename({'time_step': 'time_bin'}))
    has_start = [not np.isnan(time_start) for time_start in responses['time_bin_start']]
    responses = responses[{'time_bin': has_start}]

    # select time
    time_responses = []
    for time_bin in tqdm(self.recording_time_bins, desc='CORnet-time to recording time'):
        if isinstance(time_bin, np.ndarray):
            time_bin = time_bin.tolist()
        time_bin_start, time_bin_end = time_bin
        nearest_start = find_nearest(responses['time_bin_start'].values, time_bin_start)
        selected = responses.sel(time_bin_start=nearest_start)
        coords = {
            coord: (dims, values)
            for coord, dims, values in walk_coords(selected)
            if coord not in ['time_bin_level_0', 'time_bin_end']
        }
        # label the selection with the requested recording time bin
        coords['time_bin_start'] = ('time_bin', [time_bin_start])
        coords['time_bin_end'] = ('time_bin', [time_bin_end])
        time_responses.append(NeuroidAssembly(selected.values, coords=coords, dims=selected.dims))
    merged = merge_data_arrays(time_responses)
    return fix_timebin_naming(merged)  # work around xarray merge bug introduced in 0.16.2