def load_time_meta():
    """Load the word-level timing annotations of all story TextGrid files.

    Every file matching ``*TextGrid*`` in the stories TextGrid directory is
    parsed, and the intervals of its ``words`` tier are turned into one
    DataArray of words along a ``time_bin`` dimension. Each word carries the
    coordinates ``filepath``, ``story``, ``time_start`` and ``time_end``.

    Returns:
        The per-file DataArrays combined with ``merge_data_arrays``.

    Raises:
        IndexError: if a TextGrid file has no tier named ``words``.
        ValueError: if a file stem is not an integer story number.
    """
    data_dir = neural_data_dir / 'StoriesData_Dec2018' / 'stories_textgridsbyJeanne'
    time_to_words = []
    for file in data_dir.glob("*TextGrid*"):
        textgrid = TextGrid.load(file)
        words = [tier for tier in textgrid.tiers if tier.nameid == 'words'][0]

        rows = defaultdict(list)
        for time_start, time_end, word in words.simple_transcript:
            rows['time_start'].append(float(time_start))
            rows['time_end'].append(float(time_end))
            rows['word'].append(word)
        num_words = len(rows['word'])

        # the file stem is the story number used to look up the story in `stories_meta`
        story_index = int(file.stem)
        story = stories_meta.sel(number=story_index).values
        story = next(iter(set(story)))  # Boar was read twice

        story_words = DataArray(rows['word'], coords={
            # One entry per word. The previous `[] * num_words` was always an
            # empty list, which cannot match the length of `time_bin`.
            'filepath': ('time_bin', [file.name] * num_words),
            'story': ('time_bin', [story] * num_words),
            'time_start': ('time_bin', rows['time_start']),
            'time_end': ('time_bin', rows['time_end']),
        }, dims=['time_bin'])
        gather_indexes(story_words)
        time_to_words.append(story_words)
    return merge_data_arrays(time_to_words)
def _sentences_per_timepoint(story, story_meta, timepoints, ignored_words):
    """Group the words of one story into one text snippet per timepoint.

    Args:
        story: name of the story; only used to apply the story-specific quick-fixes.
        story_meta: word annotations of this story, sorted by ``time_end``,
            with a ``time_bin`` dimension and a ``time_end`` coordinate.
        timepoints: ascending timepoint values (already shifted by the caller).
        ignored_words: word values that are dropped when joining a snippet.

    Returns:
        A list aligned with the leading entries of ``timepoints``: ``None`` for
        timepoints ``<= 0``, otherwise the lower-cased, space-joined words whose
        ``time_end`` lies in ``(previous word end, timepoint]`` (possibly ``''``).
        The list is shorter than ``timepoints`` if all words were consumed early.
    """
    time_ends = story_meta['time_end'].values
    final_time_end = max(time_ends)  # loop-invariant: compute once, not per timepoint
    sentences = []
    last_timepoint = -np.inf
    for timepoint in timepoints:
        if last_timepoint >= final_time_end:
            break  # every annotated word has been assigned already
        if timepoint <= 0:
            sentences.append(None)
            continue  # ignore fixation period
        in_bin = [last_timepoint < end <= timepoint for end in time_ends]
        timebin_meta = story_meta[{'time_bin': in_bin}]
        sentence = ' '.join(word.strip() for word in timebin_meta.values if word not in ignored_words)
        sentence = sentence.lower().strip()
        # quick-fixes
        if story == 'Boar' and sentence == 'interactions the the':  # Boar duplicate
            sentence = 'interactions the'
        if story == 'KingOfBirds' and sentence == 'the fact that the larger':  # missing word in TextGrid
            sentence = 'earth ' + sentence
        if story == 'MrSticky' and sentence == "worry don't worry i went extra slowly since it's":
            sentence = "don't worry i went extra slowly since it's"
        sentences.append(sentence)
        bin_ends = timebin_meta['time_end'].values
        # A bin in which no word ended used to raise IndexError on `[-1]`.
        # Keeping the previous boundary is equivalent because no word end lies
        # between it and the current timepoint.
        if len(bin_ends) > 0:
            last_timepoint = bin_ends[-1]
    return sentences


def _merge_voxel_meta(data, meta, bold_shift_seconds):
    """Annotate the voxel timepoints of each story with the words presented at that time.

    For every story present in both ``data`` and ``meta``, each timepoint value
    is shifted back by ``bold_shift_seconds`` and paired with the words whose
    ``time_end`` falls into the resulting time bin. Timepoints without any text
    are dropped and the ``timepoint*`` dimensions are renamed to ``presentation``.

    Args:
        data: recordings with a ``story`` coordinate and a ``timepoint_value``
            coordinate that can be stacked into ``timepoint``.
        meta: word annotations with ``story`` and ``time_end`` coordinates along
            a ``time_bin`` dimension.
        bold_shift_seconds: amount subtracted from every timepoint value before
            matching words (presumably to account for the BOLD response lag).

    Returns:
        The per-story arrays, each with ``stimulus_sentence`` and ``story``
        coordinates on ``presentation``, combined with ``merge_data_arrays``.

    Raises:
        AssertionError: if timepoints are not sorted or, for stories without
            quick-fixes, the collected text does not equal the annotated text.
    """
    meta_stories = set(meta['story'].values)
    data_stories = set(data['story'].values)
    data_missing = meta_stories - data_stories
    if data_missing:
        warnings.warn(f"Stories missing from the data: {data_missing}")
    meta_missing = data_stories - meta_stories
    if meta_missing:
        warnings.warn(f"Stories missing from the meta: {meta_missing}")

    ignored_words = [None, '', '<s>', '</s>', '<s']
    quick_fixed_stories = ['Boar', 'KingOfBirds', 'MrSticky']
    annotated_data = []
    for story in tqdm(ordered_set(data['story'].values), desc='merge meta'):
        if story not in meta_stories:
            continue
        story_meta = meta.sel(story=story)
        story_meta = story_meta.sortby('time_end')

        story_data = data.sel(story=story).stack(timepoint=['timepoint_value'])
        story_data = story_data.sortby('timepoint_value')
        timepoints = story_data['timepoint_value'].values.tolist()
        assert is_sorted(timepoints), f"timepoints of story {story} are not sorted"
        timepoints = [timepoint - bold_shift_seconds for timepoint in timepoints]

        sentences = _sentences_per_timepoint(story, story_meta, timepoints, ignored_words)
        # keep only timepoints with text (drops both None and '')
        sentence_index = [i for i, sentence in enumerate(sentences) if sentence]
        sentences = np.array(sentences)[sentence_index]

        if story not in quick_fixed_stories:  # ignore quick-fixes
            annotated_sentence = ' '.join(sentences)
            meta_sentence = ' '.join(
                word.strip() for word in story_meta.values if word not in ignored_words
            ).lower().strip()
            assert annotated_sentence == meta_sentence, \
                f"collected text of story {story} differs from its annotation"

        # re-interpret timepoints as stimuli
        coords = {}
        for coord_name, dims, coord_value in walk_coords(story_data):
            dims = [dim if not dim.startswith('timepoint') else 'presentation' for dim in dims]
            # discard the timepoints for which the stimulus did not change (empty word)
            if array_is_element(dims, 'presentation'):
                coord_value = coord_value[sentence_index]
            coords[coord_name] = dims, coord_value
        coords = {**coords, 'stimulus_sentence': ('presentation', sentences)}

        story_data = story_data[{
            dim: slice(None) if dim != 'timepoint' else sentence_index
            for dim in story_data.dims
        }]
        dims = [dim if not dim.startswith('timepoint') else 'presentation' for dim in story_data.dims]
        story_data = xr.DataArray(story_data.values, coords=coords, dims=dims)
        story_data['story'] = 'presentation', [story] * len(story_data['presentation'])
        gather_indexes(story_data)
        annotated_data.append(story_data)
    annotated_data = merge_data_arrays(annotated_data)
    return annotated_data
def load_voxel_data(bold_shift_seconds=4):
    """Load the filtered voxel timepoints annotated with the words of each story.

    Args:
        bold_shift_seconds: passed through to ``_merge_voxel_meta``.

    Returns:
        The result of ``_merge_voxel_meta`` applied to the filtered voxel
        timepoints and the word timing annotations from ``load_time_meta``.
    """
    voxel_timepoints = load_filtered_voxel_timepoints()
    gather_indexes(voxel_timepoints)
    word_timings = load_time_meta()
    return _merge_voxel_meta(voxel_timepoints, word_timings, bold_shift_seconds)
def load_filtered_voxel_timepoints():
    """Load the voxel timepoints restricted to one threshold and subject group.

    Selects ``threshold='from90to100'``, re-gathers the indexes, then selects
    ``subject_nStories=8``.

    Returns:
        The selected subset of ``load_voxel_timepoints()``.
    """
    thresholded = load_voxel_timepoints().sel(threshold='from90to100')
    # the second selection runs only after the indexes have been gathered again
    gather_indexes(thresholded)
    return thresholded.sel(subject_nStories=8)
'time_with_fixation': ('story', [338, 318, 394, 396, 302, 410, 348, 394, 388, 422, 338]), 'time_without_fixation': ('story', [ 5 * 60 + 6, 4 * 60 + 46, 6 * 60 + 2, 6 * 60 + 4, 4 * 60 + 30, 6 * 60 + 18, 5 * 60 + 16, 6 * 60 + 2, 5 * 60 + 56, 6 * 60 + 30, 5 * 60 + 6 ]), 'recording_timepoints': ('story', [169, 159, 197, 198, 151, 205, 174, 197, 194, 211, 169]) }, dims=['story']) stories_meta['story_index'] = 'story', [ ".".join([str(value) for value in values]) for values in zip( *[stories_meta[coord].values for coord in ['story_name', 'reader']]) ] gather_indexes(stories_meta) def _merge_voxel_meta(data, meta, bold_shift_seconds): data_missing = set(meta['story'].values) - set(data['story'].values) if data_missing: warnings.warn(f"Stories missing from the data: {data_missing}") meta_missing = set(data['story'].values) - set(meta['story'].values) if meta_missing: warnings.warn(f"Stories missing from the meta: {meta_missing}") ignored_words = [None, '', '<s>', '</s>', '<s'] annotated_data = [] for story in tqdm(ordered_set(data['story'].values), desc='merge meta'): if story not in meta['story'].values: continue