Example #1
def read_words(candidate, stimulus_set, reset_column='sentence_id', copy_columns=(), average_sentence=False):
    """
    Pass a `stimulus_set` through a model `candidate`.
    In contrast to the `listen_to` function, this function operates on a word-based `stimulus_set`.
    """
    # Input: `stimulus_set` is a pandas DataFrame with a sentence-ID column (`reset_column`) and a 'word' column.
    activations = []
    for i, reset_id in enumerate(ordered_set(stimulus_set[reset_column].values)):
        part_stimuli = stimulus_set[stimulus_set[reset_column] == reset_id]
        sentence_stimuli = StimulusSet({'sentence': ' '.join(part_stimuli['word']),
                                        reset_column: list(set(part_stimuli[reset_column]))})
        sentence_stimuli.name = f"{stimulus_set.name}-{reset_id}"
        sentence_activations = candidate(stimuli=sentence_stimuli, average_sentence=average_sentence)
        for column in copy_columns:
            sentence_activations[column] = ('presentation', part_stimuli[column])
        activations.append(sentence_activations)
    model_activations = merge_data_arrays(activations)
    # merging does not maintain stimulus order. the following orders again
    idx = [model_activations['stimulus_id'].values.tolist().index(stimulus_id) for stimulus_id in
           itertools.chain.from_iterable(s['stimulus_id'].values for s in activations)]
    assert len(set(idx)) == len(idx), "Found duplicate indices to order activations"
    model_activations = model_activations[{'presentation': idx}]

    return model_activations
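
A hypothetical usage sketch for `read_words` follows; the `StimulusSet` construction mirrors the code above, while the `candidate` model wrapper (and any import paths) are assumptions not shown in this excerpt.

# Hypothetical usage sketch: `candidate` stands in for a model wrapper that returns one
# activation row per word (a 'presentation' dimension with a 'stimulus_id' coordinate).
import pandas as pd

words = pd.DataFrame({
    'sentence_id': [1, 1, 1, 2, 2],
    'word': ['ALEX', 'WAS', 'TIRED', 'SHE', 'SLEPT'],
    'stimulus_id': [0, 1, 2, 3, 4],
})
word_stimuli = StimulusSet(words)  # StimulusSet as used in read_words above
word_stimuli.name = 'toy-word-stimuli'

activations = read_words(candidate, word_stimuli,
                         reset_column='sentence_id',
                         copy_columns=['stimulus_id', 'word'])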
Example #2
def _load_rdms(self, roi_filter='from90to100', bold_shift_seconds=4):
    """Load per-story RDM assemblies, average them over subjects, and attach the corresponding stimulus sets."""
    assemblies = {}
    for story in ['Boar', 'KingOfBirds', 'Elvis', 'HighSchool', 'MatchstickSeller']:
        assembly = load_rdm_sentences(story=story, roi_filter=roi_filter, bold_shift_seconds=bold_shift_seconds)
        assembly = assembly.mean(dim='subject')
        stimulus_set_identifier = f'naturalistic-neural-reduced.{story}'
        stimulus_set = load_stimuli(stimulus_set_identifier)
        stimulus_set = StimulusSet({'sentence': stimulus_set})
        stimulus_set.name = stimulus_set_identifier
        assembly.attrs['stimulus_set'] = stimulus_set
        assemblies[story] = assembly
    return assemblies
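
A hypothetical sketch of how the returned dictionary might be consumed; `benchmark` is a stand-in for whatever object this method is defined on, which the excerpt does not show.

# Hypothetical sketch: `benchmark` is a placeholder for the object owning _load_rdms.
rdm_assemblies = benchmark._load_rdms(roi_filter='from90to100', bold_shift_seconds=4)
for story, assembly in rdm_assemblies.items():
    stimulus_set = assembly.attrs['stimulus_set']
    print(story, assembly.dims, len(stimulus_set))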
Example #3
def read_words(candidate, stimulus_set):  # newer variant of the listen_to_stories function
    # Input: `stimulus_set` is a pandas DataFrame with a 'sentence_id' column and a 'word' column.
    activations = []
    for i, sentence_id in enumerate(ordered_set(stimulus_set['sentence_id'].values)):
        sentence_stimuli = stimulus_set[stimulus_set['sentence_id'] == sentence_id]
        sentence_stimuli = StimulusSet({'sentence': ' '.join(sentence_stimuli['word']),
                                        'sentence_id': list(set(sentence_stimuli['sentence_id']))})
        sentence_stimuli.name = f"{stimulus_set.name}-{sentence_id}"
        sentence_activations = candidate(stimuli=sentence_stimuli)
        # Note: assumes every sentence consists of exactly 8 words (as in the Fedorenko2016 stimuli).
        sentence_activations['stimulus_id'] = ('presentation', 8 * i + np.arange(0, 8))
        sentence_activations['sentence_id'] = ('presentation', [sentence_id] * 8)
        activations.append(sentence_activations)
    model_activations = merge_data_arrays(activations)
    # merging does not maintain stimulus order. the following orders again
    idx = [model_activations['stimulus_id'].values.tolist().index(stimulus_id) for stimulus_id in
           itertools.chain.from_iterable(s['stimulus_id'].values for s in activations)]
    assert len(set(idx)) == len(idx), "Found duplicate indices to order activations"
    model_activations = model_activations[{'presentation': idx}]
    
    return model_activations
Example #4
def load_naturalStories():
    ressources_dir = Path(__file__).parent.parent.parent / 'ressources'
    data_path = ressources_dir / 'neural_data' / 'naturalstories_RTS'
    data_file = data_path / 'processed_RTs.csv'
    _logger.info(f'Data file: {data_file}')

    # get data
    data = pd.read_csv(data_file)

    # get unique (item, zone) word-identifier tuples, ordered by story and word position
    item_ID = np.array(data['item'])
    zone_ID = np.array(data['zone'])
    zpd_lst = list(zip(item_ID, zone_ID))
    unique_zpd_lst = list(set(zpd_lst))
    unique_zpd_lst = sorted(unique_zpd_lst, key=lambda tup: (tup[0], tup[1]))

    # get unique WorkerIds
    subjects = data.WorkerId.unique()

    # ====== create matrix ======
    r_dim = len(unique_zpd_lst)
    c_dim = len(subjects)

    # default value for a subject's not having an RT for a story/word is NaN
    matrix = np.empty((r_dim, c_dim))
    matrix[:] = np.nan

    # set row and column indices for matrix
    r_indices = {unique_zpd_lst[i]: i for i in range(r_dim)}
    c_indices = {subjects[i]: i for i in range(c_dim)}

    # populate meta information dictionary for subjects xarray dimension
    metaInfo_subjects = {}

    for index, d in tqdm(data.iterrows(), total=len(data), desc='indices'):
        r = r_indices[(d['item'], d['zone'])]
        c = c_indices[d['WorkerId']]
        matrix[r][c] = d['RT']
        key = d['WorkerId']
        if key not in metaInfo_subjects:
            metaInfo_subjects[key] = (d['correct'], d['WorkTimeInSeconds'])

    matrix = np.array(matrix)

    # get subjects' metadata
    correct_meta = [v[0] for v in metaInfo_subjects.values()]
    WorkTimeInSeconds_meta = [v[1] for v in metaInfo_subjects.values()]

    # get metadata for presentation dimension
    word_df = pd.read_csv(f'{data_path}/all_stories.tok', sep='\t')
    voc_item_ID = np.array(word_df['item'])
    voc_zone_ID = np.array(word_df['zone'])
    voc_word = np.array(word_df['word'])

    # get sentence_IDs (finds 481 sentences)
    sentence_ID = []
    idx = 1
    for i, elm in enumerate(voc_word):
        sentence_ID.append(idx)
        if elm.endswith((".", "?", "!", ".'", "?'", "!'", ";'")):
            if i + 1 < len(voc_word):
                if not (voc_word[i + 1].islower() or voc_word[i] == "Mr."):
                    idx += 1

    # get IDs of words within a sentence
    word_within_a_sentence_ID = []
    idx = 0
    for i, elm in enumerate(voc_word):
        idx += 1
        word_within_a_sentence_ID.append(idx)
        if elm.endswith((".", "?", "!", ".'", "?'", "!'", ";'")):
            if i + 1 < len(voc_word):
                if not (voc_word[i + 1].islower() or voc_word[i] == "Mr."):
                    idx = 0
            else:
                idx = 0

    # stimulus_ID
    stimulus_ID = list(range(1, len(voc_word) + 1))

    # set df_stimulus_set for attributes
    df_stimulus_set = word_df[['word', 'item', 'zone']]
    df_stimulus_set = StimulusSet(df_stimulus_set)
    df_stimulus_set['story_id'] = df_stimulus_set['item']
    df_stimulus_set['stimulus_id'] = stimulus_ID
    df_stimulus_set['sentence_id'] = sentence_ID
    df_stimulus_set['word_id'] = voc_zone_ID
    df_stimulus_set['word_within_sentence_id'] = word_within_a_sentence_ID
    df_stimulus_set.name = 'naturalStories'

    # build xarray
    # voc_word = word
    # voc_item_ID = index of the story (1-10)
    # voc_zone_ID = index of the word within its story
    # word_within_a_sentence_ID = index of the word within its sentence
    # sentence_ID = index of the sentence the word belongs to
    # stimulus_ID = unique index of the word across all stories
    # subjects = WorkerIds
    # correct_meta = number of correct answers on the comprehension questions
    assembly = xr.DataArray(matrix,
                            dims=('presentation', 'subjects'),
                            coords={
                                'word': ('presentation', voc_word),
                                'story_id': ('presentation', voc_item_ID),
                                'word_id': ('presentation', voc_zone_ID),
                                'word_within_sentence_id':
                                ('presentation', word_within_a_sentence_ID),
                                'sentence_id': ('presentation', sentence_ID),
                                'stimulus_id': ('presentation', stimulus_ID),
                                'subject_id': ('subjects', subjects),
                                'correct': ('subjects', correct_meta),
                                'WorkTimeInSeconds':
                                ('subjects', WorkTimeInSeconds_meta)
                            })

    assembly.attrs['stimulus_set'] = df_stimulus_set  # Add the stimulus_set dataframe
    return assembly
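
The sentence-boundary heuristic used above can be exercised in isolation; the following is a minimal, self-contained sketch with a made-up token list.

import numpy as np

# Toy token list standing in for `voc_word`; boundaries are detected from sentence-final punctuation,
# except after the abbreviation "Mr." or when the next token is lowercase.
voc_word = np.array(["Mr.", "Smith", "left.", "He", "said", "'no.'", "Then", "what?"])

sentence_ID = []
idx = 1
for i, elm in enumerate(voc_word):
    sentence_ID.append(idx)
    if elm.endswith((".", "?", "!", ".'", "?'", "!'", ";'")):
        if i + 1 < len(voc_word):
            if not (voc_word[i + 1].islower() or voc_word[i] == "Mr."):
                idx += 1

print(sentence_ID)  # [1, 1, 1, 2, 2, 2, 3, 3]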
Example #5
def _align_stimuli_recordings(stimulus_set, assembly):
    aligned_stimulus_set = []
    partial_sentences = assembly['stimulus_sentence'].values
    partial_sentences = [
        compare_ignore(sentence) for sentence in partial_sentences
    ]
    assembly_stimset = {}
    stimulus_set_index = 0

    stories = ordered_set(assembly['story'].values.tolist())
    for story in tqdm(sorted(stories),
                      desc='align stimuli',
                      total=len(stories)):
        story_partial_sentences = [
            (sentence, i) for i, (sentence, sentence_story) in enumerate(
                zip(partial_sentences, assembly['story'].values))
            if sentence_story == story
        ]

        story_stimuli = stimulus_set[stimulus_set['story'] == story]
        stimuli_story = ' '.join(story_stimuli['sentence'])
        stimuli_story_sentence_starts = [0] + [
            len(sentence) + 1 for sentence in story_stimuli['sentence']
        ]
        stimuli_story_sentence_starts = np.cumsum(
            stimuli_story_sentence_starts)
        assert ' '.join(s
                        for s, i in story_partial_sentences) == compare_ignore(
                            stimuli_story)
        stimulus_index = 0
        Stimulus = namedtuple(
            'Stimulus', ['story', 'sentence', 'sentence_num', 'sentence_part'])
        sentence_parts = defaultdict(lambda: 0)
        for partial_sentence, assembly_index in story_partial_sentences:
            full_partial_sentence = ''
            partial_sentence_index = 0
            while partial_sentence_index < len(partial_sentence) \
                    or stimulus_index < len(stimuli_story) \
                    and stimuli_story[stimulus_index] in compare_characters + [' ']:
                if partial_sentence_index < len(partial_sentence) \
                        and partial_sentence[partial_sentence_index].lower() \
                        == stimuli_story[stimulus_index].lower():
                    full_partial_sentence += stimuli_story[stimulus_index]
                    stimulus_index += 1
                    partial_sentence_index += 1
                elif stimuli_story[stimulus_index] in compare_characters + [
                        ' '
                ]:
                    # Potential issue: opening quotation marks (') are appended to the current
                    # sentence instead of the next one. For now we assume this does not cause problems.
                    full_partial_sentence += stimuli_story[stimulus_index]
                    stimulus_index += 1
                elif stimuli_story[stimulus_index] == '-':
                    full_partial_sentence += '-'
                    stimulus_index += 1
                    if partial_sentence[partial_sentence_index] == ' ':
                        partial_sentence_index += 1
                else:
                    raise NotImplementedError()
            sentence_num = next(
                index
                for index, start in enumerate(stimuli_story_sentence_starts)
                if start >= stimulus_index) - 1
            sentence_part = sentence_parts[sentence_num]
            sentence_parts[sentence_num] += 1
            row = Stimulus(sentence=full_partial_sentence,
                           story=story,
                           sentence_num=sentence_num,
                           sentence_part=sentence_part)
            aligned_stimulus_set.append(row)
            assembly_stimset[assembly_index] = stimulus_set_index
            stimulus_set_index += 1
        # check
        aligned_story = "".join(row.sentence for row in aligned_stimulus_set
                                if row.story == story)
        assert aligned_story == stimuli_story
    # build StimulusSet
    aligned_stimulus_set = StimulusSet(aligned_stimulus_set)
    aligned_stimulus_set['stimulus_id'] = [
        ".".join([str(value) for value in values]) for values in zip(*[
            aligned_stimulus_set[coord].values
            for coord in ['story', 'sentence_num', 'sentence_part']
        ])
    ]
    aligned_stimulus_set.name = f"{stimulus_set.name}-aligned"

    # align assembly
    alignment = [
        stimset_idx
        for assembly_idx, stimset_idx in sorted(assembly_stimset.items(),
                                                key=operator.itemgetter(0))
    ]
    assembly_coords = {
        **{
            coord: (dims, values)
            for coord, dims, values in walk_coords(assembly)
        },
        **{
            'stimulus_id': ('presentation', aligned_stimulus_set['stimulus_id'].values[alignment]),
            'meta_sentence': ('presentation', assembly['stimulus_sentence'].values),
            'stimulus_sentence': ('presentation', aligned_stimulus_set['sentence'].values[alignment])
        }
    }
    assembly = type(assembly)(assembly.values,
                              coords=assembly_coords,
                              dims=assembly.dims)

    return aligned_stimulus_set, assembly
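
For reference, the `stimulus_id` values built above follow a "<story>.<sentence_num>.<sentence_part>" scheme; a toy illustration:

# Toy illustration of the stimulus_id scheme: "<story>.<sentence_num>.<sentence_part>"
values = [('Boar', 0, 0), ('Boar', 0, 1), ('Boar', 1, 0)]
stimulus_ids = [".".join(str(value) for value in vals) for vals in values]
print(stimulus_ids)  # ['Boar.0.0', 'Boar.0.1', 'Boar.1.0']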
Example #6
def load_Pereira2018():
    data_dir = neural_data_dir / "Pereira2018"
    experiment2, experiment3 = "243sentences.mat", "384sentences.mat"
    stimuli = {}  # experiment -> stimuli
    assemblies = []
    subject_directories = [d for d in data_dir.iterdir() if d.is_dir()]
    for subject_directory in tqdm(subject_directories, desc="subjects"):
        for experiment_filename in [experiment2, experiment3]:
            data_file = subject_directory / f"examples_{experiment_filename}"
            if not data_file.is_file():
                _logger.debug(
                    f"{subject_directory} does not contain {experiment_filename}"
                )
                continue
            data = scipy.io.loadmat(str(data_file))

            # assembly
            assembly = data['examples']
            meta = data['meta']
            assembly = NeuroidAssembly(
                assembly,
                coords={
                    'experiment': ('presentation',
                                   [os.path.splitext(experiment_filename)[0]] * assembly.shape[0]),
                    'stimulus_num': ('presentation', list(range(assembly.shape[0]))),
                    'passage_index': ('presentation', data['labelsPassageForEachSentence'][:, 0]),
                    'passage_label': ('presentation', [
                        data['keyPassages'][index - 1, 0][0]
                        for index in data['labelsPassageForEachSentence'][:, 0]
                    ]),
                    'passage_category': ('presentation', [
                        data['keyPassageCategory'][0, data['labelsPassageCategory'][index - 1, 0] - 1][0][0]
                        for index in data['labelsPassageForEachSentence']
                    ]),
                    'subject': ('neuroid', [subject_directory.name] * assembly.shape[1]),
                    'voxel_num': ('neuroid', list(range(assembly.shape[1]))),
                    'AAL_roi_index': ('neuroid', meta[0][0]['roiMultimaskAAL'][:, 0]),
                },
                dims=['presentation', 'neuroid'])
            stimulus_id = _build_id(assembly, ['experiment', 'stimulus_num'])
            assembly['stimulus_id'] = 'presentation', stimulus_id
            # set story for compatibility
            assembly['story'] = 'presentation', _build_id(
                assembly, ['experiment', 'passage_category'])
            assembly['neuroid_id'] = 'neuroid', _build_id(
                assembly, ['subject', 'voxel_num'])
            assemblies.append(assembly)

            # stimuli
            if experiment_filename not in stimuli:
                sentences = data['keySentences']
                sentences = [sentence[0] for sentence in sentences[:, 0]]
                stimuli[experiment_filename] = {
                    'sentence': sentences,
                    'sentence_num': list(range(len(sentences))),
                    'stimulus_id': stimulus_id,
                    'experiment': assembly['experiment'].values,
                    'story': assembly['story'].values,
                }
                for copy_coord in ['experiment', 'story', 'passage_index', 'passage_label', 'passage_category']:
                    stimuli[experiment_filename][copy_coord] = assembly[copy_coord].values

    _logger.debug(f"Merging {len(assemblies)} assemblies")
    assembly = merge_data_arrays(assemblies)

    _logger.debug("Creating StimulusSet")
    combined_stimuli = {}
    for key in stimuli[experiment2]:
        combined_stimuli[key] = np.concatenate(
            (stimuli[experiment2][key], stimuli[experiment3][key]))
    stimuli = StimulusSet(combined_stimuli)
    stimuli.name = "Pereira2018"
    assembly.attrs['stimulus_set'] = stimuli
    return assembly
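
A hypothetical usage sketch, assuming the Pereira2018 .mat files are present under `neural_data_dir`; the experiment name used for masking matches the filenames above.

# Hypothetical usage sketch; requires the Pereira2018 data to be available locally.
assembly = load_Pereira2018()
print(assembly.dims)  # ('presentation', 'neuroid')

# Select only the 243-sentence experiment, using the same integer-indexing idiom as above.
idx = np.flatnonzero(assembly['experiment'].values == '243sentences')
experiment2_assembly = assembly[{'presentation': idx}]
print(experiment2_assembly.shape)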
Example #7
def load_Fedorenko2016(electrodes, version):
    ressources_dir = Path(__file__).parent.parent.parent / 'ressources'
    neural_data_dir = ressources_dir / 'neural_data' / 'ecog-Fedorenko2016/'
    stim_data_dir = ressources_dir / 'stimuli' / 'sentences_8'
    _logger.info(f'Neural data directory: {neural_data_dir}')
    filepaths_stim = glob(os.path.join(stim_data_dir, '*.txt'))

    # ECoG
    data = None

    # For language responsive electrodes:
    if electrodes == 'language':

        # Create a subject ID list corresponding to language electrodes
        subject1 = np.repeat(1, 47)
        subject2 = np.repeat(2, 9)
        subject3 = np.repeat(3, 9)
        subject4 = np.repeat(4, 15)
        subject5 = np.repeat(5, 18)

        if version == 1:
            filepath_neural = glob(os.path.join(neural_data_dir, '*ecog.mat'))

        if version == 2:
            filepath_neural = glob(os.path.join(neural_data_dir, '*metadata_lang.mat'))
            
        if version == 3:
            subject1 = np.repeat(1, 47)
            subject2 = np.repeat(2, 8)
            subject3 = np.repeat(3, 9)
            subject4 = np.repeat(4, 15)
            subject5 = np.repeat(5, 18)
            
            filepath_neural = glob(os.path.join(neural_data_dir, '*g_lang_v3.mat'))
            
        if version == 4:            
            subject1 = np.repeat(1, 49)
            subject2 = np.repeat(2, 8)
            subject3 = np.repeat(3, 10)
            subject4 = np.repeat(4, 16)
            subject5 = np.repeat(5, 19)            
            subject6 = np.repeat(6, 3)
            
            filepath_neural = glob(os.path.join(neural_data_dir, '*g_lang_v4.mat'))

        _logger.debug(f'Running Fedorenko2016 benchmark with language responsive electrodes, data version: {version}')

    # For non-noisy electrodes
    if electrodes == 'all':

        # Create a subject ID list corresponding to all electrodes
        subject1 = np.repeat(1, 70)
        subject2 = np.repeat(2, 35)
        subject3 = np.repeat(3, 20)
        subject4 = np.repeat(4, 29)
        subject5 = np.repeat(5, 26)

        if version == 1:
            filepath_neural = glob(os.path.join(neural_data_dir, '*ecog_all.mat'))

        if version == 2:
            filepath_neural = glob(os.path.join(neural_data_dir, '*metadata_all.mat'))
            
        if version == 3:
            subject1 = np.repeat(1, 67)
            subject2 = np.repeat(2, 35)
            subject3 = np.repeat(3, 20)
            subject4 = np.repeat(4, 29)
            subject5 = np.repeat(5, 26)
            
            filepath_neural = glob(os.path.join(neural_data_dir, '*all_v3.mat'))
            
        if version == 4:
            subject1 = np.repeat(1, 63)
            subject2 = np.repeat(2, 35)
            subject3 = np.repeat(3, 21)
            subject4 = np.repeat(4, 29)
            subject5 = np.repeat(5, 27)
            subject6 = np.repeat(6, 9)
            
            filepath_neural = glob(os.path.join(neural_data_dir, '*all_v4.mat'))

        _logger.debug(f'Running Fedorenko2016 benchmark with non-noisy electrodes, data version: {version}')

    # For non-language electrodes
    if electrodes == 'non-language':
        
        if version == 1 or version == 2:
            filepath_neural = glob(os.path.join(neural_data_dir, '*nonlang.mat'))
    
            # Create a subject ID list corresponding to non-language electrodes
            subject1 = np.repeat(1, 28)
            subject2 = np.repeat(2, 31)
            subject3 = np.repeat(3, 14)
            subject4 = np.repeat(4, 19)
            subject5 = np.repeat(5, 16)
        
        if version == 3:
            filepath_neural = glob(os.path.join(neural_data_dir, '*nonlang_v3.mat'))
    
            # Create a subject ID list corresponding to non-language electrodes
            subject1 = np.repeat(1, 25) # 47 lang selective,
            subject2 = np.repeat(2, 31)
            subject3 = np.repeat(3, 14)
            subject4 = np.repeat(4, 19)
            subject5 = np.repeat(5, 16) # 10 lang electrodes in the non-noisy
            
        if version == 4:
            filepath_neural = glob(os.path.join(neural_data_dir, '*nonlang_v4.mat'))
    
            # Create a subject ID list corresponding to non-language electrodes
            subject1 = np.repeat(1, 22) 
            subject2 = np.repeat(2, 31)
            subject3 = np.repeat(3, 15)
            subject4 = np.repeat(4, 19)
            subject5 = np.repeat(5, 18) 
            subject6 = np.repeat(6, 6) 


        _logger.debug(f'Running Fedorenko2016 benchmark with non-language electrodes, data version: {version}')

    ecog_mat = sio.loadmat(filepath_neural[0])
    ecog_mtrix = ecog_mat['ecog']

    if version == 1:  # Manually z-score the version 1 data
        ecog_z = stats.zscore(ecog_mtrix, 1)
    if version in (2, 3, 4):  # versions 2-4 are used as-is
        ecog_z = ecog_mtrix

    ecog_mtrix_T = np.transpose(ecog_z)

    num_words = list(range(np.shape(ecog_mtrix_T)[0]))
    new_sent_idx = num_words[::8]

    # Average across word representations
    sent_avg_ecog = []
    for i in new_sent_idx:
        eight_words = ecog_mtrix_T[i:i + 8, :]
        sent_avg = np.mean(eight_words, 0)
        sent_avg_ecog.append(sent_avg)

    # Stimuli
    for filepath in filepaths_stim:
        with open(filepath, 'r') as file1:
            f1 = file1.readlines()

        _logger.debug(f1)

        sentences = []
        sentence_words, word_nums = [], []
        for sentence in f1:
            sentence = sentence.split(' ')
            sentences.append(sentence)
            word_counter = 0

            for word in sentence:
                if word == '\n':
                    continue
                word = word.rstrip('\n')
                sentence_words.append(word)
                word_nums.append(word_counter)
                word_counter += 1

        _logger.debug(sentence_words)

    # Create sentence-ID list: 52 sentences, each consisting of 8 words
    sentence_lst = list(range(0, 52))
    sentenceID = np.repeat(sentence_lst, 8)
    
    if version == 1 or version == 2 or version == 3:
        subjectID = np.concatenate([subject1, subject2, subject3, subject4, subject5], axis=0)

    if version == 4:
        subjectID = np.concatenate([subject1, subject2, subject3, subject4, subject5, subject6], axis=0)

    # Create a list for each word number
    word_number = list(range(np.shape(ecog_mtrix_T)[0]))

    # Add a pd df as the stimulus_set
    zipped_lst = list(zip(sentenceID, word_number, sentence_words))
    df_stimulus_set = StimulusSet(zipped_lst, columns=['sentence_id', 'stimulus_id', 'word'])
    df_stimulus_set.name = 'Fedorenko2016.ecog'

    # xarray
    electrode_numbers = list(range(np.shape(ecog_mtrix_T)[1]))
    assembly = xr.DataArray(ecog_mtrix_T,
                            dims=('presentation', 'neuroid'),
                            coords={'stimulus_id': ('presentation', word_number),
                                    'word': ('presentation', sentence_words),
                                    'word_num': ('presentation', word_nums),
                                    'sentence_id': ('presentation', sentenceID),
                                    'electrode': ('neuroid', electrode_numbers),
                                    'neuroid_id': ('neuroid', electrode_numbers),
                                    'subject_UID': ('neuroid', subjectID),  # Name is subject_UID for consistency
                                    })

    assembly.attrs['stimulus_set'] = df_stimulus_set  # Add the stimulus_set dataframe
    # `data` is always None at this point, so this simply wraps `assembly`.
    data = assembly if data is None else xr.concat([data, assembly], dim='presentation')
    return NeuroidAssembly(data)
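
A hypothetical usage sketch, assuming the ECoG .mat files and the sentences_8 stimuli are available under the ressources directory described above.

# Hypothetical usage sketch; requires the Fedorenko2016 ECoG data to be available locally.
assembly = load_Fedorenko2016(electrodes='language', version=4)
print(assembly.dims)  # ('presentation', 'neuroid')

# Words are grouped into 52 sentences of 8 words each; select the first sentence.
first_sentence = assembly[{'presentation': list(range(8))}]
print(first_sentence['word'].values)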