def save_stim_filename(stimulus):
    """ Given a pliers stimulus object, create a hash, filename, and save.
        If type is TextStim or ComplexTextStim, return content rather than path. """
    if isinstance(stimulus, TextStim):
        stimulus = ComplexTextStim(text=stimulus.data)

    stim_hash = hash_stim(stimulus)

    if isinstance(stimulus, ComplexTextStim):
        return stim_hash, None, stimulus.data
    else:
        basepath = Path(current_app.config['STIMULUS_DIR']).absolute()

        stim_types = {ImageStim: '.png',
                      VideoFrameStim: '.png',
                      VideoStim: '.mkv',
                      AudioStim: '.wav'}

        ext = [e for c, e in stim_types.items() if isinstance(stimulus, c)][0]
        path = (basepath / stim_hash).with_suffix(ext)

        path.parents[0].mkdir(exist_ok=True)
        stimulus.save(path.as_posix())

        return stim_hash, path, None

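# Hypothetical usage sketch (not part of the original module): assumes a Flask
# application context in which current_app.config['STIMULUS_DIR'] points to a
# writable directory, and that the pliers stim classes and hash_stim used above
# are importable. Non-text stimuli come back as (hash, path, None); text
# stimuli come back as (hash, None, content).
#
#   with app.app_context():
#       stim_hash, path, content = save_stim_filename(ImageStim('apple.jpg'))
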
def test_word_counter_extractor():
    stim_txt = ComplexTextStim(text='This is a text where certain words occur'
                                    ' again and again Sometimes they are '
                                    'lowercase sometimes they are uppercase '
                                    'There are also words that may look '
                                    'different but they come from the same '
                                    'lemma Take a word like text and its '
                                    'plural texts Oh words')
    stim_with_onsets = ComplexTextStim(
        filename=join(TEXT_DIR, 'complex_stim_with_repetitions.txt'))

    ext = WordCounterExtractor()
    result_stim_txt = ext.transform(stim_txt).to_df()
    result_stim_with_onsets = ext.transform(stim_with_onsets).to_df()

    assert result_stim_txt.shape[0] == 45
    assert all(result_stim_txt['word_count'] >= 1)
    assert result_stim_txt['word_count'][15] == 2
    assert result_stim_txt['word_count'][44] == 3

    assert result_stim_with_onsets.shape[0] == 8
    assert result_stim_with_onsets['onset'][2] == 0.8
    assert result_stim_with_onsets['duration'][2] == 0.1
    assert result_stim_with_onsets['word_count'][2] == 2
    assert result_stim_with_onsets['word_count'][5] == 2
    assert result_stim_with_onsets['word_count'][7] == 1

    ext2 = WordCounterExtractor(log_scale=True)
    result_stim_txt = ext2.transform(stim_txt).to_df()
    assert all(result_stim_txt['log_word_count'] >= 0)
    assert result_stim_txt['log_word_count'][15] == np.log(2)
    assert result_stim_txt['log_word_count'][44] == np.log(3)

def test_word_stemming_filter():
    stim = ComplexTextStim(join(TEXT_DIR, 'sample_text.txt'),
                           columns='to', default_duration=1)

    # With all defaults (porter stemmer)
    filt = WordStemmingFilter()
    assert isinstance(filt.stemmer, nls.PorterStemmer)
    stemmed = filt.transform(stim)
    stems = [s.text for s in stemmed]
    target = ['some', 'sampl', 'text', 'for', 'test', 'annot']
    assert stems == target

    # Try a different stemmer
    filt = WordStemmingFilter(stemmer='snowball', language='english')
    assert isinstance(filt.stemmer, nls.SnowballStemmer)
    stemmed = filt.transform(stim)
    stems = [s.text for s in stemmed]
    assert stems == target

    # Handles StemmerI stemmer
    stemmer = nls.SnowballStemmer(language='english')
    filt = WordStemmingFilter(stemmer=stemmer)
    stemmed = filt.transform(stim)
    stems = [s.text for s in stemmed]
    assert stems == target

    # Try lemmatization filter
    try:
        nltk.find('taggers/universal_tagset')
    except LookupError:
        nltk.download('universal_tagset')
    try:
        nltk.find('corpora/wordnet')
    except LookupError:
        nltk.download('wordnet')
    stim = ComplexTextStim(text='These are tests for Stemming filters')
    filt = WordStemmingFilter(stemmer='wordnet')
    lemmatized = filt.transform(stim)
    lemmas = [l.text for l in lemmatized]
    target = ['these', 'be', 'test', 'for', 'stem', 'filter']
    assert lemmas == target

    # Try case sensitive
    filt = WordStemmingFilter(stemmer='wordnet', case_sensitive=True)
    lemmatized = filt.transform(stim)
    lemmas = [l.text for l in lemmatized]
    target = ['These', 'be', 'test', 'for', 'Stemming', 'filter']
    assert lemmas == target

    # Fails on invalid values
    with pytest.raises(ValueError):
        filt = WordStemmingFilter(stemmer='nonexistent_stemmer')

    # Try a long text stim
    stim2 = TextStim(text='theres something happening here')
    filt = WordStemmingFilter()
    assert filt.transform(stim2).text == 'there someth happen here'

def test_bert_sequence_extractor():
    stim = ComplexTextStim(text='This is not a tokenized sentence.')
    stim_file = ComplexTextStim(join(TEXT_DIR, 'sentence_with_header.txt'))

    ext_pooler = BertSequenceEncodingExtractor(return_special='pooler_output')

    # Test correct behavior when setting return_special
    assert ext_pooler.pooling is None
    assert ext_pooler.return_special == 'pooler_output'

    res_sequence = BertSequenceEncodingExtractor(
        return_input=True).transform(stim).to_df()
    res_file = BertSequenceEncodingExtractor(
        return_input=True).transform(stim_file).to_df()
    res_cls = BertSequenceEncodingExtractor(
        return_special='[CLS]').transform(stim).to_df()
    res_pooler = ext_pooler.transform(stim).to_df()
    res_max = BertSequenceEncodingExtractor(
        pooling='max').transform(stim).to_df()

    # Check shape
    assert len(res_sequence['encoding'][0]) == 768
    assert len(res_cls['encoding'][0]) == 768
    assert len(res_pooler['encoding'][0]) == 768
    assert len(res_max['encoding'][0]) == 768
    assert res_sequence.shape[0] == 1
    assert res_cls.shape[0] == 1
    assert res_pooler.shape[0] == 1
    assert res_max.shape[0] == 1

    # Make sure pooler/cls/no arguments return different encodings
    assert res_sequence['encoding'][0] != res_cls['encoding'][0]
    assert res_sequence['encoding'][0] != res_pooler['encoding'][0]
    assert res_sequence['encoding'][0] != res_max['encoding'][0]
    assert all([
        res_max['encoding'][0][i] >= res_sequence['encoding'][0][i]
        for i in range(768)
    ])

    # Test return sequence
    assert res_sequence['sequence'][0] == 'This is not a tokenized sentence .'

    # Test file stim
    assert res_file['duration'][0] == 2.9
    assert res_file['onset'][0] == 0.2

    # Catch errors with wrong numpy function and wrong special token arg
    with pytest.raises(ValueError) as err:
        BertSequenceEncodingExtractor(pooling='avg')
    assert 'valid numpy function' in str(err.value)
    with pytest.raises(ValueError) as err:
        BertSequenceEncodingExtractor(return_special='[MASK]')
    assert 'must be one of' in str(err.value)

    # Remove variables
    del ext_pooler, res_cls, res_max, res_pooler, res_sequence, res_file, stim

def test_complex_text_stim():
    text_dir = join(get_test_data_path(), 'text')
    stim = ComplexTextStim(join(text_dir, 'complex_stim_no_header.txt'),
                           columns='ot', default_duration=0.2)
    assert len(stim.elements) == 4
    assert stim.elements[2].onset == 34
    assert stim.elements[2].duration == 0.2
    stim = ComplexTextStim(join(text_dir, 'complex_stim_with_header.txt'))
    assert len(stim.elements) == 4
    assert stim.elements[2].duration == 0.1

def test_complex_stim_from_text():
    textfile = join(get_test_data_path(), 'text', 'scandal.txt')
    text = open(textfile).read().strip()
    stim = ComplexTextStim(text=text)
    target = ['To', 'Sherlock', 'Holmes']
    assert [w.text for w in stim.elements[:3]] == target
    assert len(stim.elements) == 231
    stim = ComplexTextStim(text=text, unit='sent')
    # Custom tokenizer
    stim = ComplexTextStim(text=text, tokenizer=r'(\w+)')
    assert len(stim.elements) == 209

def run_fave(input_transcript, input_media, output_file,
             onset=None, offset=None):
    transcript, audio = clean_transcript(
        input_transcript, input_media, onset, offset)
    text_grid = '/tmp/output.textGrid'
    bashCommand = "python2 FAAValign.py -n {} {} {}".format(
        audio, transcript, text_grid)
    subprocess.call(bashCommand.split())
    stim = ComplexTextStim(elements=parse_textgrid(text_grid))
    stim.save(output_file)

def test_bert_extractor():
    stim = ComplexTextStim(text='This is not a tokenized sentence.')
    stim_file = ComplexTextStim(join(TEXT_DIR, 'sentence_with_header.txt'))

    ext_base = BertExtractor(pretrained_model='bert-base-uncased')
    ext_base_token = BertExtractor(pretrained_model='bert-base-uncased',
                                   return_input=True)
    ext_tf = BertExtractor(pretrained_model='bert-base-uncased',
                           framework='tf')

    base_result = ext_base.transform(stim)
    res = base_result.to_df()
    res_model_attr = base_result.to_df(include_attributes=True)
    res_token = ext_base_token.transform(stim).to_df()
    res_file = ext_base.transform(stim_file).to_df()
    res_tf = ext_tf.transform(stim).to_df()

    # Test encoding shape
    assert len(res['encoding'][0]) == 768
    assert len(res_file['encoding'][0]) == 768

    # Test base extractor
    assert res.shape[0] == 8
    assert res_token.shape[0] == 8
    assert res_token['token'][5] == '##ized'
    assert res_token['word'][5] == 'tokenized'
    assert res_token['object_id'][5] == 5

    # Test base extractor on file
    assert res_file.shape[0] == 8
    assert res_file['onset'][3] == 1.3
    assert res_file['duration'][5] == 0.5
    assert res_file['object_id'][5] == 5

    # Test tf vs torch
    cors = [
        np.corrcoef(res['encoding'][i], res_tf['encoding'][i])[0, 1]
        for i in range(res.shape[0])
    ]
    assert all(np.isclose(cors, 1))

    # Catch error if framework is invalid
    with pytest.raises(ValueError) as err:
        BertExtractor(framework='keras')
    assert 'Invalid framework' in str(err.value)

    # Delete the models
    del res, res_token, res_file, ext_base, ext_base_token

def test_bert_other_models(model):
    if model == 'camembert-base':
        stim = ComplexTextStim(text="ceci n'est pas un pipe")
    else:
        stim = ComplexTextStim(text='This is not a tokenized sentence.')
    res = BertExtractor(pretrained_model=model,
                        return_input=True).transform(stim).to_df()
    if model == 'bert-large-uncased':
        shape = 1024
    else:
        shape = 768
    assert len(res['encoding'][0]) == shape
    if model == 'camembert-base':
        assert res['token'][4] == 'est'

    # Remove variables
    del res, stim

def test_part_of_speech_extractor():
    stim = ComplexTextStim(join(TEXT_DIR, 'complex_stim_with_header.txt'))
    result = PartOfSpeechExtractor().transform(stim).to_df()
    assert result.shape == (4, 6)
    assert 'NN' in result.columns
    assert result['NN'].sum() == 1
    assert result['VBD'][3] == 1

def clean_transcript(input_transcript, input_media, onset=None, offset=None):
    stim = load_stims([input_media])[0]
    if not isinstance(stim, AudioStim):
        conv = VideoToAudioConverter()
        stim = conv.transform(stim)
        input_media = '/tmp/input_audio.wav'
        stim.save(input_media)

    _, extension = splitext(input_transcript)
    clean_transcript = '/tmp/clean_transcript.txt'
    with open(clean_transcript, 'w') as new_file:
        if extension == '.srt':
            txt = ComplexTextStim(input_transcript)
            for el in txt.elements:
                _clean_save(el.text, new_file, el.onset, el.duration)
        else:
            # Treat as a single block of text
            if onset is None or offset is None:
                raise Exception("Onset and offset must be declared")
            txt = TextStim(input_transcript)
            _clean_save(txt.text, new_file, onset, stim.duration - offset)
    return clean_transcript, input_media

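# Hypothetical usage sketch (not part of the original module): clean_transcript
# returns the path to the cleaned transcript and the (possibly converted) audio
# file, which run_fave above then passes to FAAValign. File names below are
# illustrative only.
#
#   transcript, audio = clean_transcript('talk.srt', 'talk.mp4')
#   run_fave('talk.srt', 'talk.mp4', '/tmp/aligned_transcript.txt')
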
def test_indico_api_text_extractor():
    ext = IndicoAPITextExtractor(api_key=os.environ['INDICO_APP_KEY'],
                                 models=['emotion', 'personality'])

    # With ComplexTextStim input
    srtfile = join(get_test_data_path(), 'text', 'wonderful.srt')
    srt_stim = ComplexTextStim(srtfile, onset=4.2)
    result = ext.transform(srt_stim).to_df()
    outdfKeysCheck = set([
        'onset', 'duration',
        'emotion_anger', 'emotion_fear', 'emotion_joy',
        'emotion_sadness', 'emotion_surprise',
        'personality_openness', 'personality_extraversion',
        'personality_agreeableness', 'personality_conscientiousness'])
    assert set(result.columns) == outdfKeysCheck
    assert result['onset'][1] == 92.622

    # With TextStim input
    ts = TextStim(text="It's a wonderful life.")
    result = ext.transform(ts).to_df()
    assert set(result.columns) == outdfKeysCheck
    assert len(result) == 1

def test_indico_api_text_extractor():
    ext = IndicoAPITextExtractor(api_key=os.environ['INDICO_APP_KEY'],
                                 models=['emotion', 'personality'])

    # With ComplexTextStim input
    srtfile = join(get_test_data_path(), 'text', 'wonderful.srt')
    srt_stim = ComplexTextStim(srtfile, onset=4.2)
    result = merge_results(ext.transform(srt_stim), extractor_names=False)
    outdfKeysCheck = {
        'onset', 'duration', 'order', 'object_id',
        'emotion_anger', 'emotion_fear', 'emotion_joy',
        'emotion_sadness', 'emotion_surprise',
        'personality_openness', 'personality_extraversion',
        'personality_agreeableness', 'personality_conscientiousness'
    }
    meta_columns = {'source_file', 'history', 'class', 'filename'}
    assert set(result.columns) - set(['stim_name']) == \
        outdfKeysCheck | meta_columns
    assert result['onset'][1] == 92.622

    # With TextStim input
    ts = TextStim(text="It's a wonderful life.")
    result = ext.transform(ts).to_df(object_id=True)
    assert set(result.columns) == outdfKeysCheck
    assert len(result) == 1

def test_compound_stim():
    audio_dir = join(get_test_data_path(), 'audio')
    audio = AudioStim(join(audio_dir, 'crowd.mp3'))
    image1 = ImageStim(join(get_test_data_path(), 'image', 'apple.jpg'))
    image2 = ImageStim(join(get_test_data_path(), 'image', 'obama.jpg'))
    filename = join(get_test_data_path(), 'video', 'small.mp4')
    video = VideoStim(filename)
    text = ComplexTextStim(text="The quick brown fox jumped...")
    stim = CompoundStim([audio, image1, image2, video, text])
    assert len(stim.elements) == 5
    assert isinstance(stim.video, VideoStim)
    assert isinstance(stim.complex_text, ComplexTextStim)
    assert isinstance(stim.image, ImageStim)
    with pytest.raises(AttributeError):
        stim.nonexistent_type
    assert stim.video_frame is None

    # Test iteration
    assert len([e for e in stim]) == 5

    imgs = stim.get_stim(ImageStim, return_all=True)
    assert len(imgs) == 2
    assert all([isinstance(im, ImageStim) for im in imgs])
    also_imgs = stim.get_stim('image', return_all=True)
    assert imgs == also_imgs

def test_check_target_type():
    stim = ComplexTextStim(join(TEXT_DIR, 'sample_text.txt'),
                           columns='to', default_duration=1)
    td = SharpnessExtractor()
    with pytest.raises(TypeError):
        td.transform(stim)

def test_metric_er_as_stim():
    stim = ComplexTextStim(text='This is [MASK] test')
    ext_bert = BertLMExtractor(return_softmax=True)
    ext_metric = MetricExtractor(functions='numpy.sum')
    r = ext_metric.transform(ext_bert.transform(stim))
    df = merge_results(r, extractor_names=False)
    assert np.isclose(df['sum'][0], 1)

def test_transcribed_audio_stim():
    audio = AudioStim(join(get_test_data_path(), 'audio', 'barber_edited.wav'))
    text_file = join(get_test_data_path(), 'text', 'wonderful_edited.srt')
    text = ComplexTextStim(text_file)
    stim = TranscribedAudioCompoundStim(audio=audio, text=text)
    assert isinstance(stim.audio, AudioStim)
    assert isinstance(stim.complex_text, ComplexTextStim)

def test_word_stemming_filter():
    stim = ComplexTextStim(join(TEXT_DIR, 'sample_text.txt'),
                           columns='to', default_duration=1)

    # With all defaults (porter stemmer)
    filt = WordStemmingFilter()
    assert isinstance(filt.stemmer, nls.PorterStemmer)
    stemmed = filt.transform(stim)
    stems = [s.text for s in stemmed]
    target = ['some', 'sampl', 'text', 'for', 'test', 'annot']
    assert stems == target

    # Try a different stemmer
    filt = WordStemmingFilter(stemmer='snowball', language='english')
    assert isinstance(filt.stemmer, nls.SnowballStemmer)
    stemmed = filt.transform(stim)
    stems = [s.text for s in stemmed]
    assert stems == target

    # Handles StemmerI stemmer
    stemmer = nls.SnowballStemmer(language='english')
    filt = WordStemmingFilter(stemmer=stemmer)
    stemmed = filt.transform(stim)
    stems = [s.text for s in stemmed]
    assert stems == target

    # Fails on invalid values
    with pytest.raises(ValueError):
        filt = WordStemmingFilter(stemmer='nonexistent_stemmer')

    # Try a long text stim
    stim2 = TextStim(text='theres something happening here')
    filt = WordStemmingFilter()
    assert filt.transform(stim2).text == 'there someth happen here'

def test_complex_text_stim():
    text_dir = join(get_test_data_path(), 'text')
    stim = ComplexTextStim(join(text_dir, 'complex_stim_no_header.txt'),
                           columns='ot', default_duration=0.2)
    assert len(stim.elements) == 4
    assert stim.elements[2].onset == 34
    assert stim.elements[2].duration == 0.2

    stim = ComplexTextStim(join(text_dir, 'complex_stim_no_header.txt'),
                           columns='ot', default_duration=0.2, onset=4.2)
    assert stim.elements[2].onset == 38.2
    assert stim.elements[1].onset == 24.2

    stim = ComplexTextStim(join(text_dir, 'complex_stim_with_header.txt'))
    assert len(stim.elements) == 4
    assert stim.elements[2].duration == 0.1

    assert stim._to_sec((1.0, 42, 3, 0)) == 6123
    assert stim._to_tup(6123) == (1.0, 42, 3, 0)

def test_part_of_speech_extractor():
    import nltk
    nltk.download('tagsets')
    stim = ComplexTextStim(join(TEXT_DIR, 'complex_stim_with_header.txt'))
    result = merge_results(PartOfSpeechExtractor().transform(stim),
                           extractor_names=False)
    assert result.shape == (4, 52)
    assert result['NN'].sum() == 1
    assert result['VBD'][3] == 1

def test_complex_stim_from_srt():
    srtfile = join(get_test_data_path(), 'text', 'wonderful.srt')
    textfile = join(get_test_data_path(), 'text', 'wonderful.txt')
    df = pd.read_csv(textfile, sep='\t')
    target = df['text'].tolist()
    srt_stim = ComplexTextStim(srtfile)
    texts = [sent.text for sent in srt_stim.elements]
    assert texts == target

def test_stim_iteration_converter():
    textfile = join(get_test_data_path(), 'text', 'scandal.txt')
    stim = ComplexTextStim(text=open(textfile).read().strip())
    words = ComplexTextIterator().transform(stim)
    assert len(words) == 231
    assert isinstance(words[1], TextStim)
    assert words[1].text == 'Sherlock'
    assert str(words[1].history) == \
        'ComplexTextStim->ComplexTextIterator/TextStim'

def test_mean_amplitude_extractor():
    audio = AudioStim(join(AUDIO_DIR, 'barber_edited.wav'))
    text_file = join(get_test_data_path(), 'text', 'wonderful_edited.srt')
    text = ComplexTextStim(text_file)
    stim = TranscribedAudioCompoundStim(audio=audio, text=text)
    ext = MeanAmplitudeExtractor()
    result = ext.transform(stim).to_df()
    targets = [-0.154661, 0.121521]
    assert np.allclose(result['mean_amplitude'], targets)

def test_text_extractor():
    stim = ComplexTextStim(join(TEXT_DIR, 'sample_text.txt'),
                           columns='to', default_duration=1)
    td = DictionaryExtractor(join(TEXT_DIR, 'test_lexical_dictionary.txt'),
                             variables=['length', 'frequency'])
    assert td.data.shape == (7, 2)
    result = td.transform(stim)[2].to_df()
    assert result['duration'][0] == 1
    assert result.shape == (1, 6)
    assert np.isclose(result['frequency'][0], 11.729, 1e-5)

def test_tfhub_text_transformer_tokens():
    cstim = ComplexTextStim(join(TEXT_DIR, 'wonderful.txt'))
    tkn_ext = TFHubTextExtractor(ELECTRA_URL,
                                 features='token_encodings',
                                 output_key='sequence_output',
                                 preprocessor_url_or_path=TOKENIZER_URL)
    tkn_df = merge_results(tkn_ext.transform(cstim.elements[:3]),
                           extractor_names=False)
    assert all([tkn_df['token_encodings'][i].shape == (128, 256)
                for i in range(tkn_df.shape[0])])

def test_bert_sentiment_extractor():
    stim = ComplexTextStim(text='This is the best day of my life.')
    stim_file = ComplexTextStim(join(TEXT_DIR, 'sentence_with_header.txt'))

    res = BertSentimentExtractor().transform(stim).to_df()
    res_file = BertSentimentExtractor().transform(stim_file).to_df()
    res_seq = BertSentimentExtractor(return_input=True).transform(stim).to_df()
    res_softmax = BertSentimentExtractor(
        return_softmax=True).transform(stim).to_df()

    assert res.shape[0] == 1
    assert res_file['onset'][0] == 0.2
    assert res_file['duration'][0] == 2.9
    assert all([s in res.columns for s in ['sent_pos', 'sent_neg']])
    assert res_seq['sequence'][0] == 'This is the best day of my life .'
    assert all([res_softmax[s][0] >= 0 for s in ['sent_pos', 'sent_neg']])
    assert all([res_softmax[s][0] <= 1 for s in ['sent_pos', 'sent_neg']])

    # Remove variables
    del res, res_file, res_seq, res_softmax

def test_transformations_on_compound_stim():
    image1 = ImageStim(join(get_test_data_path(), 'image', 'apple.jpg'))
    image2 = ImageStim(join(get_test_data_path(), 'image', 'obama.jpg'))
    text = ComplexTextStim(text="The quick brown fox jumped...")
    stim = CompoundStim([image1, image2, text])

    ext = BrightnessExtractor()
    results = ext.transform(stim)
    assert len(results) == 2
    assert np.allclose(results[0].data[0], 0.88784294)

def test_tfhub_text_one_feature():
    stim = TextStim(join(TEXT_DIR, 'scandal.txt'))
    cstim = ComplexTextStim(join(TEXT_DIR, 'wonderful.txt'))
    ext = TFHubTextExtractor(GNEWS_URL, output_key=None, features='embedding')
    df = merge_results(ext.transform(cstim), extractor_names=False)
    assert df.shape[0] == len(cstim.elements)
    true = hub.KerasLayer(GNEWS_URL)([cstim.elements[3].text])[0, 2].numpy()
    assert np.isclose(df['embedding'][3][2], true)
    with pytest.raises(ValueError) as err:
        TFHubTextExtractor(GNEWS_URL, output_key='key').transform(stim)
    assert 'not a dictionary' in str(err.value)

def test_save():
    text_dir = join(get_test_data_path(), 'text')
    complextext_stim = ComplexTextStim(join(text_dir,
                                            'complex_stim_no_header.txt'),
                                       columns='ot', default_duration=0.2)
    text_stim = TextStim(text='hello')
    video_stim = VideoStim(join(get_test_data_path(), 'video', 'small.mp4'))
    audio_stim = AudioStim(join(get_test_data_path(), 'audio', 'crowd.mp3'))
    image_stim = ImageStim(join(get_test_data_path(), 'image', 'apple.jpg'))
    stims = [complextext_stim, text_stim, video_stim, audio_stim, image_stim]
    for s in stims:
        path = tempfile.mktemp() + s._default_file_extension
        s.save(path)
        assert exists(path)
        os.remove(path)

def test_save():
    cts_file = join(get_test_data_path(), 'text', 'complex_stim_no_header.txt')
    complextext_stim = ComplexTextStim(cts_file, columns='ot',
                                       default_duration=0.2)
    text_stim = TextStim(text='hello')
    audio_stim = AudioStim(join(get_test_data_path(), 'audio', 'crowd.mp3'))
    image_stim = ImageStim(join(get_test_data_path(), 'image', 'apple.jpg'))

    # Video gives Travis problems
    stims = [complextext_stim, text_stim, audio_stim, image_stim]
    for s in stims:
        path = tempfile.mktemp() + s._default_file_extension
        s.save(path)
        assert exists(path)
        os.remove(path)