def test_read_config(self):
    '''
    Tests :py:func:`bella.helper.read_config`
    '''
    dong_test_fp = 'tests/test_data/dong_test_data.txt'
    assert dong_test_fp in read_config('dong_data_test', CONFIG_FP)

    with self.assertRaises(ValueError,
                           msg='nothing here should not be in the '
                               'config.yaml'):
        read_config('nothing here', CONFIG_FP)

    test_config_name = Path('./doesnotexist')
    with self.assertRaises(FileNotFoundError,
                           msg='there should be no file named '
                               f'{test_config_name}'):
        read_config('test_data', test_config_name)
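# An illustrative sketch of the kind of `config.yaml` entries the test above
# assumes: a mapping from a data-set key to a file path. The second key is
# hypothetical, only `dong_data_test` appears in the test:
#
#     dong_data_test: tests/test_data/dong_test_data.txt
#     dong_bad_sent_data_test: tests/test_data/dong_bad_sent_data.txt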
def tweebo(texts):
    '''
    Given a list of Strings, tokenises, POS tags and then dependency parses
    the text using `Tweebo <https://github.com/ikekonglp/TweeboParser>`_,
    a Tweet specific parser.

    The Tweebo parser cannot handle empty strings, therefore a special empty
    string symbol is required. If one of the texts is an empty String then an
    empty list will be returned for that index of the returned list.

    :param texts: The texts that are to be parsed
    :type texts: list
    :returns: A list of a list of DependencyToken instances. A list per text
    in the texts argument.
    :rtype: list
    '''

    def no_text(text):
        '''
        Given a String, checks if it is empty. If so, returns an empty token,
        else returns the text that was given.

        :param text: Text to be checked
        :type text: String
        :returns: The text if it is not empty or the empty token if it is.
        :rtype: String
        '''
        empty_token = '$$$EMPTY$$$'
        if text.strip() == '':
            return empty_token
        return text

    with tempfile.TemporaryDirectory() as working_dir:
        with tempfile.TemporaryDirectory() as temp_dir:
            text_file_path = os.path.join(temp_dir, 'text_file.txt')
            result_file_path = os.path.join(temp_dir,
                                            'text_file.txt.predict')
            tweebo_dir = full_path(
                read_config('depdency_parsers')['tweebo_dir'])
            with open(text_file_path, 'w+') as text_file:
                for text in texts:
                    text = no_text(text)
                    text_file.write(text)
                    text_file.write('\n')
            run_script = os.path.join(tweebo_dir, 'python_run.sh')
            # subprocess.run returns a CompletedProcess object, which is
            # always truthy, so success has to be checked via the return
            # code rather than by truth-testing the result.
            completed = subprocess.run(['bash', run_script,
                                        text_file_path, working_dir])
            if completed.returncode == 0:
                with open(result_file_path, 'r') as result_file:
                    return tweebo_post_process(result_file.read())
            raise SystemError('Could not run the Tweebo run script {}'
                              .format(run_script))
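# A minimal usage sketch for `tweebo` (illustrative only; it assumes the
# Tweebo parser is installed at the configured `tweebo_dir`):
#
#     parsed = tweebo(['I had a great day', ''])
#     # parsed[0] is a list of DependencyToken instances for the first text;
#     # parsed[1] is an empty list because the second text is empty.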
def get_lexicon(self):
    '''
    Overrides :py:func:`bella.lexicons.Lexicon.get_lexicon`
    '''
    sentiment_folder = full_path(read_config('lexicons')['hu_liu'])
    cats = ['positive', 'negative']
    word_cat = []
    for cat in cats:
        file_path = os.path.join(sentiment_folder,
                                 '{}-words.txt'.format(cat))
        with open(file_path, 'r', encoding='cp1252') as senti_file:
            for line in senti_file:
                # Skip comment lines and lines that do not start with a
                # word character
                if re.search('^;', line) or re.search(r'^\W+', line):
                    continue
                word_cat.append((line.strip(), cat))
    return word_cat
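# For reference, the Hu and Liu lexicon files parsed above contain one word
# per line, with header comment lines starting with ';' (an illustrative
# excerpt, not the verbatim file contents):
#
#     ; This file contains a list of POSITIVE opinion words.
#     abound
#     abounds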
def get_lexicon(self):
    '''
    Overrides :py:func:`bella.lexicons.Lexicon.get_lexicon`
    '''
    emotion_file_path = full_path(read_config('lexicons')['nrc_emotion'])
    word_cat = []
    with open(emotion_file_path, 'r', newline='') as emotion_file:
        tsv_reader = csv.reader(emotion_file, delimiter='\t')
        for row in tsv_reader:
            if len(row):
                word = row[0]
                cat = row[1]
                association = int(row[2])
                # Only keep (word, category) pairs the lexicon marks as
                # associated (association flag of 1)
                if association:
                    word_cat.append((word, cat))
    return word_cat
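# For reference, each row of the NRC emotion lexicon TSV read above has the
# form word<TAB>category<TAB>association, where association is 0 or 1
# (illustrative rows, not the verbatim file contents):
#
#     abandon	fear	1
#     abandon	joy	0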
def tweebo_install(tweebo_func):
    '''
    Python decorator that ensures that
    `TweeboParser <https://github.com/ikekonglp/TweeboParser>`_ is installed
    before running the function it wraps. Returns the given function.

    :param tweebo_func: A function that uses the Tweebo Parser.
    :type tweebo_func: function
    :returns: The given function
    :rtype: function
    '''
    tweebo_dir = full_path(read_config('depdency_parsers')['tweebo_dir'])
    # If the models file exists then Tweebo has already been installed (or a
    # previous install attempt got at least as far as downloading the models)
    tweebo_models = os.path.join(tweebo_dir, 'pretrained_models.tar.gz')
    if not os.path.isfile(tweebo_models):
        install_script = os.path.join(tweebo_dir, 'install.sh')
        subprocess.run(['bash', install_script])
    return tweebo_func
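# A minimal sketch of how the decorator might be applied; `parse_tweets` is a
# hypothetical function, not part of the library. The install check runs once
# at decoration time:
#
#     @tweebo_install
#     def parse_tweets(texts):
#         return tweebo(texts)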
def get_lexicon(self):
    '''
    Overrides :py:func:`bella.lexicons.Lexicon.get_lexicon`
    '''
    mpqa_file_path = full_path(read_config('lexicons')['mpqa'])
    word_cats = []
    with open(mpqa_file_path, 'r') as mpqa_file:
        for line in mpqa_file:
            line = line.strip()
            if line:
                key_values = {}
                for data in line.split():
                    if '=' in data:
                        key, value = data.split('=')
                        key_values[key] = value
                word = key_values['word1']
                cat = key_values['priorpolarity']
                if cat == 'weakneg':
                    cat = key_values['polarity']
                word_cats.append((word, cat))
    return word_cats
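# For reference, each MPQA line parsed above is a sequence of key=value pairs
# (an illustrative line, not the verbatim file contents):
#
#     type=weaksubj len=1 word1=abandoned pos1=adj stemmed1=n priorpolarity=negative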
def test_gensim_word2vec(self):
    '''
    Tests the :py:class:`bella.word_vectors.GensimVectors`
    '''
    # Test loading word vectors from a file
    vo_zhang = VoVectors(skip_conf=True)

    self.assertEqual(vo_zhang.vector_size, 100,
                     msg='Vector size should be equal to 100 not {}'
                         .format(vo_zhang.vector_size))

    # Check zero vectors work for OOV words
    zero_vector = np.zeros(100)
    oov_word = 'thisssssdoesssssnotexists'
    oov_vector = vo_zhang.lookup_vector(oov_word)
    self.assertEqual(True, np.array_equal(oov_vector, zero_vector),
                     msg='This word {} should not exist and should have a '
                         'zero vector and not {}'
                         .format(oov_word, oov_vector))

    # Check it does get word vectors
    the_vector = vo_zhang.lookup_vector('the')
    self.assertEqual(False, np.array_equal(the_vector, zero_vector),
                     msg='The word `the` should have a non-zero vector.')

    with self.assertRaises(ValueError,
                           msg='Should raise a ValueError for any param '
                               'that is not a String and this is a list'):
        vo_zhang.lookup_vector(['the'])

    # Check if the word, index and vector lookups match
    index_word = vo_zhang.index2word
    word_index = vo_zhang.word2index
    the_index = word_index['the']
    self.assertEqual('the', index_word[the_index],
                     msg='index2word and word2index do not match for the '
                         'word `the`')

    index_vector = vo_zhang.index2vector
    the_vectors_match = np.array_equal(index_vector[the_index],
                                       vo_zhang.lookup_vector('the'))
    self.assertEqual(True, the_vectors_match,
                     msg='index2vector does not match lookup_vector func '
                         'for the word `the`')

    # Test the constructor
    test_file_path = 'this'
    with self.assertRaises(Exception,
                           msg='The file path should have no saved word '
                               'vector file {} and there is no training '
                               'data'.format(test_file_path)):
        GensimVectors(test_file_path, 'fake data', model='word2vec')
    with self.assertRaises(Exception,
                           msg='Should raise when given neither a saved '
                               'word vector model nor training data'):
        GensimVectors(None, None, model='word2vec')
    with self.assertRaises(Exception,
                           msg='Should only accept the following models {}'
                               .format(['word2vec', 'fasttext'])):
        GensimVectors(None, [['hello', 'how', 'are']], model='nothing',
                      min_count=1)

    # Test creating vectors from data
    data_path = os.path.abspath(read_config('sherlock_holmes_test',
                                            CONFIG_FP))
    with open(data_path, 'r') as data:
        data = map(tokenisers.whitespace, data)
        with tempfile.NamedTemporaryFile() as temp_file:
            data_vector = GensimVectors(temp_file.name, data,
                                        model='word2vec', size=200,
                                        name='sherlock')
            d_vec_size = data_vector.vector_size
            self.assertEqual(d_vec_size, 200,
                             msg='Vector size should be 200 not {}'
                                 .format(d_vec_size))
            sherlock_vec = data_vector.lookup_vector('sherlock')
            self.assertEqual(False,
                             np.array_equal(zero_vector, sherlock_vec),
                             msg='Sherlock should be a non-zero vector')
            # Test that it saved the trained model
            saved_vector = GensimVectors(temp_file.name, None,
                                         model='word2vec')
            s_vec_size = saved_vector.vector_size
            self.assertEqual(s_vec_size, 200,
                             msg='Vector size should be 200 not {}'
                                 .format(s_vec_size))
            equal_sherlocks = np.array_equal(
                sherlock_vec, saved_vector.lookup_vector('sherlock'))
            self.assertEqual(True, equal_sherlocks,
                             msg='The saved model and the trained model '
                                 'should have the same vectors')
            # Ensure the name attribute works
            self.assertEqual('sherlock', data_vector.name,
                             msg='The name of the instance should be '
                                 'sherlock and not {}'
                                 .format(data_vector.name))
def test_dong(self):
    '''
    Tests :py:func:`bella.parsers.dong`
    '''

    def check_results(expected_results, test_results):
        '''
        Given the expected results and the results from the function being
        tested, asserts that they are equal. Returns nothing but fails the
        test if they differ.

        :param expected_results: A list of dictionaries containing expected
        values
        :param test_results: A list of dictionaries containing results from
        the function that is being tested
        :type expected_results: list
        :type test_results: list
        :returns: Nothing but checks if the results are as expected
        :rtype: None
        '''
        for index, expected_result in enumerate(expected_results):
            test_result = test_results[index]
            for key, expected_value in expected_result.items():
                test_value = test_result[key]
                self.assertIsInstance(expected_value, type(test_value),
                                      msg='The expected value : {} is not '
                                          'of the same type as the tested '
                                          'value : {}'
                                          .format(type(expected_value),
                                                  type(test_value)))
                if key == 'spans':
                    test_value = sorted(test_value, key=lambda x: x[0])
                    expected_value = sorted(expected_value,
                                            key=lambda x: x[0])
                self.assertEqual(expected_value, test_value,
                                 msg='Expected {} returned {}'
                                     .format(expected_value, test_value))

    test_file_path = 'anything'
    with self.assertRaises(FileNotFoundError,
                           msg='there should be no file named {}'
                               .format(test_file_path)):
        dong(test_file_path)

    test_file_path = './tests/test_data/dong_test_data.txt'
    expected_results = [
        {'target_id': 'dong_test_data0',
         'sentence_id': 'dong_test_data0',
         'sentiment': -1,
         'text': 'This is a fake news article that is to represent a '
                 'Tweet!!!!',
         'target': 'news article',
         'spans': [(15, 27)]},
        {'target_id': 'dong_test_data1',
         'sentence_id': 'dong_test_data1',
         'sentiment': 1,
         'text': 'I had a great day however I did not get much work done',
         'target': 'day',
         'spans': [(14, 17)]},
        {'target_id': 'dong_test_data2',
         'sentence_id': 'dong_test_data2',
         'sentiment': 0,
         'text': 'I cycled in today and it was ok as it was not raining.',
         'target': 'cycled',
         'spans': [(2, 8)]}]
    check_results(expected_results, dong(test_file_path).data())

    bad_sent_path = read_config('dong_bad_sent_data_test', CONFIG_FP)
    with self.assertRaises(ValueError,
                           msg='It should not accept sentiment values that '
                               'are not 1, 0, or -1'):
        dong(bad_sent_path)

    # Ensure that it can handle the same target with multiple spans
    test_multiple_path = read_config('dong_multiple_offsets_data_test',
                                     CONFIG_FP)
    multi_expected = [
        {'target_id': 'dong_test_multiple_offsets_data0',
         'sentence_id': 'dong_test_multiple_offsets_data0',
         'sentiment': -1,
         'text': 'This is a fake news article that is to represent a '
                 'Tweet!!!! and it was an awful News Article I think.',
         'target': 'news article',
         'spans': [(15, 27), (81, 93)]},
        {'target_id': 'dong_test_multiple_offsets_data1',
         'sentence_id': 'dong_test_multiple_offsets_data1',
         'sentiment': 1,
         'text': 'I had a great Day however I did not get much work done '
                 'in the day',
         'target': 'day',
         'spans': [(14, 17), (62, 65)]}]
    check_results(multi_expected, dong(test_multiple_path).data())

    # Test that multi word targets that should have a space between them
    # are still detected
    test_mwe_path = read_config('dong_mwe_offsets_data_test', CONFIG_FP)
    mwe_expected = [
        {'target_id': 'dong_test_mwe_offsets_data0',
         'sentence_id': 'dong_test_mwe_offsets_data0',
         'sentiment': -1,
         'text': 'This is a fake news article that is to represent a '
                 'Tweet!!!! and it was an awful NewsArticle I think.',
         'target': 'news article',
         'spans': [(15, 27), (81, 92)]}]
    check_results(mwe_expected, dong(test_mwe_path).data())
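# For reference, the raw file format that `dong` parses stores each sample as
# three consecutive lines: the text with the target replaced by `$T$`, the
# target itself, and the sentiment label (an illustrative sample based on the
# first expected result above):
#
#     This is a fake $T$ that is to represent a Tweet!!!!
#     news article
#     -1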