def test_target_set(self):
    '''
    Tests Target set function
    '''
    # A Target with an Integer sentiment should accept an Integer
    # predicted sentiment via __setitem__.
    int_target = Target([(3, 5), (6, 8)], '1', 'Iphone',
                        'text with Iphone', 1)
    expected = 0
    int_target['predicted'] = 0
    self.assertEqual(expected, int_target['predicted'],
                     msg='Predicted sentiment value should be 0 and '
                         'not {}'.format(int_target['predicted']))
    # The same should hold for String sentiment values.
    str_target = Target([(3, 5), (6, 8)], '1', 'Iphone',
                        'text with Iphone', 'pos')
    expected = 'neg'
    str_target['predicted'] = 'neg'
    self.assertEqual(expected, str_target['predicted'],
                     msg='Predicted sentiment value should be `neg` and '
                         'not {}'.format(str_target['predicted']))
    # Only the `predicted` key may ever be assigned to.
    with self.assertRaises(KeyError, msg='Should not allow you to set keys '
                                         'other than `predicted`'):
        int_target['sentiment'] = 0
def test_targetcoll_get(self):
    '''
    Test the __getitem__ function of TargetCollection
    '''
    example_a = Target([(3, 5), (6, 8)], '1', 'Iphone', 'text with Iphone', 1)
    example_b = Target([(1, 5)], '3', 'Iphone', 'text with Iphone', 1)
    example_c = Target([(1, 2)], '2', 'Iphone', 'text with Iphone', 1)
    collection = TargetCollection([example_a, example_b, example_c])
    # Normal key based access
    self.assertEqual(example_c, collection['2'],
                     msg='Cannot access data using keys. key used {} '
                         'collection {}'.format('2', collection))
    self.assertEqual(example_b, collection.get('3'),
                     msg='Cannot access data using the get method.')
    # `get` should fall back to None for unknown keys
    self.assertEqual(None, collection.get('5'),
                     msg='Default value for get not working should be None '
                         'not {}'.format(collection.get('5')))
    # Unknown keys accessed through __getitem__ must raise
    with self.assertRaises(KeyError, msg='Should produce a key error when '
                                         'the data does not exist'):
        collection['5']
def test_targetcoll_add(self):
    '''
    Test the add function of TargetCollection
    '''
    collection = TargetCollection()
    first_target = Target([(3, 5), (6, 8)], '1', 'Iphone',
                          'text with Iphone', 1)
    second_target = Target([(1, 5)], '3', 'Iphone', 'text with Iphone', 1)
    # Normal case: the Target becomes retrievable by its id
    collection.add(first_target)
    self.assertEqual(collection['1'], first_target,
                     msg='Test that {} has been added to {}'
                         .format(first_target, collection))
    # Only Target instances may be added
    with self.assertRaises(TypeError, msg='Should not be able to add a dict'):
        collection.add({'target_id' : '2'})
    # A Target stripped of its id cannot be added
    with self.assertRaises(ValueError, msg='Should not be able to add a '
                                           'Target that has no `id`'):
        del second_target['target_id']
        if 'target_id' in second_target:
            raise KeyError('{} should not contain `id` key'
                           .format(second_target))
        collection.add(second_target)
def test_targetcoll_constructor(self):
    '''
    Tests TargetCollection constructor
    '''
    example = Target([(3, 5), (6, 8)], '1', 'Iphone', 'text with Iphone', 1)
    # Only iterables of Target instances are acceptable arguments
    with self.assertRaises(TypeError, msg='The constructor should only '
                                          'accept an interator as an '
                                          'argument'):
        TargetCollection(1)
    with self.assertRaises(TypeError, msg='The constructor should only '
                                          'accept an interator of Target '
                                          'instances'):
        TargetCollection([1, 2, 3, 4])
    # These should all construct without error
    TargetCollection([])
    TargetCollection([example])
    TargetCollection()
    # Duplicate Target ids within the iterable must be rejected
    other = Target([(3, 4)], '2', 'Keys', 'text with Keys', -1)
    duplicate = Target([(3, 10)], '1', 'Pixel', 'text with Pixels', 0)
    with self.assertRaises(KeyError, msg='Should raise an error as two of '
                                         'the target instances have the '
                                         'same key'):
        TargetCollection([example, other, duplicate])
def test_target_eq(self):
    '''
    Test the Target __eq__ method
    '''
    example_a = Target([(3, 5), (6, 8)], '1', 'Iphone', 'text with Iphone', 1)
    example_b = Target([(1, 5)], '3', 'Iphone', 'text with Iphone', 1)
    example_c = Target([(1, 2)], '2', 'Iphone', 'text with Iphone', 1)
    same_id_diff_data = Target([(1, 2)], '1', 'S8', 'text with Samsung S8', 1)
    # Sharing an ID is enough for equality, even with different data
    self.assertEqual(example_a, same_id_diff_data,
                     msg='Should be equal as they have the same ID `1`')
    # Without ids, equality falls back to comparing the minimum keys
    no_id_copy = copy.deepcopy(example_b)
    del no_id_copy['target_id']
    self.assertEqual(example_b, no_id_copy,
                     msg='Should be equal as they have the same minimum keys')
    # Normal case: a deep copy equals its source
    full_copy = copy.deepcopy(example_c)
    self.assertEqual(example_c, full_copy,
                     msg='Copies of the same Target instance should be equal')
    # A plain dict never equals a Target, even with identical keys
    plain_dict = {'target' : 'Iphone', 'sentiment' : 1, 'spans' : [(1, 5)],
                  'text' : 'text with Iphone'}
    self.assertEqual(False, plain_dict == example_b,
                     msg='Should not accept dicts even with the same '
                         'minimum_keys')
def extract_targets(current_target, end_span, start_span, targets,
                    target_spans, target_index, tweet_text, sentiment_data,
                    tweet_id, target_sentiments):
    '''
    Converts the targets collected from one tweet into Target instances and
    adds them to the given collection.

    Any partially accumulated target words in ``current_target`` are first
    flushed into ``targets``/``target_spans`` using ``start_span`` as the
    start offset of the pending target.

    :param current_target: Words of a target still being accumulated.
    :param end_span: Recomputed when flushing; value on entry is unused.
    :param start_span: Character offset where the pending target starts.
    :param targets: Target words found so far (mutated in place).
    :param target_spans: (start, end) offsets matching ``targets`` (mutated).
    :param target_index: Kept for interface compatibility; not used.
    :param tweet_text: The tweet tokens, joined here into the full text.
    :param sentiment_data: Collection supporting ``add`` (e.g.
                           TargetCollection) that the Targets are added to.
    :param tweet_id: Id of the tweet, used to build target and sentence ids.
    :param target_sentiments: Sentiment value per entry in ``targets``.
    :returns: ``sentiment_data`` with one Target added per target found.
    :raises Exception: If a stored span does not reproduce its target word.
    '''
    # Flush the target that was still being built when the tweet ended.
    if current_target:
        target_word = ' '.join(current_target)
        end_span = start_span + len(target_word)
        targets.append(target_word)
        target_spans.append((start_span, end_span))
    tweet_text = ' '.join(tweet_text)
    for index, target in enumerate(targets):
        target_id = '{}#{}'.format(tweet_id, index)
        target_sentiment = target_sentiments[index]
        target_span = target_spans[index]
        # Sanity check: the span must point exactly at the target word.
        if tweet_text[target_span[0] : target_span[1]] != target:
            raise Exception('The target span {} does not match the '\
                            'target word {} in {}'\
                            .format(target_span, target, tweet_text))
        target_data = {'spans' : [target_span], 'target_id' : target_id,
                       'target' : target, 'text' : tweet_text,
                       'sentiment' : target_sentiment,
                       'sentence_id' : tweet_id}
        sentiment_data.add(Target(**target_data))
    return sentiment_data
def test_targetcoll_stored_sent(self):
    '''
    Test the stored_sentiments function of TargetCollection
    '''
    pos_target = Target([(3, 5), (6, 8)], '1', 'Iphone',
                        'text with Iphone', 1)
    another_pos = Target([(1, 5)], '3', 'Iphone', 'text with Iphone', 1)
    neg_target = Target([(1, 2)], '2', 'Iphone', 'text with Iphone', -1)
    collection = TargetCollection([pos_target, another_pos, neg_target])
    # Only the distinct sentiment values should be returned
    expected_sentiments = {1, -1}
    found_sentiments = collection.stored_sentiments()
    self.assertEqual(expected_sentiments, found_sentiments,
                     msg='The unique sentiments in the TargetCollection '
                         'should be {} and not {}'
                         .format(expected_sentiments, found_sentiments))
def dong(file_path):
    '''
    Given file path to the `Li Dong
    <https://github.com/bluemonk482/tdparse/tree/master/data/lidong>`_
    sentiment data it will parse the data and return it as a list of
    dictionaries.

    :param file_path: File Path to the annotated data
    :type file_path: String
    :returns: A TargetCollection containing Target instances.
    :rtype: TargetCollection
    :raises FileNotFoundError: If ``file_path`` does not point to a file.
    :raises ValueError: If a sentiment value is not -1, 0 or 1.
    '''
    file_path = os.path.abspath(file_path)
    if not os.path.isfile(file_path):
        raise FileNotFoundError('This file does not exist {}'.format(file_path))
    file_name, _ = os.path.splitext(os.path.basename(file_path))
    sentiment_range = [-1, 0, 1]

    sentiment_data = TargetCollection()
    with open(file_path, 'r') as dong_file:
        sent_dict = {}
        # The file cycles through three line kinds: text, target, sentiment.
        for index, line in enumerate(dong_file):
            divisible = index + 1
            line = line.strip()
            if divisible % 3 == 1:
                sent_dict['text'] = line
            elif divisible % 3 == 2:
                sent_dict['target'] = line
            elif divisible % 3 == 0:
                sentiment = int(line)
                if sentiment not in sentiment_range:
                    raise ValueError('The sentiment has to be one of the '\
                                     'following values {} not {}'\
                                     .format(sentiment_range, sentiment))
                sent_dict['sentiment'] = sentiment
                text = sent_dict['text'].lower()
                target = sent_dict['target'].lower()
                # The target is plain text, not a regular expression:
                # escape it so metacharacters (e.g. `$`, `.`, `(`) cannot
                # corrupt the spans or crash the search.
                offsets = [match.span() for match
                           in re.finditer(re.escape(target), text)]
                if len(target.split()) > 1:
                    # Multi-word targets may also appear with the spaces
                    # removed.
                    joined_target = ''.join(target.split())
                    offsets.extend([match.span() for match in
                                    re.finditer(re.escape(joined_target),
                                                text)])
                sent_dict['spans'] = offsets
                sent_id = file_name + str(len(sentiment_data))
                # Sentence ID is the same as the target as there is only one
                # target per sentence
                sent_dict['sentence_id'] = sent_id
                sent_dict['target_id'] = sent_id
                sent_target = Target(**sent_dict)
                sentiment_data.add(sent_target)
                sent_dict = {}
            else:
                # Defensive only: unreachable, the three residues above
                # cover every value of ``divisible % 3``.
                raise Exception('Problem')
    return sentiment_data
def test_targetcoll_add_preds(self):
    '''
    Tests the add_pred_sentiment function of TargetCollection
    '''
    # Collection storing Integer sentiment values
    target_example_int_0 = Target([(3, 5), (6, 8)], '1', 'Iphone',
                                  'text with Iphone', 1)
    target_example_int_1 = Target([(1, 5)], '3', 'Iphone',
                                  'text with Iphone', 1)
    target_example_int_2 = Target([(1, 2)], '2', 'Iphone',
                                  'text with Iphone', -1)
    target_col_int = TargetCollection([target_example_int_0,
                                       target_example_int_1,
                                       target_example_int_2])
    # Collection storing String sentiment values
    target_example_0 = Target([(3, 5), (6, 8)], '1', 'Iphone',
                              'text with Iphone', 'pos')
    target_example_1 = Target([(1, 5)], '3', 'Iphone',
                              'text with Iphone', 'pos')
    target_example_2 = Target([(1, 2)], '2', 'Iphone',
                              'text with Iphone', 'neg')
    target_col_str = TargetCollection([target_example_0, target_example_1,
                                       target_example_2])
    # Integer predicted sentiments are assigned in insertion order
    pred_sents = [2, 5, 4]
    target_col_int.add_pred_sentiment(pred_sents)
    self.assertEqual(5, target_col_int['3']['predicted'], msg='Predicted '\
                     'sentiment not set correctly for `id` 3 should be 5 '\
                     'and not {}'.format(target_col_int))
    # The collection must copy the Targets, so the originals stay untouched
    with self.assertRaises(KeyError, msg='The original Target instances '\
                           'predicted sentiment should not be set as the '\
                           'TargetCollection should have copied them.'):
        target_example_int_1['predicted']
    # String predicted sentiments work the same way
    pred_sents = ['neg', 'neu', 'pos']
    target_col_str.add_pred_sentiment(pred_sents)
    self.assertEqual('neu', target_col_str['3']['predicted'],
                     msg='Predicted '\
                     'sentiment not set correctly for `id` 3 should be `neu` '\
                     'and not {}'.format(target_col_str))
    # A mapper can convert predicted values before they are stored
    pred_sents = [1, 0, -1]
    target_col_str.add_pred_sentiment(pred_sents, mapper={1 : 'pos',
                                                          0 : 'neu',
                                                          -1 : 'neg'})
    self.assertEqual('neu', target_col_str['3']['predicted'],
                     msg='Predicted '\
                     'sentiment not set correctly for `id` 3 should be `neu` '\
                     'and not {} using the mapper'.format(target_col_str))
    with self.assertRaises(KeyError, msg='The original Target instances '\
                           'predicted sentiment should not be set as the '\
                           'TargetCollection should have copied them.'):
        target_example_1['predicted']
    #with self.assertRaises(TypeError, msg='Should only accept list type '\
    #                       'not tuples'):
    #    target_col_int.add_pred_sentiment((2, 5, 4))
    # The predictions list must match the size of the collection
    with self.assertRaises(ValueError, msg='Should accept lists that are '\
                           ' the same size as the TargetCollection'):
        target_col_int.add_pred_sentiment([1, 2, 3, 4])
def test_targetcoll_data(self):
    '''
    Test the data function of TargetCollection. `data` should return all
    stored Target instances in insertion order.
    '''
    target_col = TargetCollection()
    target_example_0 = Target([(3, 5), (6, 8)], '1', 'Iphone',
                              'text with Iphone', 1)
    target_example_1 = Target([(1, 5)], '3', 'Iphone', 'text with Iphone', 1)
    target_col.add(target_example_0)
    target_col.add(target_example_1)
    all_data = target_col.data()
    self.assertEqual(target_example_0, all_data[0], msg='First data '\
                     'returned should be the first inserted {} and not '\
                     '{}'.format(target_example_0, all_data[0]))
    self.assertEqual(target_example_1, all_data[1], msg='Second data '\
                     'returned should be the second inserted {} and not '\
                     '{}'.format(target_example_1, all_data[1]))
    # Removing an item and then adding another keeps insertion order
    target_example_2 = Target([(1, 2)], '2', 'Iphone', 'text with Iphone', 1)
    del target_col['1']
    target_col.add(target_example_2)
    all_data = target_col.data()
    self.assertEqual(target_example_1, all_data[0], msg='First data '\
                     'returned should be the second inserted {} and not '\
                     '{} as the first has been removed'\
                     .format(target_example_1, all_data[0]))
    self.assertEqual(target_example_2, all_data[1], msg='Second data '\
                     'returned should be the third inserted {} and not '\
                     '{} as the first has been removed'\
                     .format(target_example_2, all_data[1]))
    # Message fixed: previously rendered as `returnedshoudl` because of a
    # typo and a missing space in the concatenated literal.
    self.assertEqual(2, len(all_data), msg='The length of the data returned '\
                     'should be 2 and not {}'.format(len(all_data)))
def generate_stats(data_path: Path) -> Dict[str, Union[int, float]]:
    '''
    Computes summary statistics for a JSON lines encoded target dataset.

    :param data_path: Path to a file where each line is a JSON encoded
                      Target.
    :returns: A dictionary containing the absolute dataset `size`, the
              fraction of the dataset in the distinct-sentiment subsets,
              and the fraction of targets per sentiment value.
    '''
    target_data = []
    with data_path.open('r') as data_lines:
        for line in data_lines:
            line = json.loads(line)
            # JSON stores spans as lists; Target expects tuples.
            line['spans'] = [tuple(span) for span in line['spans']]
            target_data.append(Target(**line))
    target_data = TargetCollection(target_data)

    # `int` is the idiomatic zero-default factory (was `lambda: 0`).
    target_stats = defaultdict(int)
    data_size = len(target_data)
    target_stats['size'] = data_size
    # NOTE(review): only 1 and 2 distinct sentiments are counted here —
    # confirm the datasets never contain sentences with 3 distinct
    # sentiment values.
    for i in range(1, 3):
        target_stats[f'Distinct sentiment {i}'] = len(
            target_data.subset_by_sentiment(i))
    for data in target_data.data_dict():
        target_stats[data['sentiment']] += 1
    # Normalise every count, apart from the absolute size, into a fraction.
    for key, value in target_stats.items():
        if key == 'size':
            continue
        target_stats[key] = value / data_size
    return target_stats
def test_targetcoll_set(self):
    '''
    Test the __setitem__ function of TargetCollection
    '''
    target_example_0 = Target([(3, 5), (6, 8)], '1', 'Iphone',
                              'text with Iphone', 1)
    target_example_1 = Target([(1, 5)], '3', 'Iphone', 'text with Iphone', 1)
    target_example_2 = Target([(1, 2)], '2', 'Iphone', 'text with Iphone', 1)
    target_col = TargetCollection([target_example_0, target_example_1,
                                   target_example_2])
    target_example_3 = Target([(2, 4)], '5', 'new', 'new text', 0)
    target_example_4 = Target([(1, 3)], '6', 'another', 'another text', 1)
    target_example_5 = Target([(1, 3)], '7', 'another', 'another text', 1)
    target_diff_1 = Target([(4, 5)], '3', 'test', 'test text', 0)
    # Normal case adding a new value
    target_col['5'] = target_example_3
    self.assertEqual(target_col['5'], target_example_3, msg='Cannot add '\
                     'new value. store {} value added {}'\
                     .format(target_col, target_example_3))
    # If key already exists it cannot be added
    with self.assertRaises(KeyError, msg='Should not be able to add value '\
                           '{} as its key {} already exists {}'\
                           .format(target_diff_1, '3', target_col)):
        target_col['3'] = target_diff_1
    # A Target whose own id disagrees with the key is rejected
    with self.assertRaises(KeyError, msg='Value with a different `id` to '\
                           'the key should fail. Key {} Value {}'\
                           .format('7', target_example_4)):
        target_col['7'] = target_example_4
    # Should accept Target instance with no `id`
    del target_example_5['target_id']
    if 'target_id' in target_example_5:
        raise KeyError('{} should not contain `id` key'\
                       .format(target_example_5))
    target_col['8'] = target_example_5
def test_target_normlisation(self):
    '''
    Tests target_normalisation
    '''
    # Only dict (Target) instances are valid input
    with self.assertRaises(TypeError, msg='target_dict parameter has to '\
                           'be of type dict only'):
        target_normalisation(['anything'])
    # Fixtures covering multi-span targets, targets fused to neighbouring
    # words, targets at the start/end of the text and hashtag targets.
    test_values = [{'target_id':str(0), 'sentiment':-1,
                    'text':'This is a fake news articledd that is to represent a '\
                           'Tweet!!!! and it was an awful News Articless I think.',
                    'target':'news article', 'spans':[(15, 27), (83, 95)]},
                   {'target_id':str(1), 'sentiment':1,
                    'text':'I had a great ssDay however I did not get much '\
                           'work done in the days',
                    'target':'day', 'spans':[(16, 19), (64, 67)]},
                   {'target_id':str(2), 'sentiment':1,
                    'text':'I had a great ssDay however I did not get much '\
                           'work done in the days',
                    'target':'day', 'spans':[(16, 19)]},
                   {'target_id':str(3), 'sentiment':1,
                    'text':'Day however I did not get much done',
                    'target':'day', 'spans':[(0, 3)]},
                   {'target_id':str(4), 'sentiment':1,
                    'text':'however I did not get much done in the day',
                    'target':'day', 'spans':[(39, 42)]},
                   {'spans': [(47, 80)], 'target_id': '2',
                    'target': 'Core Processing Unit temperatures',
                    'text': 'Temperatures were ok but I was not tracking'\
                            ' in Core Processing Unit temperatures.',
                    'sentiment': 0},
                   {'spans': [(1, 14), (15, 29)], 'target_id': '8',
                    'target': 'britney spears',
                    'text': "#britneyspears Britney Spears 's new single "\
                            "-3' debuts at #1: video: congratulations are "\
                            "in order ..",
                    'sentiment': 0}]
    # Each expected result is a (normalised text, normalised target) pair
    valid_results = [('This is a fake news_article dd that is to represent '\
                      'a Tweet!!!! and it was an awful news_article ss I '\
                      'think.', 'news_article'),
                     ('I had a great ss day however I did not get much work'\
                      ' done in the day s', 'day'),
                     ('I had a great ss $day$ however I did not get much '\
                      'work done in the days', '$day$'),
                     ('day however I did not get much done', 'day'),
                     ('however I did not get much done in the day', 'day'),
                     ('Temperatures were ok but I was not tracking in '\
                      'Core_ProcessingUnittemperatures .',
                      'Core_ProcessingUnittemperatures'),
                     ("# britney_spears britney_spears 's new single "
                      "-3' debuts at #1: video: congratulations are in order ..",
                      'britney_spears')]
    test_values = [Target(**test_value) for test_value in test_values]
    for index, test_value in enumerate(test_values):
        test_result = target_normalisation(test_value)
        valid_result = valid_results[index]
        self.assertEqual(valid_result, test_result, msg='Results is '\
                         '{} and should be {}. Test value {}'\
                         .format(test_result, valid_result, test_value))
similarity_field = np.array(target_data[similarity_field_name]) if is_language_model and threshold: similarity_field = target_data['original_perplexity'] - similarity_field above_original_perplexity_index = np.argmax((similarity_field <= 0) + 0) similarity_field = similarity_field[:above_original_perplexity_index] if len(similarity_field) < k_similar: temp_k = len(similarity_field) elif (not is_language_model) and threshold: above_threshold_index = np.argmin((similarity_field >= threshold) + 0) similarity_field = similarity_field[:above_threshold_index] if len(similarity_field) < k_similar: temp_k = len(similarity_field) # For each of the filtered alternative targets it creates a json # like object that will be used to store it in a collection to then # save to a json file alternative_targets = target_data['alternative_targets'][:temp_k] for index, alternative_target in enumerate(alternative_targets): new_target_id = f'{target_id}_{index}' new_target_data = exchange_targets(target_data, alternative_target, new_target_id) # sanitizing the target dataset. new_target_data.pop('alternative_targets') new_target_data.pop(similarity_field_name) if is_language_model: new_target_data.pop('original_perplexity') new_target_dataset.append(Target(**new_target_data)) print(f'Size of the expanded dataset {len(new_target_dataset)}') new_target_dataset = TargetCollection(new_target_dataset) new_target_dataset.to_json_file(str(args.dataset_save_fp),cache=False)
def _semeval_extract_data(sentences, file_name, conflict=False,
                          sentence_ids_skip=None):
    '''
    Parses a SemEval XML `sentences` element into a TargetCollection.

    :param sentences: A `sentences` named element
    :param file_name: Name of the file being parsed
    :param conflict: Determine if to keep the target data that has a conflict \
    sentiment label.
    :param sentence_ids_skip: IDs of sentences that should be skipped
    :type sentences: xml.etree.ElementTree.Element
    :type file_name: String
    :type conflict: bool. Default False
    :type sentence_ids_skip: list. Default None
    :returns: A TargetCollection containing Target instances.
    :rtype: TargetCollection
    '''
    # Converts the sentiment tags from Strings to ints
    sentiment_mapper = {'conflict' : -2, 'negative' : -1,
                        'neutral' : 0, 'positive' : 1}

    def extract_aspect_terms(aspect_terms, sentence_id):
        '''
        :param aspect_terms: An aspectTerms element within the xml tree
        :param sentence_id: Id of the sentence that the aspects came from.
        :type aspect_terms: xml.etree.ElementTree.Element
        :type sentence_id: String
        :returns: A list of dictionaries containing id, span, sentiment and \
        target
        :rtype: list
        '''
        aspect_terms_data = []
        for index, aspect_term in enumerate(aspect_terms):
            aspect_term = aspect_term.attrib
            aspect_term_data = {}
            sentiment = sentiment_mapper[aspect_term['polarity']]
            # Drop `conflict` labelled aspects unless explicitly kept
            if sentiment == -2 and not conflict:
                continue
            aspect_id = '{}{}'.format(sentence_id, index)
            aspect_term_data['target_id'] = aspect_id
            # SemEval years differ: the attribute is `term` or `target`
            if 'term' in aspect_term:
                aspect_term_data['target'] = aspect_term['term']
            elif 'target' in aspect_term:
                aspect_term_data['target'] = aspect_term['target']
            else:
                raise KeyError('There is no `target` attribute in the opinions '\
                               'element {}'.format(aspect_term))
            aspect_term_data['sentiment'] = sentiment
            aspect_term_data['spans'] = [(int(aspect_term['from']),
                                          int(aspect_term['to']))]
            aspect_term_data['sentence_id'] = sentence_id
            # If the target is NULL then there is no target
            if aspect_term_data['target'] == 'NULL':
                continue
            aspect_terms_data.append(aspect_term_data)
        return aspect_terms_data

    def add_text(aspect_data, text):
        '''
        :param aspect_data: A list of dicts containing `span`, `target` and \
        `sentiment` keys.
        :param text: The text of the sentence that is associated to all of the \
        aspects in the aspect_data list
        :type aspect_data: list
        :type text: String
        :returns: The list of dicts in the aspect_data parameter but with a \
        `text` key with the value that the text parameter contains
        :rtype: list
        '''
        for data in aspect_data:
            data['text'] = text
        return aspect_data

    all_aspect_term_data = TargetCollection()
    for sentence in sentences:
        aspect_term_data = None
        text_index = None
        sentence_id = file_name + sentence.attrib['id']
        # Allow the parser to skip certain sentences
        if sentence_ids_skip is not None:
            if sentence.attrib['id'] in sentence_ids_skip:
                continue
        for index, data in enumerate(sentence):
            # Nested `sentence` tags indicate a malformed file
            if data.tag == 'sentence':
                raise Exception(sentence.attrib['id'])
            if data.tag == 'text':
                text_index = index
            elif data.tag == 'aspectTerms' or data.tag == 'Opinions':
                aspect_term_data = extract_aspect_terms(data, sentence_id)
        # Sentences without any aspects contribute nothing
        if aspect_term_data is None:
            continue
        if text_index is None:
            raise ValueError('A semeval sentence should always have text '\
                             'semeval file {} sentence id {}'\
                             .format(file_name, sentence.attrib['id']))
        sentence_text = sentence[text_index].text
        aspect_term_data = add_text(aspect_term_data, sentence_text)
        for aspect in aspect_term_data:
            sent_target = Target(**aspect)
            all_aspect_term_data.add(sent_target)
    return all_aspect_term_data
def test_target_constructor(self):
    '''
    Test that target constructor
    '''
    # Testing the spans types
    with self.assertRaises(TypeError, msg='Spans should be of type list'):
        Target('span', '1', 'Iphone', 'text with Iphone', 'Pos')
    with self.assertRaises(TypeError, msg='Spans should be list of tuples'):
        Target([1, 2], '1', 'Iphone', 'text with Iphone', 'Pos')
    with self.assertRaises(ValueError, msg='Spans should contain tuples of '\
                           'length 2'):
        Target([(1, 2, 3), (3, 4, 5)], '1', 'Iphone', 'text with Iphone',
               'Pos')
    with self.assertRaises(ValueError, msg='Spans should contain tuples of '\
                           'length 2'):
        Target([(1, 2), (3, 4, 5)], '1', 'Iphone', 'text with Iphone', 'Pos')
    with self.assertRaises(TypeError, msg='Spans should contain tuples of '\
                           'length 2 and are Ints'):
        Target([('1', '2')], '1', 'Iphone', 'text with Iphone', 'Pos')
    with self.assertRaises(TypeError, msg='Spans should contain tuples of '\
                           'length 2 and are Ints'):
        Target([(1, '2')], '1', 'Iphone', 'text with Iphone', 'Pos')
    with self.assertRaises(TypeError, msg='Spans should contain tuples of '\
                           'length 2 and are Ints'):
        Target([('1', 2)], '1', 'Iphone', 'text with Iphone', 'Pos')
    # Spans must be ordered (start strictly before end)
    with self.assertRaises(ValueError, msg='Spans should contain tuples of '\
                           'Ints where the first Int < second Int'):
        Target([(7, 5)], '1', 'Iphone', 'text with Iphone', 'Pos')
    with self.assertRaises(ValueError, msg='Spans should contain tuples of '\
                           'Ints where the first Int < second Int'):
        Target([(3, 5), (6, 6)], '1', 'Iphone', 'text with Iphone', 'Pos')
    # Testing that the spans work in a normal case
    Target([(3, 5), (6, 8)], '1', 'Iphone', 'text with Iphone', 'Pos')
    # Testing the target ID type
    with self.assertRaises(TypeError, msg='Target ID should be a String'):
        Target([(3, 5), (6, 8)], 1, 'Iphone', 'text with Iphone', 'Pos')
    # Testing target type
    with self.assertRaises(TypeError, msg='Target should be a String'):
        Target([(3, 5), (6, 8)], '1', ('Iphone',), 'text with Iphone', 'Pos')
    # Testing text type
    with self.assertRaises(TypeError, msg='Text should be a String'):
        Target([(3, 5), (6, 8)], '1', 'Iphone', ('text with Iphone',), 'Pos')
    # Testing sentiment type
    with self.assertRaises(TypeError, msg='Sentiment should be a String or '\
                           'Int'):
        Target([(3, 5), (6, 8)], '1', 'Iphone', 'text with Iphone', ('Pos',))
    # Testing the sentence_id type
    with self.assertRaises(TypeError, msg='sentence_id should be a String'):
        Target([(3, 5), (6, 8)], '1', 'Iphone', 'text with Iphone', 'pos',
               sentence_id=1)
    # Testing the sentiment type works as an Integer (Normal case)
    span = [(3, 5)]
    target = 'Iphone'
    sentiment = 1
    text = 'text with Iphone'
    target_id = '210#1'
    predicted = 0
    sentence_id = '210'
    target_example = Target(span, target_id, target, text, sentiment,
                            predicted, sentence_id)
    # Testing that the dictionary mapping is correct
    self.assertEqual(target_id, target_example['target_id'],
                     msg='The target ID should {} and not {}'\
                     .format(target_id, target_example['target_id']))
    self.assertEqual(text, target_example['text'],
                     msg='The text should be {} and not {}'\
                     .format(text, target_example['text']))
    self.assertEqual(sentiment, target_example['sentiment'],
                     msg='The sentiment should be {} and not {}'\
                     .format(sentiment, target_example['sentiment']))
    self.assertEqual(target, target_example['target'],
                     msg='The target should be {} and not {}'\
                     .format(target, target_example['target']))
    self.assertEqual(span, target_example['spans'],
                     msg='The spans should be {} and not {}'\
                     .format(span, target_example['spans']))
    self.assertEqual(predicted, target_example['predicted'],
                     msg='The predicted sentiment should be {} and not {}'\
                     .format(predicted, target_example['predicted']))
    self.assertEqual(sentence_id, target_example['sentence_id'],
                     msg='The sentence_id should be {} and not {}'\
                     .format(sentence_id, target_example['sentence_id']))
def hu_liu(file_path):
    '''
    Parser for the datasets from the following two papers (DOES NOT WORK):

    1. `A Holistic Lexicon-Based Approach to Opinion Mining \
    <https://www.cs.uic.edu/~liub/FBS/opinion-mining-final-WSDM.pdf>`_
    2. `Mining and Summarizing Customer Reviews \
    <https://www.cs.uic.edu/~liub/publications/kdd04-revSummary.pdf>`_

    Currently this does not work. This is due to the dataset not containing
    enough data to determine where the targets are in the text.

    :param file_path: The path to a file containing annotations in the format \
    of hu and liu sentiment datasets.
    :type file_path: String
    :returns: A TargetCollection containing Target instances.
    :rtype: TargetCollection
    '''
    file_path = os.path.abspath(file_path)
    file_name = os.path.basename(file_path)
    sentiment_data = TargetCollection()
    with open(file_path, 'r', encoding='cp1252') as annotations:
        for sentence_index, annotation in enumerate(annotations):
            # If it does not contain ## then not a sentence
            if '##' not in annotation:
                continue
            targets_text = annotation.split('##')
            if len(targets_text) > 2 or len(targets_text) < 1:
                raise ValueError('The annotation {} when split on `##` should '\
                                 'contain at least the sentence text and at'\
                                 ' most the text and the targets and not {}'\
                                 .format(annotation, targets_text))
            # If it just contains the sentence text then go to next
            elif len(targets_text) == 1:
                continue
            # No target annotations before the `##` marker
            elif targets_text[0].strip() == '':
                continue
            targets, text = targets_text
            targets = targets.strip()
            text = text.strip()
            sentence_id = file_name + '#{}'.format(sentence_index)
            targets = targets.split(',')
            for target_index, target in enumerate(targets):
                target = target.strip()
                # e.g. `price[+2]` -> sentiment +2; `[u]`/`[p]` mark
                # implicit targets which are skipped
                sentiment_match = re.search(r'\[[+-]\d\]$', target)
                is_implicit = re.search(r'\[[up]\]', target)
                if is_implicit:
                    print('Target {} is implicit {}'.format(target, text))
                    continue
                if not sentiment_match:
                    raise ValueError('Target {} does not have a corresponding'\
                                     ' sentiment value. annotation {}'\
                                     .format(target, annotation))
                target_text = target[:sentiment_match.start()].strip()
                sentiment_text = sentiment_match.group().strip().strip('[]')
                sentiment_value = int(sentiment_text)
                # NOTE(review): `target_text` is used as a raw regular
                # expression here; metacharacters in the annotation would
                # alter the match — presumably a literal search was intended.
                target_matches = list(re.finditer(target_text, text))
                if len(target_matches) != 1:
                    print('The Target {} can only occur once in the '\
                          'text {}'.format(target_text, text))
                    continue
                    # NOTE(review): unreachable — the `continue` above
                    # always fires first; left over from a stricter version.
                    raise ValueError('The Target {} can only occur once in the '\
                                     'text {}'.format(target_text, text))
                target_span = target_matches[0].span()
                target_id = sentence_id + '#{}'.format(target_index)
                data_dict = {}
                data_dict['spans'] = [target_span]
                data_dict['target'] = target_text
                data_dict['sentiment'] = sentiment_value
                data_dict['text'] = text
                data_dict['sentence_id'] = sentence_id
                data_dict['target_id'] = target_id
                sentiment_data.add(Target(**data_dict))
    return sentiment_data
def parse_tweet(tweet_data, anno_data, tweet_id):
    '''
    Converts one annotated tweet into a list of Target instances.

    NOTE(review): relies on names from an enclosing scope that is not
    visible here (`folder_name`, `include_dnr`, `include_additional`,
    `sentiment_mapper`) — presumably a closure inside a parser function;
    confirm against the enclosing definition.

    :param tweet_data: Dict with at least `content` and `entities` keys.
    :param anno_data: Dict with `items` (and possibly `additional_items`)
                      mapping entity ids to sentiment labels.
    :param tweet_id: Id of the tweet being parsed.
    :returns: A list of Target instances found in the tweet.
    '''
    def get_offsets(entity, tweet_text, target):
        # The annotated offsets are occasionally off by one; try small
        # shifts until the sliced text matches the target.
        offset_shifts = [0, -1, 1]
        from_offset = entity['offset']
        for offset_shift in offset_shifts:
            from_offset_shift = from_offset + offset_shift
            to_offset = from_offset_shift + len(target)
            offsets = [(from_offset_shift, to_offset)]
            offset_text = tweet_text[from_offset_shift : to_offset].lower()
            if offset_text == target.lower():
                return offsets
        raise ValueError('Offset {} does not match target text {}. Full '\
                         'text {}\nid {}'\
                         .format(from_offset, target, tweet_text, tweet_id))

    def fuzzy_target_match(tweet_text, target):
        # Search for the target with progressively looser patterns;
        # only a unique match is accepted.
        low_target = target.lower()
        target_searches = [low_target, r'[^\w]' + low_target,
                           r'[^\w]' + low_target + r'[^\w]',
                           low_target + r'[^\w]',
                           low_target.replace(' ', ''),
                           low_target.replace(" '", '')]
        for target_search in target_searches:
            target_matches = list(re.finditer(target_search,
                                              tweet_text.lower()))
            if len(target_matches) == 1:
                return target_matches
        # Known problem tweets that are silently skipped
        if tweet_id in set(['81211671026352128', '78689580104290305',
                            '81209490499960832']):
            return None
        if tweet_id == '75270720671973376' and target == 'kippers':
            return None
        if tweet_id == '65855178264686592' and target == 'tax':
            return None
        print(tweet_data)
        print(anno_data)
        raise ValueError('Cannot find the exact additional '\
                         'entity {} within the tweet {}'\
                         .format(target, tweet_text))

    target_instances = []
    tweet_id = str(tweet_id)
    tweet_text = tweet_data['content']
    target_ids = []
    # Parse all of the entities that have been detected automatically
    for entity in tweet_data['entities']:
        data_dict = {}
        target = entity['entity']
        target_ids.append(entity['id'])
        entity_id = str(entity['id'])
        data_dict['spans'] = get_offsets(entity, tweet_text, target)
        data_dict['target'] = entity['entity']
        data_dict['target_id'] = folder_name + tweet_id + '#' + entity_id
        data_dict['sentence_id'] = folder_name + tweet_id
        data_dict['sentiment'] = anno_data['items'][entity_id]
        # Optionally drop `doesnotapply` labelled entities
        if data_dict['sentiment'] == 'doesnotapply' and not include_dnr:
            continue
        # Convert from Strings to Integer
        data_dict['sentiment'] = sentiment_mapper[data_dict['sentiment']]
        data_dict['text'] = tweet_text
        target_instances.append(Target(**data_dict))
    # Parse all of the entities that have been selected by the user
    if include_additional:
        additional_data = anno_data['additional_items']
        if isinstance(additional_data, dict):
            for target, sentiment in additional_data.items():
                target_matches = fuzzy_target_match(tweet_text, target)
                if target_matches is None:
                    continue
                target_id = max(target_ids) + 1
                target_ids.append(target_id)
                # NOTE(review): `data_dict` is re-used from the loop above
                # rather than re-created; every key is overwritten so no
                # stale values leak, but a fresh dict would be clearer.
                data_dict['spans'] = [target_matches[0].span()]
                data_dict['target'] = target
                data_dict['sentiment'] = sentiment
                data_dict['text'] = tweet_text
                # NOTE(review): unlike the loop above these ids are not
                # prefixed with `folder_name` — confirm this asymmetry is
                # intentional.
                data_dict['sentence_id'] = tweet_id
                data_dict['target_id'] = tweet_id + '#' + str(target_id)
                target_instances.append(Target(**data_dict))
    return target_instances
def test_dependency_context(self):
    '''
    Tests dependency_context

    Builds Target fixtures (text, target word, and the character spans where
    the target occurs), runs them through ``dependency_context`` with the
    module-level ``tweebo`` parser, and compares each returned context dict's
    ``text`` and ``span`` against hand-computed expectations.

    Three scenarios are covered in order:
    1. targets mentioned at multiple spans (and assorted real-tweet edge
       cases: hashtags, @-mentions, punctuation-bearing targets),
    2. ``lower=True`` lower-casing of the returned context text,
    3. a target word appearing twice in the text while only one span is
       listed as relevant.

    NOTE(review): ``tweebo`` is presumably a TweeboParser callable defined at
    module level — confirm against the file header (not visible here).
    '''
    # Test the multiple span cases
    test_values = [{'target_id':str(0), 'sentiment':-1,
                    'text':'This is a fake news articledd that is to represent a '\
                           'Tweet!!!! and it was an awful News Articless I think.',
                    'target':'news article', 'spans':[(15, 27), (83, 95)]},
                   {'target_id':str(1), 'sentiment':1,
                    'text':'I had a great Day however I did not get much '\
                           'work done in the days',
                    'target':'day', 'spans':[(14, 17), (62, 65)]},
                   {'spans': [(1, 14), (15, 29)], 'target_id': '8',
                    'target': 'britney spears',
                    'text': "#britneyspears Britney Spears 's new single "\
                            "-3' debuts at #1: video: congratulations are "\
                            "in order ..",
                    'sentiment': 0},
                   {'spans': [(5, 14)], 'target_id': '9',
                    'target': 'wait time',
                    'text': "less wait time for me!", 'sentiment': 0},
                   {'spans': [(81, 91)], 'target_id': '9',
                    'target': '@RichardWS',
                    'text': "Lower taxes rendered null & void with the "\
                            "planned increase in VAT by the tories. "\
                            "@RichardWS #battlefornumber10 @RichardWS",
                    'sentiment': 0},
                   {'spans': [(46, 57)], 'target_id': '9',
                    'target': '@Shaun_XL5:',
                    'text': "< My main #GE2015 policy worry is now defence "\
                            "@Shaun_XL5: I think we need Nato target of 2% "\
                            "GDP with hardcore auditing @paullewismoney",
                    'sentiment': 0},
                   {'spans': [(102, 119)], 'target_id': '9',
                    'target': 'first-time buyers',
                    'text': "Despite spin, no new government money for "\
                            "#housing: transferring resources from affordable"\
                            " renting to first-time buyers. #GE2015 #GE15",
                    'sentiment': 0},
                   {'spans': [(130, 140)], 'target_id': '9',
                    'target': '@UNICEF_uk!',
                    'text': "Violence against children around the world is "\
                            "like a hidden epidemic. Michael Sheen talking "\
                            "well abt SDGs on #marrshow Nice work @UNICEF_uk!",
                    'sentiment': 0},
                   # NOTE(review): the original tweet text appears to contain a
                   # literal line break before `#GE15`; it is reconstructed here
                   # as an explicit `\n` escape — confirm against the repo copy.
                   {'spans': [(21, 32)], 'target_id': '9',
                    'target': 'Tony #Blair',
                    'text': "Corrupt war criminal Tony #Blair's rats are "\
                            "running amok inside Labour today. If I was "\
                            "thinking of voting Labour today. I'd stop. \n#GE15",
                    'sentiment': 0},
                   {'spans': [(39, 49)], 'target_id': '9',
                    'target': '@UKLabour;',
                    'text': "Day 1 of #GE2015 & I'm already sick of "\
                            "@UKLabour; their lies & their forgetfulness. "\
                            "Whose policies created the need for austerity?"\
                            " Yes. Lab!",
                    'sentiment': 0},
                   {'spans': [(33, 46)], 'target_id': '9',
                    'target': '@GrantShapps’',
                    'text': "Lest Cameron forgets: Police say @GrantShapps’ "\
                            "firm sales “may constitute offence of fraud” "\
                            "#bbcdp #pmqs : http://t.co/Gu9Ke6sRtX",
                    'sentiment': 0}]
    # Expected contexts: one dict (text + span of the target inside that
    # context text) per relevant mention of each target.
    valid_results = [[{'text' : 'This is a fake news article',
                       'span' : (15, 27)},
                      {'text' : 'dd that is to represent a Tweet and it was '\
                                'an awful news article', 'span' : (52, 64)}],
                     [{'text' : 'I had a great day however I did not get '\
                                'much work done in the day',
                       'span' : (14, 17)},
                      {'text' : 'I had a great day however I did not get '\
                                'much work done in the day',
                       'span' : (62, 65)}],
                     [{'text' : "britney spears britney spears",
                       'span' : (0, 14)},
                      {'text' : "britney spears britney spears",
                       'span' : (15, 29)}],
                     [{'text' : 'less wait time for me', 'span' : (5, 14)}],
                     [{'text' : '@RichardWS', 'span' : (0, 10)}],
                     [{'text' : "My main #GE2015 policy worry is now defence"\
                                " @Shaun_XL5:", 'span' : (44, 55)}],
                     [{'text' : "transferring resources from affordable "\
                                "renting to first-time buyers",
                       'span' : (50, 67)}],
                     [{'text' : "@UNICEF_uk!", 'span' : (0, 11)}],
                     [{'text' : "criminal Tony #Blair", 'span' : (9, 20)}],
                     [{'text' : "Day 1 of #GE2015 & I'm already sick of "\
                                "@UKLabour; their lies & their forgetfulness",
                       'span' : (39, 49)}],
                     [{'text' : "Lest Cameron forgets Police say "\
                                "@GrantShapps’ firm sales", 'span' : (32, 45)}]]
    test_values = [Target(**test_value) for test_value in test_values]
    test_results = dependency_context(test_values, tweebo)
    # Compare context-by-context: text first (easier to diagnose), then span.
    for index, valid_result in enumerate(valid_results):
        test_result = test_results[index]
        for dict_index, valid_dict in enumerate(valid_result):
            test_dict = test_result[dict_index]
            self.assertEqual(valid_dict['text'], test_dict['text'],
                             msg='texts are different correct `{}` test `{}`'\
                             .format(valid_dict['text'], test_dict['text']))
            self.assertEqual(valid_dict['span'], test_dict['span'],
                             msg='spans are different correct `{}` test `{}`'\
                                 ' text `{}`'.format(valid_dict['span'],
                                                     test_dict['span'],
                                                     test_dict['text']))
    # Test the lower casing case of the text and if the target is upper case
    test_values = [{'target_id':str(0), 'sentiment':-1,
                    'text':'This is a fake news articledd that is to represent a '\
                           'Tweet!!!! and it was an awful News Articless I think.',
                    'target':'news article', 'spans':[(15, 27), (83, 95)]},
                   {'target_id':str(0), 'sentiment':-1,
                    'text':'This is a fake news articledd that is to represent a '\
                           'Tweet!!!! and it was an awful News Articless I think.',
                    'target':'News Article', 'spans':[(15, 27), (83, 95)]},
                   {'target_id':str(1), 'sentiment':1,
                    'text':'I had a great Day however I did not get much '\
                           'work done in the days',
                    'target':'day', 'spans':[(14, 17), (62, 65)]}]
    valid_results = [[{'text' : 'this is a fake news article',
                       'span' : (15, 27)},
                      {'text' : 'dd that is to represent a tweet and it was '\
                                'an awful news article', 'span' : (52, 64)}],
                     [{'text' : 'this is a fake news article',
                       'span' : (15, 27)},
                      {'text' : 'dd that is to represent a tweet and it was '\
                                'an awful news article', 'span' : (52, 64)}],
                     [{'text' : 'i had a great day however i did not get '\
                                'much work done in the day',
                       'span' : (14, 17)},
                      {'text' : 'i had a great day however i did not get '\
                                'much work done in the day',
                       'span' : (62, 65)}]]
    test_values = [Target(**test_value) for test_value in test_values]
    test_results = dependency_context(test_values, tweebo, lower=True)
    for index, valid_result in enumerate(valid_results):
        test_result = test_results[index]
        for dict_index, valid_dict in enumerate(valid_result):
            test_dict = test_result[dict_index]
            self.assertEqual(valid_dict['text'], test_dict['text'],
                             msg='texts are different correct `{}` test `{}`'\
                             .format(valid_dict['text'], test_dict['text']))
            self.assertEqual(valid_dict['span'], test_dict['span'],
                             msg='spans are different correct `{}` test `{}`'\
                             .format(valid_dict['span'], test_dict['span']))
    # Test the case where the target is mentioned twice but only 1 is relevant
    # to one of the mentions
    test_values = [{'target_id':str(1), 'sentiment':1,
                    'text':'I had a great Day however I did not get much '\
                           'work done in the day',
                    'target':'day', 'spans':[(14, 17)]},
                   {'target_id':str(3), 'sentiment':1,
                    'text':'I had a great Day however I did not get much '\
                           'work done in the Day',
                    'target':'day', 'spans':[(14, 17)]},
                   {'spans': [(47, 80)], 'target_id': '2',
                    'target': 'Core Processing Unit temperatures',
                    'text': 'Temperatures were ok but I was not tracking'\
                            ' in Core Processing Unit temperatures.',
                    'sentiment': 0},
                   {'text' : 'I also recommend the rice dishes or the '\
                             'different varieties of congee (rice porridge).',
                    'spans' : [(63, 85)], 'target_id' : '4',
                    'target' : 'congee (rice porridge)', 'sentiment' : 1}]
    valid_results = [[{'text' : 'I had a great day however I did not get '\
                                'much work done in the day',
                       'span' : (14, 17)}],
                     [{'text' : 'I had a great day however I did not get '\
                                'much work done in the Day',
                       'span' : (14, 17)}],
                     [{'text' : 'Temperatures were ok but I was not tracking'\
                                ' in Core Processing Unit temperatures',
                       'span' : (47, 80)}],
                     [{'text' : 'I also recommend the rice dishes or the '\
                                'different varieties of congee (rice '\
                                'porridge)', 'span' : (63, 85)}]]
    test_values = [Target(**test_value) for test_value in test_values]
    test_results = dependency_context(test_values, tweebo)
    for index, valid_result in enumerate(valid_results):
        test_result = test_results[index]
        for dict_index, valid_dict in enumerate(valid_result):
            test_dict = test_result[dict_index]
            self.assertEqual(valid_dict['text'], test_dict['text'],
                             msg='texts are different correct `{}` test `{}`'\
                             .format(valid_dict['text'], test_dict['text']))
            self.assertEqual(valid_dict['span'], test_dict['span'],
                             msg='spans are different correct `{}` test `{}`'\
                             .format(valid_dict['span'], test_dict['span']))
def test_targetcoll_sent_data(self):
    '''
    Test the sentiment_data function of TargetCollection

    Covers: plain retrieval of sentiment values (int and str), the
    ``mapper`` dict conversion (including its TypeError/ValueError
    validation), and the ``sentiment_field`` parameter for reading
    predicted sentiments instead of gold ones.
    '''
    def build_collection(*sentiment_values):
        # Build the standard three-Target fixture (ids '1', '3', '2' in
        # insertion order) with the supplied sentiment values.
        fixtures = [([(3, 5), (6, 8)], '1'),
                    ([(1, 5)], '3'),
                    ([(1, 2)], '2')]
        members = [Target(spans, target_id, 'Iphone', 'text with Iphone',
                          sentiment)
                   for (spans, target_id), sentiment in zip(fixtures,
                                                            sentiment_values)]
        return TargetCollection(members)

    target_col_int = build_collection(1, 1, -1)
    target_col_str = build_collection('pos', 'pos', 'neg')

    # Basic case: sentiments come back untouched, in insertion order.
    test_sentiments = target_col_int.sentiment_data()
    valid_sentiments = [1, 1, -1]
    self.assertEqual(valid_sentiments, test_sentiments, msg='The Integer '\
                     'sentiments returned should be {} and not {}'\
                     .format(valid_sentiments, test_sentiments))
    test_sentiments = target_col_str.sentiment_data()
    valid_sentiments = ['pos', 'pos', 'neg']
    self.assertEqual(valid_sentiments, test_sentiments, msg='The String '\
                     'sentiments returned should be {} and not {}'\
                     .format(valid_sentiments, test_sentiments))

    # A mapper dict converts every sentiment value on the way out.
    test_sentiments = target_col_str.sentiment_data(mapper={'pos' : 1,
                                                            'neg' : -1})
    valid_sentiments = [1, 1, -1]
    self.assertEqual(valid_sentiments, test_sentiments, msg='The String '\
                     'sentiments should be mapped to Integers. Valid {} '\
                     'not {}'.format(valid_sentiments, test_sentiments))
    test_sentiments = target_col_int.sentiment_data(mapper={1 : 'pos',
                                                            -1 : 'neg'})
    valid_sentiments = ['pos', 'pos', 'neg']
    self.assertEqual(valid_sentiments, test_sentiments, msg='The Integer '\
                     'sentiments should be mapped to String. Valid {} '\
                     'not {}'.format(valid_sentiments, test_sentiments))

    # Mapper validation errors.
    with self.assertRaises(TypeError, msg='Should only accept dict mapper'):
        target_col_int.sentiment_data(mapper=[(1, 'pos'), (-1, 'neg')])
    with self.assertRaises(ValueError, msg='Mapper should refuse dicts that'\
                           ' may have valid mappings but not all the mappings'):
        target_col_int.sentiment_data(mapper={1 : 'pos'})
    with self.assertRaises(ValueError, msg='Mapper should refuse dicts that'\
                           ' contain the correct number of mappings but not '\
                           'the correct mappings'):
        target_col_int.sentiment_data({0 : 'pos', -1 : 'neg'})
    with self.assertRaises(ValueError, msg='Mapper should refuse dicts that '\
                           'have all the correct mappings but contain some '\
                           'in-correct mappings'):
        target_col_int.sentiment_data(mapper={1 : 'pos', -1 : 'neg',
                                              0 : 'neu'})

    # sentiment_field selects the predicted sentiment instead of the gold one.
    predicted_targets = [Target([(3, 5), (6, 8)], '1', 'Iphone',
                                'text with Iphone', 'pos', 'neg'),
                         Target([(1, 5)], '3', 'Iphone', 'text with Iphone',
                                'pos', 'neu'),
                         Target([(1, 2)], '2', 'Iphone', 'text with Iphone',
                                'neg', 'pos')]
    target_col = TargetCollection(predicted_targets)
    test_sentiments = target_col.sentiment_data(sentiment_field='predicted')
    valid_sentiments = ['neg', 'neu', 'pos']
    self.assertEqual(valid_sentiments, test_sentiments, msg='The predicted '\
                     'sentiments returned should be {} and not {}'\
                     .format(valid_sentiments, test_sentiments))
def test_dependency_relation_context(self):
    '''
    Tests dependency_relation_context

    Checks the default first-relation behaviour, the ``n_relations=(1, -1)``
    all-relations behaviour, lower casing, and the case where the target
    word appears more than once but only one mention is listed as relevant.
    '''
    def check(expected_contexts, actual_contexts,
              template='Incorrect context correct {} test {}'):
        # Compare every expected relation-context list against the one
        # produced for the Target at the same position.
        for position, expected in enumerate(expected_contexts):
            actual = actual_contexts[position]
            self.assertEqual(expected, actual,
                             msg=template.format(expected, actual))

    test_values = [{'target_id': str(0), 'sentiment': -1,
                    'text': 'This is a fake news articledd that is to represent a '
                            'Tweet!!!! and it was an awful News Articless I think.',
                    'target': 'news article', 'spans': [(15, 27), (83, 95)]},
                   {'target_id': str(1), 'sentiment': 1,
                    'text': 'I had a great Day however I did not get much '
                            'work done in the days',
                    'target': 'day', 'spans': [(14, 17), (62, 65)]},
                   {'target_id': str(2), 'sentiment': 1,
                    'text': 'this is an alternative sentence to see how it '
                            'performs',
                    'target': 'sentence', 'spans': [(23, 31)]},
                   {'target_id': str(3), 'sentiment': 1,
                    'text': 'Teen pop star heartthrobe is all the rage on '
                            'social media',
                    'target': 'teen', 'spans': [(0, 4)]}]
    targets = [Target(**value) for value in test_values]

    # All dependency relations from the first one onwards.
    check([['a fake', 'an awful'], ['a great', 'the'],
           ['an alternative to see performs it how'], ['']],
          dependency_relation_context(targets, tweebo, n_relations=(1, -1)))
    # Default behaviour: only the first dependency relation.
    check([['a fake', 'an awful'], ['a great', 'the'],
           ['an alternative to'], ['']],
          dependency_relation_context(targets, tweebo))
    # Words are lower cased before processing when requested.
    check([['a fake', 'an awful'], ['a great', 'the'],
           ['an alternative to see performs it how'], ['']],
          dependency_relation_context(targets, tweebo, True, (1, -1)))

    # Target mentioned more than once, only the first mention is relevant.
    repeat_values = [{'target_id': str(1), 'sentiment': 1,
                      'text': 'I had a great Day however I did not get much '
                              'work done in the day',
                      'target': 'day', 'spans': [(14, 17)]}]
    repeat_targets = [Target(**value) for value in repeat_values]
    check([['a great']],
          dependency_relation_context(repeat_targets, tweebo,
                                      n_relations=(1, -1)),
          template='Incorrect context for more than one mention '
                   'correct {} test {}')
def test_target_coll_subset_by_sent(self):
    '''
    Test the subset_by_sentiment function of TargetCollection

    Builds sentences ('4'-'8') whose targets carry 1, 2 or 3 unique
    sentiment values and checks that subset_by_sentiment keeps exactly the
    targets belonging to sentences with the requested number of unique
    sentiments.
    '''
    # (spans, target_id, sentiment, sentence_id) per fixture Target.
    # Unique sentiments per sentence: '4' -> 2, '5' -> 1, '6' -> 3,
    # '7' -> 1, '8' -> 2.
    fixture_data = [([(3, 5), (6, 8)], '1', 'pos', '4'),
                    ([(1, 5)], '3', 'neg', '4'),
                    ([(1, 2)], '2', 'neg', '5'),
                    ([(1, 2)], '4', 'neg', '5'),
                    ([(1, 2)], '5', 'pos', '6'),
                    ([(1, 2)], '6', 'neu', '6'),
                    ([(1, 2)], '7', 'neg', '6'),
                    ([(1, 2)], '8', 'neg', '7'),
                    ([(1, 2)], '9', 'neg', '8'),
                    ([(1, 2)], '10', 'neg', '8'),
                    ([(1, 2)], '11', 'pos', '8')]
    targets = [Target(spans, target_id, 'Iphone', 'text with Iphone',
                      sentiment, sentence_id=sentence_id)
               for spans, target_id, sentiment, sentence_id in fixture_data]
    target_col = TargetCollection(targets)

    # Required unique-sentiment count -> indices of the fixtures that
    # should survive the subset.
    subset_cases = [(2, [0, 1, 8, 9, 10]),
                    (1, [7, 2, 3]),
                    (3, [4, 5, 6])]
    for unique_sentiments, kept_indices in subset_cases:
        test_col = target_col.subset_by_sentiment(unique_sentiments)
        valid_col = TargetCollection([targets[i] for i in kept_indices])
        self.assertEqual(valid_col, test_col, msg='Should only return these {}'\
                         ' but has returned this {}'.format(valid_col,
                                                            test_col))