Пример #1
0
    def test_target_set(self):
        '''
        Checks item assignment on a Target: the `predicted` key can hold an
        Int or a String, and assigning to the `sentiment` key must raise a
        KeyError.
        '''

        # `predicted` stored as an Int
        int_target = Target([(3, 5), (6, 8)], '1', 'Iphone',
                            'text with Iphone', 1)
        int_target['predicted'] = 0
        stored_int = int_target['predicted']
        self.assertEqual(0, stored_int,
                         msg='Predicted sentiment value should be 0 and not {}'
                         .format(stored_int))

        # `predicted` stored as a String
        string_target = Target([(3, 5), (6, 8)], '1', 'Iphone',
                               'text with Iphone', 'pos')
        string_target['predicted'] = 'neg'
        stored_string = string_target['predicted']
        self.assertEqual('neg', stored_string,
                         msg='Predicted sentiment value should be `neg` and '
                             'not {}'.format(stored_string))

        # Any key that is not `predicted` has to be rejected
        forbidden_msg = 'Should not allow you to set keys other than `predicted`'
        with self.assertRaises(KeyError, msg=forbidden_msg):
            int_target['sentiment'] = 0
Пример #2
0
    def test_targetcoll_get(self):
        '''
        Checks key based look-up on a TargetCollection, both through
        `__getitem__` and through the `get` method.
        '''

        first = Target([(3, 5), (6, 8)], '1', 'Iphone', 'text with Iphone', 1)
        second = Target([(1, 5)], '3', 'Iphone', 'text with Iphone', 1)
        third = Target([(1, 2)], '2', 'Iphone', 'text with Iphone', 1)
        collection = TargetCollection([first, second, third])

        # Keys that exist in the collection
        key_template = 'Cannot access data using keys. key used {} collection {}'
        self.assertEqual(third, collection['2'],
                         msg=key_template.format('2', collection))
        self.assertEqual(second, collection.get('3'),
                         msg='Cannot access data using the get method.')

        # `get` on a key that does not exist falls back to None
        missing_value = collection.get('5')
        self.assertEqual(None, missing_value,
                         msg='Default value for get not working should be '
                             'None not {}'.format(missing_value))

        # Indexing with a key that does not exist has to raise a KeyError
        missing_msg = 'Should produce a key error when the data does not exist'
        with self.assertRaises(KeyError, msg=missing_msg):
            collection['5']
Пример #3
0
    def test_targetcoll_add(self):
        '''
        Test the add function of TargetCollection

        Covers three cases: adding a valid Target, adding a plain dict
        (expects TypeError) and adding a Target whose `target_id` key has
        been removed (expects ValueError).
        '''

        target_col = TargetCollection()
        target_example_0 = Target([(3, 5), (6, 8)], '1', 'Iphone',
                                  'text with Iphone', 1)
        target_example_1 = Target([(1, 5)], '3', 'Iphone',
                                  'text with Iphone', 1)
        # Ensure the normal case works
        target_col.add(target_example_0)
        self.assertEqual(target_col['1'], target_example_0,
                         msg='Test that {} has been added to {}'
                         .format(target_example_0, target_col))

        with self.assertRaises(TypeError, msg='Should not be able to add a dict'):
            target_col.add({'target_id' : '2'})

        # The fixture is prepared outside of the assertRaises block so that
        # only the `add` call itself can satisfy the expected ValueError; an
        # error raised while preparing the fixture must fail the test.
        del target_example_1['target_id']
        if 'target_id' in target_example_1:
            raise KeyError('{} should not contain `id` key'
                           .format(target_example_1))
        with self.assertRaises(ValueError, msg='Should not be able to add a '
                               'Target that has no `id`'):
            target_col.add(target_example_1)
Пример #4
0
    def test_targetcoll_constructor(self):
        '''
        Checks which arguments the TargetCollection constructor accepts and
        which ones it rejects.
        '''

        valid_target = Target([(3, 5), (6, 8)], '1', 'Iphone',
                              'text with Iphone', 1)

        # Arguments that are expected to be rejected with a TypeError
        rejected_arguments = [
            (1, 'The constructor should only accept an interator as an '
                'argument'),
            ([1, 2, 3, 4], 'The constructor should only accept an interator '
                           'of Target instances'),
        ]
        for argument, failure_msg in rejected_arguments:
            with self.assertRaises(TypeError, msg=failure_msg):
                TargetCollection(argument)

        # None of these constructions should raise
        TargetCollection([])
        TargetCollection([valid_target])
        TargetCollection()

        # Two Targets sharing the same id (`1`) cannot live in one collection
        other_target = Target([(3, 4)], '2', 'Keys', 'text with Keys', -1)
        clashing_target = Target([(3, 10)], '1', 'Pixel',
                                 'text with Pixels', 0)
        duplicate_msg = ('Should raise an error as two of the target '
                         'instances have the same key')
        with self.assertRaises(KeyError, msg=duplicate_msg):
            TargetCollection([valid_target, other_target, clashing_target])
Пример #5
0
    def test_target_eq(self):
        '''
        Checks the equality rules of Target: same id, same minimum keys when
        the id is missing, deep copies, and inequality with a plain dict.
        '''

        with_id_1 = Target([(3, 5), (6, 8)], '1', 'Iphone',
                           'text with Iphone', 1)
        with_id_3 = Target([(1, 5)], '3', 'Iphone', 'text with Iphone', 1)
        with_id_2 = Target([(1, 2)], '2', 'Iphone', 'text with Iphone', 1)
        other_with_id_1 = Target([(1, 2)], '1', 'S8',
                                 'text with Samsung S8', 1)

        # Sharing an id is enough to be equal, whatever the other fields hold
        self.assertEqual(with_id_1, other_with_id_1,
                         msg='Should be equal as they have the same ID `1`')

        # A copy stripped of its id is compared on the remaining keys
        without_id = copy.deepcopy(with_id_3)
        del without_id['target_id']
        self.assertEqual(with_id_3, without_id,
                         msg='Should be equal as they have the same minimum '
                             'keys')

        # An untouched deep copy is equal to its source
        full_copy = copy.deepcopy(with_id_2)
        self.assertEqual(with_id_2, full_copy,
                         msg='Copies of the same Target instance should be '
                             'equal')

        # A plain dict never equals a Target, even with matching fields
        plain_dict = {'target' : 'Iphone', 'sentiment' : 1,
                      'spans' : [(1, 5)], 'text' : 'text with Iphone'}
        dict_is_equal = plain_dict == with_id_3
        self.assertEqual(False, dict_is_equal,
                         msg='Should not accept dicts even with the same '
                             'minimum_keys')
Пример #6
0
    def extract_targets(current_target, end_span, start_span, targets,
                        target_spans, target_index, tweet_text, sentiment_data,
                        tweet_id, target_sentiments):
        '''
        Flushes the target that is still being built (if any) and then adds
        one Target per collected target to ``sentiment_data``.

        Note that ``targets`` and ``target_spans`` are modified in place when
        ``current_target`` is not empty.

        :param current_target: Tokens of the target currently being built; \
        they are joined with a single space. May be empty.
        :param end_span: Not read; the end offset is always recomputed from \
        ``start_span``. Kept so that existing callers do not break.
        :param start_span: Character offset where ``current_target`` starts.
        :param targets: Target strings collected so far.
        :param target_spans: ``(start, end)`` offsets, one per target.
        :param target_index: Not read. Kept so that existing callers do not \
        break.
        :param tweet_text: Tokens of the tweet; joined with a single space.
        :param sentiment_data: Object with an ``add`` method that receives \
        each Target (presumably a TargetCollection).
        :param tweet_id: Identifier of the tweet, used as ``sentence_id`` and \
        as prefix of each ``target_id``.
        :param target_sentiments: Sentiment of each target, indexed like \
        ``targets``.
        :returns: ``sentiment_data`` after the Targets have been added.
        :raises ValueError: If the text found at a span is not the target.
        '''

        # A target can still be "open" when the end of the tweet is reached
        if current_target:
            target_word = ' '.join(current_target)
            flushed_span = (start_span, start_span + len(target_word))
            targets.append(target_word)
            target_spans.append(flushed_span)

        tweet_text = ' '.join(tweet_text)
        for index, target in enumerate(targets):
            target_sentiment = target_sentiments[index]
            target_span = target_spans[index]
            # The offsets have to point at the target within the joined text
            if tweet_text[target_span[0] : target_span[1]] != target:
                raise ValueError('The target span {} does not match the '
                                 'target word {} in {}'
                                 .format(target_span, target, tweet_text))
            target_data = {'spans' : [target_span],
                           'target_id' : '{}#{}'.format(tweet_id, index),
                           'target' : target, 'text' : tweet_text,
                           'sentiment' : target_sentiment,
                           'sentence_id' : tweet_id}
            sentiment_data.add(Target(**target_data))

        return sentiment_data
Пример #7
0
    def test_targetcoll_stored_sent(self):
        '''
        Checks that `stored_sentiments` of TargetCollection returns the set
        of distinct sentiment values held by its Targets.
        '''

        positive_a = Target([(3, 5), (6, 8)], '1', 'Iphone',
                            'text with Iphone', 1)
        positive_b = Target([(1, 5)], '3', 'Iphone', 'text with Iphone', 1)
        negative = Target([(1, 2)], '2', 'Iphone', 'text with Iphone', -1)
        collection = TargetCollection([positive_a, positive_b, negative])

        expected = {1, -1}
        found = collection.stored_sentiments()
        failure_template = ('The unique sentiments in the TargetCollection '
                            'should be {} and not {}')
        self.assertEqual(expected, found,
                         msg=failure_template.format(expected, found))
Пример #8
0
def dong(file_path):
    '''
    Given file path to the
    `Li Dong <https://github.com/bluemonk482/tdparse/tree/master/data/lidong>`_
    sentiment data it will parse the data and return it as a TargetCollection.

    The file is read in groups of three lines: the text, the target and the
    sentiment (one of -1, 0 or 1). The target is searched for in the text
    case insensitively; when the target has more than one word, occurrences
    of the words written without whitespace between them are searched as
    well.

    :param file_path: File Path to the annotated data
    :type file_path: String
    :returns: A TargetCollection containing Target instances.
    :rtype: TargetCollection
    :raises FileNotFoundError: If ``file_path`` is not an existing file.
    :raises ValueError: If a sentiment line is not an Integer or is not one \
    of -1, 0 or 1.
    '''

    file_path = os.path.abspath(file_path)
    if not os.path.isfile(file_path):
        raise FileNotFoundError('This file does not exist {}'.format(file_path))
    file_name, _ = os.path.splitext(os.path.basename(file_path))
    sentiment_range = [-1, 0, 1]

    sentiment_data = TargetCollection()
    with open(file_path, 'r') as dong_file:
        sent_dict = {}
        for index, line in enumerate(dong_file):
            line = line.strip()
            # Position of the line within its group of three lines
            position = index % 3
            if position == 0:
                sent_dict['text'] = line
            elif position == 1:
                sent_dict['target'] = line
            else:
                sentiment = int(line)
                if sentiment not in sentiment_range:
                    raise ValueError('The sentiment has to be one of the '
                                     'following values {} not {}'
                                     .format(sentiment_range, sentiment))
                sent_dict['sentiment'] = sentiment
                text = sent_dict['text'].lower()
                target = sent_dict['target'].lower()
                # The target is literal text and not a pattern, therefore it
                # is escaped so that characters such as `+` or `(` cannot
                # break or alter the search.
                offsets = [match.span()
                           for match in re.finditer(re.escape(target), text)]
                target_words = target.split()
                if len(target_words) > 1:
                    joined_target = ''.join(target_words)
                    offsets.extend([match.span() for match in
                                    re.finditer(re.escape(joined_target), text)])
                sent_dict['spans'] = offsets
                sent_id = file_name + str(len(sentiment_data))
                # Sentence ID is the same as the target as there is only one
                # target per sentence
                sent_dict['sentence_id'] = sent_id
                sent_dict['target_id'] = sent_id
                sentiment_data.add(Target(**sent_dict))
                sent_dict = {}
    return sentiment_data
Пример #9
0
    def test_targetcoll_add_preds(self):
        '''
        Checks `add_pred_sentiment` of TargetCollection with Int labels,
        String labels, a label mapper and a prediction list of the wrong size.
        '''

        int_first = Target([(3, 5), (6, 8)], '1', 'Iphone',
                           'text with Iphone', 1)
        int_second = Target([(1, 5)], '3', 'Iphone', 'text with Iphone', 1)
        int_third = Target([(1, 2)], '2', 'Iphone', 'text with Iphone', -1)
        int_collection = TargetCollection([int_first, int_second, int_third])

        str_first = Target([(3, 5), (6, 8)], '1', 'Iphone',
                           'text with Iphone', 'pos')
        str_second = Target([(1, 5)], '3', 'Iphone', 'text with Iphone', 'pos')
        str_third = Target([(1, 2)], '2', 'Iphone', 'text with Iphone', 'neg')
        str_collection = TargetCollection([str_first, str_second, str_third])

        wrong_value_msg = ('Predicted sentiment not set correctly for `id` 3 '
                           'should be {} and not {}')
        not_copied_msg = ('The original Target instances predicted sentiment '
                          'should not be set as the TargetCollection should '
                          'have copied them.')

        # Int predictions: the second prediction belongs to the Target `3`
        int_collection.add_pred_sentiment([2, 5, 4])
        self.assertEqual(5, int_collection['3']['predicted'],
                         msg=wrong_value_msg.format(5, int_collection))
        with self.assertRaises(KeyError, msg=not_copied_msg):
            int_second['predicted']

        # String predictions
        str_collection.add_pred_sentiment(['neg', 'neu', 'pos'])
        self.assertEqual('neu', str_collection['3']['predicted'],
                         msg=wrong_value_msg.format('`neu`', str_collection))

        # Int predictions converted to String labels through a mapper
        label_mapper = {1 : 'pos', 0 : 'neu', -1 : 'neg'}
        str_collection.add_pred_sentiment([1, 0, -1], mapper=label_mapper)
        mapper_msg = wrong_value_msg + ' using the mapper'
        self.assertEqual('neu', str_collection['3']['predicted'],
                         msg=mapper_msg.format('`neu`', str_collection))
        with self.assertRaises(KeyError, msg=not_copied_msg):
            str_second['predicted']

        # NOTE: a check that a tuple of predictions is rejected with a
        # TypeError used to live here but is disabled.
        size_msg = ('Should accept lists that are  the same size as the '
                    'TargetCollection')
        with self.assertRaises(ValueError, msg=size_msg):
            int_collection.add_pred_sentiment([1, 2, 3, 4])
Пример #10
0
    def test_targetcoll_data(self):
        '''
        Test the data function of TargetCollection

        Checks that the Targets are returned in insertion order, also after a
        Target has been deleted and another one has been added.
        '''

        target_col = TargetCollection()
        target_example_0 = Target([(3, 5), (6, 8)], '1', 'Iphone',
                                  'text with Iphone', 1)
        target_example_1 = Target([(1, 5)], '3', 'Iphone',
                                  'text with Iphone', 1)
        target_col.add(target_example_0)
        target_col.add(target_example_1)

        all_data = target_col.data()
        self.assertEqual(target_example_0, all_data[0], msg='First data '
                         'returned should be the first inserted {} and not '
                         '{}'.format(target_example_0, all_data[0]))
        self.assertEqual(target_example_1, all_data[1], msg='Second data '
                         'returned should be the second inserted {} and not '
                         '{}'.format(target_example_1, all_data[1]))

        target_example_2 = Target([(1, 2)], '2', 'Iphone',
                                  'text with Iphone', 1)
        del target_col['1']
        target_col.add(target_example_2)
        all_data = target_col.data()
        # The length is checked before indexing so that a list that is too
        # short fails with this message instead of an IndexError.
        self.assertEqual(2, len(all_data), msg='The length of the data returned '
                         'should be 2 and not {}'.format(len(all_data)))
        self.assertEqual(target_example_1, all_data[0], msg='First data '
                         'returned should be the second inserted {} and not '
                         '{} as the first has been removed'
                         .format(target_example_1, all_data[0]))
        self.assertEqual(target_example_2, all_data[1], msg='Second data '
                         'returned should be the third inserted {} and not '
                         '{} as the first has been removed'
                         .format(target_example_2, all_data[1]))
def generate_stats(data_path: Path) -> Dict[str, Union[int, float]]:
    '''
    Reads Targets from a file holding one JSON object per line and returns
    statistics about them.

    The returned mapping holds the number of Targets under ``size``. Every
    other value is a count divided by ``size``: the ``Distinct sentiment 1``
    and ``Distinct sentiment 2`` entries come from the length of
    ``subset_by_sentiment(1)`` and ``subset_by_sentiment(2)``, and there is
    one entry per sentiment value, keyed by the sentiment value itself.
    When the file holds no Targets all of the fractions are 0.0.

    :param data_path: File with one JSON encoded Target per line. Blank \
    lines are ignored.
    :returns: A defaultdict (missing keys read as 0) with the statistics.
    '''
    targets = []
    with data_path.open('r') as data_lines:
        for line in data_lines:
            # A blank (e.g. trailing) line is not a JSON document
            if not line.strip():
                continue
            target_fields = json.loads(line)
            # JSON has no tuples, the spans are decoded as lists of lists
            target_fields['spans'] = [tuple(span)
                                      for span in target_fields['spans']]
            targets.append(Target(**target_fields))
    target_data = TargetCollection(targets)

    data_size = len(target_data)
    target_stats = defaultdict(int)
    target_stats['size'] = data_size
    for i in range(1, 3):
        target_stats[f'Distinct sentiment {i}'] = len(
            target_data.subset_by_sentiment(i))
    for data in target_data.data_dict():
        target_stats[data['sentiment']] += 1

    # Turn every count into a fraction of the dataset. The keys are copied
    # first so the mapping is not iterated while its values are replaced.
    for key in list(target_stats):
        if key == 'size':
            continue
        # An empty dataset would otherwise divide by zero
        target_stats[key] = target_stats[key] / data_size if data_size else 0.0
    return target_stats
Пример #12
0
    def test_targetcoll_set(self):
        '''
        Checks item assignment (`__setitem__`) on a TargetCollection: new
        keys, keys that already exist, keys that disagree with the id of the
        Target, and Targets that have no id.
        '''

        first = Target([(3, 5), (6, 8)], '1', 'Iphone', 'text with Iphone', 1)
        second = Target([(1, 5)], '3', 'Iphone', 'text with Iphone', 1)
        third = Target([(1, 2)], '2', 'Iphone', 'text with Iphone', 1)
        collection = TargetCollection([first, second, third])
        new_target = Target([(2, 4)], '5', 'new', 'new text', 0)
        mismatched_target = Target([(1, 3)], '6', 'another', 'another text', 1)
        id_less_target = Target([(1, 3)], '7', 'another', 'another text', 1)
        clashing_target = Target([(4, 5)], '3', 'test', 'test text', 0)

        # A key that is not in the collection yet can be assigned
        collection['5'] = new_target
        self.assertEqual(collection['5'], new_target,
                         msg='Cannot add new value. store {} value added {}'
                         .format(collection, new_target))

        # A key that is already taken cannot be assigned again
        taken_msg = ('Should not be able to add value {} as its key {} '
                     'already exists {}'
                     .format(clashing_target, '3', collection))
        with self.assertRaises(KeyError, msg=taken_msg):
            collection['3'] = clashing_target

        # The key has to agree with the id stored in the Target
        mismatch_msg = ('Value with a different `id` to the key should fail. '
                        'Key {} Value {}'.format('7', mismatched_target))
        with self.assertRaises(KeyError, msg=mismatch_msg):
            collection['7'] = mismatched_target

        # A Target without an id can be stored under any free key
        del id_less_target['target_id']
        if 'target_id' in id_less_target:
            raise KeyError('{} should not contain `id` key'
                           .format(id_less_target))
        collection['8'] = id_less_target
Пример #13
0
    def test_target_normlisation(self):
        '''
        Checks `target_normalisation` against a list of Targets, each paired
        with the (text, target) tuple that is expected back. Anything that is
        not a dict has to be rejected with a TypeError.
        '''

        type_msg = 'target_dict parameter has to be of type dict only'
        with self.assertRaises(TypeError, msg=type_msg):
            target_normalisation(['anything'])

        # Text shared by the second and the third case
        day_text = ('I had a great ssDay however I did not get much '
                    'work done in the days')
        # Each case: (keyword arguments of the Target, expected result)
        cases = [
            ({'target_id': '0',
              'sentiment': -1,
              'text': 'This is a fake news articledd that is to represent a '
                      'Tweet!!!! and it was an awful News Articless I think.',
              'target': 'news article',
              'spans': [(15, 27), (83, 95)]},
             ('This is a fake news_article dd that is to represent '
              'a Tweet!!!! and it was an awful news_article ss I '
              'think.', 'news_article')),
            ({'target_id': '1',
              'sentiment': 1,
              'text': day_text,
              'target': 'day',
              'spans': [(16, 19), (64, 67)]},
             ('I had a great ss day however I did not get much work'
              ' done in the day s', 'day')),
            ({'target_id': '2',
              'sentiment': 1,
              'text': day_text,
              'target': 'day',
              'spans': [(16, 19)]},
             ('I had a great ss $day$ however I did not get much '
              'work done in the days', '$day$')),
            ({'target_id': '3',
              'sentiment': 1,
              'text': 'Day however I did not get much done',
              'target': 'day',
              'spans': [(0, 3)]},
             ('day however I did not get much done', 'day')),
            ({'target_id': '4',
              'sentiment': 1,
              'text': 'however I did not get much done in the day',
              'target': 'day',
              'spans': [(39, 42)]},
             ('however I did not get much done in the day', 'day')),
            ({'spans': [(47, 80)],
              'target_id': '2',
              'target': 'Core Processing Unit temperatures',
              'text': 'Temperatures were ok but I was not tracking'
                      ' in Core Processing Unit temperatures.',
              'sentiment': 0},
             ('Temperatures were ok but I was not tracking in '
              'Core_ProcessingUnittemperatures .',
              'Core_ProcessingUnittemperatures')),
            ({'spans': [(1, 14), (15, 29)],
              'target_id': '8',
              'target': 'britney spears',
              'text': "#britneyspears Britney Spears 's new single "
                      "-3' debuts at #1: video: congratulations are "
                      "in order ..",
              'sentiment': 0},
             ("# britney_spears britney_spears 's new single "
              "-3' debuts at #1: video: congratulations are in order ..",
              'britney_spears')),
        ]

        # All Targets are built before the first comparison is made
        targets = [Target(**target_fields) for target_fields, _ in cases]
        failure_template = 'Results is {} and should be {}. Test value {}'
        for target, (_, expected) in zip(targets, cases):
            result = target_normalisation(target)
            self.assertEqual(expected, result,
                             msg=failure_template.format(result, expected,
                                                         target))
            similarity_field = np.array(target_data[similarity_field_name])
            if is_language_model and threshold:
                similarity_field = target_data['original_perplexity'] - similarity_field
                above_original_perplexity_index = np.argmax((similarity_field <= 0) + 0) 
                similarity_field = similarity_field[:above_original_perplexity_index]
                if len(similarity_field) < k_similar:
                    temp_k = len(similarity_field)
            elif (not is_language_model) and threshold:
                above_threshold_index = np.argmin((similarity_field >= threshold) + 0)
                similarity_field = similarity_field[:above_threshold_index]
                if len(similarity_field) < k_similar:
                    temp_k = len(similarity_field)
            # For each of the filtered alternative targets it creates a json 
            # like object that will be used to store it in a collection to then 
            # save to a json file
            alternative_targets = target_data['alternative_targets'][:temp_k]
            for index, alternative_target in enumerate(alternative_targets):
                new_target_id = f'{target_id}_{index}'
                new_target_data = exchange_targets(target_data, alternative_target, 
                                                   new_target_id)
                # sanitizing the target dataset.
                new_target_data.pop('alternative_targets')
                new_target_data.pop(similarity_field_name)
                if is_language_model:
                    new_target_data.pop('original_perplexity')
                new_target_dataset.append(Target(**new_target_data))
    print(f'Size of the expanded dataset {len(new_target_dataset)}')
    new_target_dataset = TargetCollection(new_target_dataset)
    new_target_dataset.to_json_file(str(args.dataset_save_fp),cache=False)           
            
Пример #15
0
def _semeval_extract_data(sentences, file_name, conflict=False,
                          sentence_ids_skip=None):
    '''
    Builds a TargetCollection out of the sentence elements found within a
    SemEval `sentences` element.

    A sentence contributes Targets only when it has an `aspectTerms` or an
    `Opinions` child. Targets named `NULL` are dropped, and so are targets
    with the `conflict` polarity unless ``conflict`` is True.

    :param sentences: A `sentences` named element
    :param file_name: Name of the file being parsed; it prefixes every \
    sentence id.
    :param conflict: Whether to keep the targets that have the conflict \
    sentiment label.
    :param sentence_ids_skip: IDs of sentences that should be skipped
    :type sentences: xml.etree.ElementTree.Element
    :type file_name: String
    :type conflict: bool. Default False
    :type sentence_ids_skip: list. Default None
    :returns: A TargetCollection containing Target instances.
    :rtype: TargetCollection
    :raises ValueError: If a sentence has targets but no `text` child.
    :raises KeyError: If a target element has neither a `term` nor a \
    `target` attribute.
    '''

    # Integer value given to each polarity label
    polarity_to_sentiment = {'conflict' : -2, 'negative' : -1,
                             'neutral' : 0, 'positive' : 1}

    def extract_aspect_terms(aspect_terms, sentence_id):
        '''
        :param aspect_terms: An aspectTerms (or Opinions) element
        :param sentence_id: Id of the sentence that the aspects came from.
        :type aspect_terms: xml.etree.ElementTree.Element
        :type sentence_id: String
        :returns: One dict per kept target, holding the target id, target, \
        sentiment, spans and sentence id.
        :rtype: list
        '''

        extracted = []
        for position, element in enumerate(aspect_terms):
            attributes = element.attrib
            sentiment = polarity_to_sentiment[attributes['polarity']]
            if sentiment == -2 and not conflict:
                continue
            # The attribute that names the target is either `term` or `target`
            if 'term' in attributes:
                target = attributes['term']
            elif 'target' in attributes:
                target = attributes['target']
            else:
                raise KeyError('There is no `target` attribute in the opinions '
                               'element {}'.format(attributes))
            span = (int(attributes['from']), int(attributes['to']))
            # If the target is NULL then there is no target
            if target == 'NULL':
                continue
            # The id uses the position among all of the elements, including
            # the ones that were dropped
            extracted.append({'target_id' : '{}{}'.format(sentence_id, position),
                              'target' : target,
                              'sentiment' : sentiment,
                              'spans' : [span],
                              'sentence_id' : sentence_id})
        return extracted

    collected_targets = TargetCollection()
    for sentence in sentences:
        raw_sentence_id = sentence.attrib['id']
        sentence_id = file_name + raw_sentence_id
        # Allow the parser to skip certain sentences
        if sentence_ids_skip is not None and raw_sentence_id in sentence_ids_skip:
            continue

        aspects = None
        text_element = None
        for child in sentence:
            if child.tag == 'sentence':
                raise Exception(sentence.attrib['id'])
            if child.tag == 'text':
                text_element = child
            elif child.tag in ('aspectTerms', 'Opinions'):
                aspects = extract_aspect_terms(child, sentence_id)

        # Sentences without any target annotation are ignored
        if aspects is None:
            continue
        if text_element is None:
            raise ValueError('A semeval sentence should always have text '
                             'semeval file {} sentence id {}'
                             .format(file_name, sentence.attrib['id']))

        sentence_text = text_element.text
        # Every target of the sentence shares the text of the sentence
        for aspect in aspects:
            aspect['text'] = sentence_text
        for aspect in aspects:
            collected_targets.add(Target(**aspect))
    return collected_targets
Пример #16
0
    def test_target_constructor(self):
        '''
        Checks the argument validation of the Target constructor and that a
        valid Target exposes its fields through the expected keys.
        '''

        length_msg = 'Spans should contain tuples of length 2'
        int_msg = 'Spans should contain tuples of length 2 and are Ints'
        order_msg = ('Spans should contain tuples of Ints where the first '
                     'Int < second Int')
        # (expected error, failure message, invalid spans argument)
        invalid_spans = [
            (TypeError, 'Spans should be of type list', 'span'),
            (TypeError, 'Spans should be list of tuples', [1, 2]),
            (ValueError, length_msg, [(1, 2, 3), (3, 4, 5)]),
            (ValueError, length_msg, [(1, 2), (3, 4, 5)]),
            (TypeError, int_msg, [('1', '2')]),
            (TypeError, int_msg, [(1, '2')]),
            (TypeError, int_msg, [('1', 2)]),
            (ValueError, order_msg, [(7, 5)]),
            (ValueError, order_msg, [(3, 5), (6, 6)]),
        ]
        for error, failure_msg, spans in invalid_spans:
            with self.assertRaises(error, msg=failure_msg):
                Target(spans, '1', 'Iphone', 'text with Iphone', 'Pos')
        # Valid spans are accepted
        Target([(3, 5), (6, 8)], '1', 'Iphone', 'text with Iphone', 'Pos')

        # Every other positional argument given with the wrong type
        # (failure message, arguments that follow the spans)
        invalid_arguments = [
            ('Target ID should be a String',
             (1, 'Iphone', 'text with Iphone', 'Pos')),
            ('Target should be a String',
             ('1', ('Iphone',), 'text with Iphone', 'Pos')),
            ('Text should be a String',
             ('1', 'Iphone', ('text with Iphone',), 'Pos')),
            ('Sentiment should be a String or Int',
             ('1', 'Iphone', 'text with Iphone', ('Pos',))),
        ]
        for failure_msg, arguments in invalid_arguments:
            with self.assertRaises(TypeError, msg=failure_msg):
                Target([(3, 5), (6, 8)], *arguments)
        # The sentence_id has to be a String as well
        with self.assertRaises(TypeError, msg='sentence_id should be a String'):
            Target([(3, 5), (6, 8)], '1', 'Iphone', 'text with Iphone', 'pos',
                   sentence_id=1)

        # Normal case, this time with the sentiment given as an Int
        span = [(3, 5)]
        target = 'Iphone'
        sentiment = 1
        text = 'text with Iphone'
        target_id = '210#1'
        predicted = 0
        sentence_id = '210'
        target_example = Target(span, target_id, target, text, sentiment,
                                predicted, sentence_id)

        # Each argument has to be readable back through its key
        expected_fields = [
            ('target_id', target_id, 'The target ID should {} and not {}'),
            ('text', text, 'The text should be {} and not {}'),
            ('sentiment', sentiment, 'The sentiment should be {} and not {}'),
            ('target', target, 'The target should be {} and not {}'),
            ('spans', span, 'The spans should be {} and not {}'),
            ('predicted', predicted,
             'The predicted sentiment should be {} and not {}'),
            ('sentence_id', sentence_id,
             'The sentence_id should be {} and not {}'),
        ]
        for key, expected, failure_template in expected_fields:
            stored = target_example[key]
            self.assertEqual(expected, stored,
                             msg=failure_template.format(expected, stored))
Пример #17
0
def hu_liu(file_path):
    '''
    Parser for the datasets from the following two papers (DOES NOT WORK):

    1. `A Holistic Lexicon-Based Approach to Opinion Mining \
    <https://www.cs.uic.edu/~liub/FBS/opinion-mining-final-WSDM.pdf>`_
    2. `Mining and Summarizing Customer Reviews \
    <https://www.cs.uic.edu/~liub/publications/kdd04-revSummary.pdf>`_

    Currently this does not work. This is due to the dataset not containing
    enough data to determine where the targets are in the text.

    Only lines that contain `##` are parsed. The part before the `##` is read
    as a comma separated list of targets, each one ending with a sentiment
    marker such as `[+2]` or `[-1]`, and the part after it as the sentence
    text. Targets marked with `[u]` or `[p]` are reported as implicit and
    skipped, as are targets that do not occur exactly once in the sentence
    text. The file is read using the cp1252 encoding.

    :param file_path: The path to a file containing annotations in the format \
    of hu and liu sentiment datasets.
    :type file_path: String
    :returns: A TargetCollection containing Target instances.
    :rtype: TargetCollection
    :raises ValueError: If a line contains `##` more than once or if a \
    target that is not implicit does not end with a sentiment marker.
    '''
    file_path = os.path.abspath(file_path)
    file_name = os.path.basename(file_path)
    sentiment_data = TargetCollection()

    # Compiled once as they are applied to every target of every line.
    sentiment_pattern = re.compile(r'\[[+-]\d\]$')
    implicit_pattern = re.compile(r'\[[up]\]')

    with open(file_path, 'r', encoding='cp1252') as annotations:
        for sentence_index, annotation in enumerate(annotations):
            # If it does not contain ## then not a sentence
            if '##' not in annotation:
                continue
            targets_text = annotation.split('##')
            # As `##` is in the line the split always yields at least two
            # parts, anything other than exactly two is therefore malformed.
            if len(targets_text) != 2:
                raise ValueError('The annotation {} when split on `##` should '\
                                 'contain at least the sentence text and at'\
                                 ' most the text and the targets and not {}'\
                                 .format(annotation, targets_text))
            targets, text = targets_text
            targets = targets.strip()
            # A sentence without any annotated targets has nothing to add
            if not targets:
                continue
            text = text.strip()
            sentence_id = file_name + '#{}'.format(sentence_index)

            for target_index, target in enumerate(targets.split(',')):
                target = target.strip()
                # Implicit targets are skipped before the sentiment marker
                # is required.
                if implicit_pattern.search(target):
                    print('Target {} is implicit {}'.format(target, text))
                    continue
                sentiment_match = sentiment_pattern.search(target)
                if not sentiment_match:
                    raise ValueError('Target {} does not have a corresponding'\
                                     ' sentiment value. annotation {}'\
                                     .format(target, annotation))
                target_text = target[:sentiment_match.start()].strip()
                sentiment_value = int(sentiment_match.group().strip('[]'))

                # The target is literal text and not a pattern, therefore it
                # has to be escaped before it is searched for.
                target_matches = list(re.finditer(re.escape(target_text), text))
                if len(target_matches) != 1:
                    # The span of the target cannot be determined, skip it.
                    print('The Target {} can only occur once in the '\
                          'text {}'.format(target_text, text))
                    continue
                target_id = sentence_id + '#{}'.format(target_index)

                sentiment_data.add(Target(spans=[target_matches[0].span()],
                                          target=target_text,
                                          sentiment=sentiment_value,
                                          text=text,
                                          sentence_id=sentence_id,
                                          target_id=target_id))
    return sentiment_data
Пример #18
0
    def parse_tweet(tweet_data, anno_data, tweet_id):
        '''
        Converts one annotated tweet into a list of Target instances.

        The names `folder_name`, `include_dnr`, `include_additional`,
        `sentiment_mapper` and `Target` are taken from the enclosing scope.

        :param tweet_data: Dictionary that is read for the `content` (tweet \
        text) and `entities` keys. Each entity is read for the `entity`, \
        `id` and `offset` keys.
        :param anno_data: Dictionary that is read for the `items` key \
        (entity id as a String to sentiment label) and, when additional \
        targets are included, the `additional_items` key.
        :param tweet_id: ID of the tweet, it is converted to a String.
        :returns: The Target instances found within the tweet.
        :rtype: list
        :raises ValueError: If the offset of an entity does not match the \
        tweet text or if an additional target cannot be uniquely found.
        '''

        def get_offsets(entity, tweet_text, target):
            '''
            Returns the span of the target as a list containing one tuple.
            The recorded offset is tried first, then one character before it
            and then one character after it. The comparison ignores case.
            '''
            from_offset = entity['offset']
            for offset_shift in (0, -1, 1):
                from_offset_shift = from_offset + offset_shift
                to_offset = from_offset_shift + len(target)
                offset_text = tweet_text[from_offset_shift : to_offset].lower()
                if offset_text == target.lower():
                    return [(from_offset_shift, to_offset)]
            raise ValueError('Offset {} does not match target text {}. Full '\
                             'text {}\nid {}'\
                             .format(from_offset, target, tweet_text, tweet_id))

        def fuzzy_target_match(tweet_text, target):
            '''
            Returns the matches of the first search pattern that occurs
            exactly once in the lower cased tweet text. Returns None for the
            tweets that are hard coded to be skipped and raises a ValueError
            in all other cases.
            '''
            low_target = target.lower()
            low_tweet_text = tweet_text.lower()
            # NOTE(review): the target is used as a regular expression
            # without being escaped. Confirm whether targets can contain
            # characters that are special within a regular expression.
            target_searches = [low_target, r'[^\w]' + low_target,
                               r'[^\w]' + low_target + r'[^\w]',
                               low_target + r'[^\w]',
                               low_target.replace(' ', ''),
                               low_target.replace(" '", '')]
            for target_search in target_searches:
                target_matches = list(re.finditer(target_search,
                                                  low_tweet_text))
                if len(target_matches) == 1:
                    return target_matches
            # Tweets and targets that are known not to be matchable
            if tweet_id in {'81211671026352128', '78689580104290305',
                            '81209490499960832'}:
                return None
            if tweet_id == '75270720671973376' and target == 'kippers':
                return None
            if tweet_id == '65855178264686592' and target == 'tax':
                return None
            print(tweet_data)
            print(anno_data)
            raise ValueError('Cannot find the exact additional '\
                             'entity {} within the tweet {}'\
                             .format(target, tweet_text))

        target_instances = []
        tweet_id = str(tweet_id)
        tweet_text = tweet_data['content']
        target_ids = []
        # Parse all of the entities that have been detected automatically
        for entity in tweet_data['entities']:
            data_dict = {}
            target = entity['entity']
            # The ID is recorded even if the entity is skipped below so that
            # the IDs of the additional targets never clash with it.
            target_ids.append(entity['id'])
            entity_id = str(entity['id'])
            data_dict['spans'] = get_offsets(entity, tweet_text, target)
            data_dict['target'] = entity['entity']
            data_dict['target_id'] = folder_name + tweet_id + '#' + entity_id
            data_dict['sentence_id'] = folder_name + tweet_id
            data_dict['sentiment'] = anno_data['items'][entity_id]
            if data_dict['sentiment'] == 'doesnotapply' and not include_dnr:
                continue
            # Convert from Strings to Integer
            data_dict['sentiment'] = sentiment_mapper[data_dict['sentiment']]
            data_dict['text'] = tweet_text
            target_instances.append(Target(**data_dict))
        # Parse all of the entities that have been selected by the user
        if include_additional:
            additional_data = anno_data['additional_items']
            if isinstance(additional_data, dict):
                for target, sentiment in additional_data.items():
                    target_matches = fuzzy_target_match(tweet_text, target)
                    if target_matches is None:
                        continue
                    # The default allows tweets that do not have any
                    # automatically detected entities, the first additional
                    # target of such a tweet is given the ID 0.
                    target_id = max(target_ids, default=-1) + 1
                    target_ids.append(target_id)
                    # A new dictionary per target so that this does not
                    # depend on the entity loop above having run.
                    # NOTE(review): unlike the entities above the sentiment
                    # is not passed through sentiment_mapper and the IDs are
                    # not prefixed with folder_name. Confirm this is intended.
                    data_dict = {}
                    data_dict['spans'] = [target_matches[0].span()]
                    data_dict['target'] = target
                    data_dict['sentiment'] = sentiment
                    data_dict['text'] = tweet_text
                    data_dict['sentence_id'] = tweet_id
                    data_dict['target_id'] = tweet_id + '#' + str(target_id)
                    target_instances.append(Target(**data_dict))

        return target_instances
Пример #19
0
    def test_dependency_context(self):
        '''
        Checks the text and the span of the contexts returned by
        dependency_context for targets with more than one span, for lower
        cased input and for targets that are mentioned more often in the text
        than they are annotated.
        '''

        news_text = ('This is a fake news articledd that is to represent a '
                     'Tweet!!!! and it was an awful News Articless I think.')
        days_text = ('I had a great Day however I did not get much '
                     'work done in the days')
        text_template = 'texts are different correct `{}` test `{}`'
        span_template = 'spans are different correct `{}` test `{}`'

        def compare_contexts(expected_contexts, produced_contexts,
                             report_text=False):
            # Each expected entry is a list of (text, span) pairs. Only the
            # positions that are listed as expected are looked at.
            for position, expected_group in enumerate(expected_contexts):
                produced_group = produced_contexts[position]
                for inner, expected in enumerate(expected_group):
                    wanted_text, wanted_span = expected
                    produced = produced_group[inner]
                    text_msg = text_template.format(wanted_text,
                                                    produced['text'])
                    self.assertEqual(wanted_text, produced['text'],
                                     msg=text_msg)
                    if report_text:
                        span_msg = (span_template + ' text `{}`')\
                                   .format(wanted_span, produced['span'],
                                           produced['text'])
                    else:
                        span_msg = span_template.format(wanted_span,
                                                        produced['span'])
                    self.assertEqual(wanted_span, produced['span'],
                                     msg=span_msg)

        # Targets that have more than one span
        multi_span_targets = [
            Target(target_id='0', sentiment=-1, text=news_text,
                   target='news article', spans=[(15, 27), (83, 95)]),
            Target(target_id='1', sentiment=1, text=days_text,
                   target='day', spans=[(14, 17), (62, 65)]),
            Target(spans=[(1, 14), (15, 29)], target_id='8',
                   target='britney spears',
                   text=("#britneyspears Britney Spears 's new single "
                         "-3' debuts at #1: video: congratulations are "
                         "in order .."),
                   sentiment=0),
            Target(spans=[(5, 14)], target_id='9', target='wait time',
                   text="less wait time for me!", sentiment=0),
            Target(spans=[(81, 91)], target_id='9', target='@RichardWS',
                   text=("Lower taxes rendered null & void with the "
                         "planned increase in VAT by the tories. "
                         "@RichardWS #battlefornumber10 @RichardWS"),
                   sentiment=0),
            Target(spans=[(46, 57)], target_id='9', target='@Shaun_XL5:',
                   text=("< My main #GE2015 policy worry is now defence "
                         "@Shaun_XL5: I think we need Nato target of 2% "
                         "GDP with hardcore auditing  @paullewismoney"),
                   sentiment=0),
            Target(spans=[(102, 119)], target_id='9',
                   target='first-time buyers',
                   text=("Despite spin, no new government money for "
                         "#housing: transferring resources from affordable"
                         " renting to first-time buyers. #GE2015 #GE15"),
                   sentiment=0),
            Target(spans=[(130, 140)], target_id='9', target='@UNICEF_uk!',
                   text=("Violence against children around the world is "
                         "like a hidden epidemic. Michael Sheen talking "
                         "well abt SDGs on #marrshow Nice work @UNICEF_uk!"),
                   sentiment=0),
            Target(spans=[(21, 32)], target_id='9', target='Tony #Blair',
                   text=("Corrupt war criminal Tony #Blair's rats are "
                         "running amok inside Labour today. If I was "
                         "thinking of voting Labour today. I'd stop. #GE15"),
                   sentiment=0),
            Target(spans=[(39, 49)], target_id='9', target='@UKLabour;',
                   text=("Day 1 of #GE2015 & I'm already sick of "
                         "@UKLabour; their lies & their forgetfulness. "
                         "Whose policies created the need for austerity?"
                         " Yes. Lab!"),
                   sentiment=0),
            Target(spans=[(33, 46)], target_id='9', target='@GrantShapps’',
                   text=("Lest Cameron forgets: Police say @GrantShapps’ "
                         "firm sales “may constitute offence of fraud” "
                         "#bbcdp #pmqs : http://t.co/Gu9Ke6sRtX"),
                   sentiment=0)]
        day_context = ('I had a great day however I did not get '
                       'much work done in the day')
        multi_span_expected = [
            [('This is a fake news article', (15, 27)),
             ('dd that is to represent a Tweet and it was '
              'an awful news article', (52, 64))],
            [(day_context, (14, 17)), (day_context, (62, 65))],
            [("britney spears britney spears", (0, 14)),
             ("britney spears britney spears", (15, 29))],
            [('less wait time for me', (5, 14))],
            [('@RichardWS', (0, 10))],
            [("My main #GE2015 policy worry is now defence"
              " @Shaun_XL5:", (44, 55))],
            [("transferring resources from affordable "
              "renting to first-time buyers", (50, 67))],
            [("@UNICEF_uk!", (0, 11))],
            [("criminal Tony #Blair", (9, 20))],
            [("Day 1 of #GE2015 & I'm already sick of "
              "@UKLabour; their lies & their forgetfulness", (39, 49))],
            [("Lest Cameron forgets Police say "
              "@GrantShapps’ firm sales", (32, 45))]]
        compare_contexts(multi_span_expected,
                         dependency_context(multi_span_targets, tweebo),
                         report_text=True)

        # Lower casing of the text, including when the target is upper case
        lower_targets = [
            Target(target_id='0', sentiment=-1, text=news_text,
                   target='news article', spans=[(15, 27), (83, 95)]),
            Target(target_id='0', sentiment=-1, text=news_text,
                   target='News Article', spans=[(15, 27), (83, 95)]),
            Target(target_id='1', sentiment=1, text=days_text,
                   target='day', spans=[(14, 17), (62, 65)])]
        lower_news_contexts = [('this is a fake news article', (15, 27)),
                               ('dd that is to represent a tweet and it was '
                                'an awful news article', (52, 64))]
        lower_day_context = ('i had a great day however i did not get '
                             'much work done in the day')
        lower_expected = [list(lower_news_contexts),
                          list(lower_news_contexts),
                          [(lower_day_context, (14, 17)),
                           (lower_day_context, (62, 65))]]
        compare_contexts(lower_expected,
                         dependency_context(lower_targets, tweebo, lower=True))

        # The target is mentioned twice but only one of the mentions has
        # been annotated
        single_mention_targets = [
            Target(target_id='1', sentiment=1,
                   text=('I had a great Day however I did not get much '
                         'work done in the day'),
                   target='day', spans=[(14, 17)]),
            Target(target_id='3', sentiment=1,
                   text=('I had a great Day however I did not get much '
                         'work done in the Day'),
                   target='day', spans=[(14, 17)]),
            Target(spans=[(47, 80)], target_id='2',
                   target='Core Processing Unit temperatures',
                   text=('Temperatures were ok but I was not tracking'
                         ' in Core Processing Unit temperatures.'),
                   sentiment=0),
            Target(text=('I also recommend the rice dishes or the '
                         'different varieties of congee (rice porridge).'),
                   spans=[(63, 85)], target_id='4',
                   target='congee (rice porridge)', sentiment=1)]
        single_mention_expected = [
            [('I had a great day however I did not get '
              'much work done in the day', (14, 17))],
            [('I had a great day however I did not get '
              'much work done in the Day', (14, 17))],
            [('Temperatures were ok but I was not tracking'
              ' in Core Processing Unit temperatures', (47, 80))],
            [('I also recommend the rice dishes or the '
              'different varieties of congee (rice '
              'porridge)', (63, 85))]]
        compare_contexts(single_mention_expected,
                         dependency_context(single_mention_targets, tweebo))
Пример #20
0
    def test_targetcoll_sent_data(self):
        '''
        Checks the sentiment_data function of TargetCollection: the values
        returned with and without a mapper, the errors raised for mappers
        that are not valid and the sentiment_field parameter.
        '''

        spans_and_ids = [([(3, 5), (6, 8)], '1'), ([(1, 5)], '3'),
                         ([(1, 2)], '2')]

        def collection_of(sentiments, predictions=None):
            # The predicted value is only passed on when it has been given
            members = []
            for position, (spans, target_id) in enumerate(spans_and_ids):
                arguments = [list(spans), target_id, 'Iphone',
                             'text with Iphone', sentiments[position]]
                if predictions is not None:
                    arguments.append(predictions[position])
                members.append(Target(*arguments))
            return TargetCollection(members)

        def check_sentiments(expected, produced, template):
            self.assertEqual(expected, produced,
                             msg=template.format(expected, produced))

        target_col_int = collection_of([1, 1, -1])
        target_col_str = collection_of(['pos', 'pos', 'neg'])

        # Sentiments are returned as they were stored
        check_sentiments([1, 1, -1], target_col_int.sentiment_data(),
                         'The Integer sentiments returned should be {} '
                         'and not {}')
        check_sentiments(['pos', 'pos', 'neg'],
                         target_col_str.sentiment_data(),
                         'The String sentiments returned should be {} '
                         'and not {}')

        # Sentiments are converted when a mapper is given
        string_to_int = {'pos' : 1, 'neg' : -1}
        check_sentiments([1, 1, -1],
                         target_col_str.sentiment_data(mapper=string_to_int),
                         'The String sentiments should be mapped to '
                         'Integers. Valid {} not {}')
        int_to_string = {1 : 'pos', -1 : 'neg'}
        check_sentiments(['pos', 'pos', 'neg'],
                         target_col_int.sentiment_data(mapper=int_to_string),
                         'The Integer sentiments should be mapped to '
                         'String. Valid {} not {}')

        # Mappers that are not valid
        wrong_type_msg = 'Should only accept dict mapper'
        with self.assertRaises(TypeError, msg=wrong_type_msg):
            target_col_int.sentiment_data(mapper=[(1, 'pos'), (-1, 'neg')])
        missing_msg = ('Mapper should refuse dicts that may have valid '
                       'mappings but not all the mappings')
        with self.assertRaises(ValueError, msg=missing_msg):
            target_col_int.sentiment_data(mapper={1 : 'pos'})
        wrong_keys_msg = ('Mapper should refuse dicts that contain the '
                          'correct number of mappings but not the correct '
                          'mappings')
        with self.assertRaises(ValueError, msg=wrong_keys_msg):
            target_col_int.sentiment_data({0 : 'pos', -1 : 'neg'})
        extra_keys_msg = ('Mapper should refuse dicts that have all the '
                          'correct mappings but contain some in-correct '
                          'mappings')
        with self.assertRaises(ValueError, msg=extra_keys_msg):
            target_col_int.sentiment_data(mapper={1 : 'pos', -1 : 'neg',
                                                  0 : 'neu'})

        # The field that the sentiments are read from can be chosen
        target_col = collection_of(['pos', 'pos', 'neg'],
                                   ['neg', 'neu', 'pos'])
        predicted = target_col.sentiment_data(sentiment_field='predicted')
        check_sentiments(['neg', 'neu', 'pos'], predicted,
                         'The predicted sentiments returned should be {} '
                         'and not {}')
Пример #21
0
    def test_dependency_relation_context(self):
        '''
        Checks the contexts returned by dependency_relation_context for
        different n_relations values, with lower casing and for a target
        that is mentioned more than once in its text.
        '''

        def compare_contexts(expected, produced, template):
            # Only the positions that are listed as expected are looked at
            for position, wanted in enumerate(expected):
                found = produced[position]
                self.assertEqual(wanted, found,
                                 msg=template.format(wanted, found))

        context_msg = 'Incorrect context correct {} test {}'
        all_relations = [['a fake', 'an awful'], ['a great', 'the'],
                         ['an alternative to see performs it how'], ['']]

        targets = [
            Target(target_id='0', sentiment=-1,
                   text=('This is a fake news articledd that is to '
                         'represent a Tweet!!!! and it was an awful News '
                         'Articless I think.'),
                   target='news article', spans=[(15, 27), (83, 95)]),
            Target(target_id='1', sentiment=1,
                   text=('I had a great Day however I did not get much '
                         'work done in the days'),
                   target='day', spans=[(14, 17), (62, 65)]),
            Target(target_id='2', sentiment=1,
                   text=('this is an alternative sentence to see how it '
                         'performs'),
                   target='sentence', spans=[(23, 31)]),
            Target(target_id='3', sentiment=1,
                   text=('Teen pop star heartthrobe is all the rage on '
                         'social media'),
                   target='teen', spans=[(0, 4)])]

        # The normalise case
        produced = dependency_relation_context(targets, tweebo,
                                               n_relations=(1, -1))
        compare_contexts(all_relations, produced, context_msg)

        # Only the first dependency relation is wanted
        first_relation = [['a fake', 'an awful'], ['a great', 'the'],
                          ['an alternative to'], ['']]
        produced = dependency_relation_context(targets, tweebo)
        compare_contexts(first_relation, produced, context_msg)

        # The words are lower cased before processing
        produced = dependency_relation_context(targets, tweebo, True, (1, -1))
        compare_contexts(all_relations, produced, context_msg)

        # The sentence mentions the target more than once but only the
        # first mention is of interest
        single_mention = [
            Target(target_id='1', sentiment=1,
                   text=('I had a great Day however I did not get much '
                         'work done in the day'),
                   target='day', spans=[(14, 17)])]
        produced = dependency_relation_context(single_mention, tweebo,
                                               n_relations=(1, -1))
        compare_contexts([['a great']], produced,
                         'Incorrect context for more than one mention '
                         'correct {} test {}')
Пример #22
0
    def test_target_coll_subset_by_sent(self):
        '''
        Test the subset_by_sentiment function of TargetCollection

        Eleven targets are spread over the sentence ids `4` to `8` so that
        the sentences contain one, two or three different sentiment labels.
        The collection returned for each requested number is compared with
        a collection built from the expected targets.
        '''

        # Sentence 4: `pos` and `neg`
        target_example_0 = Target([(3, 5), (6, 8)], '1', 'Iphone',
                                  'text with Iphone', 'pos', sentence_id='4')
        target_example_1 = Target([(1, 5)], '3', 'Iphone',
                                  'text with Iphone', 'neg', sentence_id='4')
        # Sentence 5: `neg` only
        target_example_2 = Target([(1, 2)], '2', 'Iphone',
                                  'text with Iphone', 'neg', sentence_id='5')
        target_example_3 = Target([(1, 2)], '4', 'Iphone',
                                  'text with Iphone', 'neg', sentence_id='5')
        # Sentence 6: `pos`, `neu` and `neg`
        target_example_4 = Target([(1, 2)], '5', 'Iphone',
                                  'text with Iphone', 'pos', sentence_id='6')
        target_example_5 = Target([(1, 2)], '6', 'Iphone',
                                  'text with Iphone', 'neu', sentence_id='6')
        target_example_6 = Target([(1, 2)], '7', 'Iphone',
                                  'text with Iphone', 'neg', sentence_id='6')
        # Sentence 7: `neg` only
        target_example_7 = Target([(1, 2)], '8', 'Iphone',
                                  'text with Iphone', 'neg', sentence_id='7')
        # Sentence 8: `neg` twice and `pos` once
        target_example_8 = Target([(1, 2)], '9', 'Iphone',
                                  'text with Iphone', 'neg', sentence_id='8')
        target_example_9 = Target([(1, 2)], '10', 'Iphone',
                                  'text with Iphone', 'neg', sentence_id='8')
        target_example_10 = Target([(1, 2)], '11', 'Iphone',
                                  'text with Iphone', 'pos', sentence_id='8')
        all_targets = [target_example_0, target_example_1, target_example_2,
                       target_example_3, target_example_4, target_example_5,
                       target_example_6, target_example_7, target_example_8,
                       target_example_9, target_example_10]
        target_col = TargetCollection(all_targets)

        # Test for 2 unique sentiments per sentence
        test_col = target_col.subset_by_sentiment(2)
        valid_col = TargetCollection([target_example_0, target_example_1,
                                      target_example_8, target_example_9,
                                      target_example_10])
        self.assertEqual(valid_col, test_col, msg='Should only return these {}'\
                         ' but has returned this {}'.format(valid_col, test_col))
        # Test for 1 unique sentiments per sentence
        test_col = target_col.subset_by_sentiment(1)
        # The expected targets are not listed in the order they were added
        valid_col = TargetCollection([target_example_7, target_example_2,
                                      target_example_3])
        self.assertEqual(valid_col, test_col, msg='Should only return these {}'\
                         ' but has returned this {}'.format(valid_col, test_col))
        # Test for 3 unique sentiments per sentence
        test_col = target_col.subset_by_sentiment(3)
        valid_col = TargetCollection([target_example_4, target_example_5,
                                      target_example_6])
        self.assertEqual(valid_col, test_col, msg='Should only return these {}'\
                         ' but has returned this {}'.format(valid_col, test_col))