def context_texts(
    context_data_dicts: List[Dict[str, Any]]
) -> Tuple[List[str], List[str]]:
    '''
    :param context_data_dicts: A list of dictionaries that each contain a
                               `text` and `spans` field.
    :return: A tuple of two lists: the left text context and the right
             text context of every dictionary, in input order.
    '''
    # NOTE(review): `self` is not a parameter here, it is looked up in the
    # enclosing scope -- presumably this is nested inside a method.

    def _all_occurrences(side):
        # `context` returns one string per target occurrence, so this is a
        # list of lists (one inner list per dictionary).
        return [
            context(target_data, side, inc_target=self.include_target)
            for target_data in context_data_dicts
        ]

    left_occurrences = _all_occurrences('left')
    right_occurrences = _all_occurrences('right')

    # A target mentioned more than once yields several contexts for a single
    # text. Only the first mention is kept, as the paper linked to this
    # method does not specify which occurrence was used.
    first_left = [occurrences[0] for occurrences in left_occurrences]
    first_right = [occurrences[0] for occurrences in right_occurrences]
    return first_left, first_right
def test_full_context(self):
    '''
    Tests :py:func:`bella.contexts.full_context`

    Calls `context` with the `full` context type on every single and multi
    target fixture and compares the result with the expected full texts.
    '''
    tweet_text = ('This is a fake news article that is to represent a '
                  'Tweet!!!! and it was an awful News Article I think.')
    day_text = ('I had a great Day however I did not get much '
                'work done in the day')

    expected_single = [
        ['This is a fake news article that is to represent a Tweet!!!!'],
        ['I had a great day however I did not get much work done'],
        ['I cycled in today and it was ok as it was not raining.'],
    ]
    # One full text is expected per target occurrence.
    expected_multi = [[tweet_text, tweet_text], [day_text, day_text]]

    def check(test_contexts, expected_texts, template):
        for position, data in enumerate(test_contexts):
            text = data['text']
            expected = expected_texts[position]
            found = context(data, 'full')
            failure_msg = template.format(text, found, expected)
            self.assertEqual(expected, found, msg=failure_msg)

    check(self.single_context, expected_single,
          'Cannot get the target for text {}, target found {} correct {}')
    check(self.multi_contexts, expected_multi,
          'Cannot get the targets for text {}, targets found {} correct {}')
def test_context(self):
    '''
    Tests :py:func:`bella.contexts._context`

    Checks that an unknown context type makes `context` raise a ValueError.
    '''
    failure_msg = ('Should only accept left, right '
                   'or target context words for parameters')
    unknown_context_type = 'itself'
    with self.assertRaises(ValueError, msg=failure_msg):
        context(self.single_context[0], unknown_context_type)
def test_right_context(self):
    '''
    Tests :py:func:`bella.contexts.right_context`

    Checks the `right` context of the single and multi target fixtures, both
    without and with the target included in the returned text.
    '''
    plain_template = ('Cannot get the right context of target {} text {} '
                      'which should be {} and not {}')
    with_target_template = ('Cannot get the right context of target {} '
                            'text {} including the target which should be '
                            '{} and not {}')

    def check(test_contexts, expected_contexts, inc_target):
        template = with_target_template if inc_target else plain_template
        for position, data in enumerate(test_contexts):
            text = data['text']
            target = data['target']
            expected = expected_contexts[position]
            found = context(data, 'right', inc_target=inc_target)
            failure_msg = template.format(target, text, expected, found)
            self.assertEqual(expected, found, msg=failure_msg)

    # Single target per text, target excluded.
    check(self.single_context,
          [[' that is to represent a Tweet!!!!'],
           [' however I did not get much work done'],
           [' in today and it was ok as it was not raining.']],
          False)

    # Single target per text, target included.
    check(self.single_context,
          [['news article that is to represent a Tweet!!!!'],
           ['day however I did not get much work done'],
           ['cycled in today and it was ok as it was not raining.']],
          True)

    # Multiple target occurrences per text, target excluded.
    check(self.multi_contexts,
          [[(' that is to represent a Tweet!!!! and it was an awful News'
             ' Article I think.'),
            ' I think.'],
           [' however I did not get much work done in the day', '']],
          False)

    # Multiple target occurrences per text, target included.
    check(self.multi_contexts,
          [[('news article that is to represent a Tweet!!!! and it was'
             ' an awful News Article I think.'),
            'News Article I think.'],
           ['Day however I did not get much work done in the day', 'day']],
          True)
def test_left_context(self):
    '''
    Tests :py:func:`bella.contexts.left_context`

    Checks the `left` context of the single and multi target fixtures, both
    without and with the target included in the returned text.
    '''
    plain_template = ('Cannot get the left context of target {} text {} '
                      'which should be {} and not {}')
    with_target_template = ('Cannot get the left context of target {} '
                            'text {} including the target which should be '
                            '{} and not {}')

    def check(test_contexts, expected_contexts, inc_target):
        template = with_target_template if inc_target else plain_template
        for position, data in enumerate(test_contexts):
            text = data['text']
            target = data['target']
            expected = expected_contexts[position]
            found = context(data, 'left', inc_target=inc_target)
            failure_msg = template.format(target, text, expected, found)
            self.assertEqual(expected, found, msg=failure_msg)

    # Single target per text, target excluded.
    check(self.single_context,
          [['This is a fake '], ['I had a great '], ['I ']],
          False)

    # Single target per text, target included.
    check(self.single_context,
          [['This is a fake news article'], ['I had a great day'],
           ['I cycled']],
          True)

    # Multiple target occurrences per text, target excluded.
    check(self.multi_contexts,
          [['This is a fake ',
            ('This is a fake news article that is to'
             ' represent a Tweet!!!! and it was an awful ')],
           ['I had a great ',
            ('I had a great Day however I did not get '
             'much work done in the ')]],
          False)

    # Multiple target occurrences per text, target included.
    check(self.multi_contexts,
          [['This is a fake news article',
            ('This is a fake news article '
             'that is to represent a Tweet!!!! and it was an awful '
             'News Article')],
           ['I had a great Day',
            ('I had a great Day however I did not get '
             'much work done in the day')]],
          True)
def exchange_targets(target_data: Dict[str, Any], target: str,
                     new_target_id: str) -> Dict[str, Any]:
    '''
    Builds an augmented copy of a single Target data point in which the
    target text has been swapped for the given target. The input dictionary
    is deep copied and is not modified.

    The copy has its `text`, `target`, `spans` and `target_id` fields
    replaced, gains an `augmented` = True field so that it is known to be an
    augmented data point, and records where it came from in the
    `original_target_id` field.

    :param target_data: The Target data dict whose text, spans, and target
                        fields are to be rewritten with the given target.
    :param target: The target string that replaces the existing target.
    :param new_target_id: A unique target identifier. This is required so
                          that the TargetCollection that is created for data
                          augmentation has new identifiers for each target.
                          The identifier of the target this one was created
                          from stays accessible through the
                          'original_target_id' field/key.
    :returns: The copied Target data dict with the target and its related
              data replaced with the given target.
    '''
    augmented = copy.deepcopy(target_data)

    # Only the context around the first target occurrence is used.
    text_before = context(augmented, 'left')[0]
    text_after = context(augmented, 'right')[0]
    new_text = text_before + target + text_after

    # The new target starts right after the left context.
    span_start = len(text_before)
    span_end = span_start + len(target)

    # When augmenting on top of an already augmented data point keep the
    # identifier of the very first target rather than the intermediate one.
    source_target_id = augmented.get('original_target_id',
                                     augmented['target_id'])

    augmented.update({
        'original_target_id': source_target_id,
        'text': new_text,
        'target': target,
        'spans': [(span_start, span_end)],
        'augmented': True,
        'target_id': new_target_id,
    })
    return augmented
def _json_to_instance(self, json_dict: JsonDict) -> Instance:
    """
    Expects JSON that looks like ``{"text": "...", "target": "...."}``.

    Converts that json object into an Instance through the dataset readers
    `text_to_instance` method. A `TargetDatasetReader` receives the text and
    the target; any other reader receives the left context, the right
    context and the target.
    """
    text, target = json_dict["text"], json_dict["target"]
    reader = self._dataset_reader

    if not isinstance(reader, TargetDatasetReader):
        # This allows the Target and TDLSTM models to use the same
        # predictors. Only the first target occurrence is used.
        include_target = reader.incl_target
        left_text = context(json_dict, 'left',
                            inc_target=include_target)[0]
        right_text = context(json_dict, 'right',
                             inc_target=include_target)[0]
        return reader.text_to_instance(left_text, right_text, target)

    return reader.text_to_instance(text, target)
def test_target_context(self):
    '''
    Tests :py:func:`bella.contexts.target_context`

    Calls `context` with the `target` context type on every single and multi
    target fixture and compares the result with the expected target words.
    '''
    expected_single = [['news article'], ['day'], ['cycled']]
    expected_multi = [['news article', 'News Article'], ['Day', 'day']]

    def check(test_contexts, expected_targets, template):
        for position, data in enumerate(test_contexts):
            text = data['text']
            expected = expected_targets[position]
            found = context(data, 'target')
            failure_msg = template.format(text, found, expected)
            self.assertEqual(expected, found, msg=failure_msg)

    check(self.single_context, expected_single,
          'Cannot get the target for text {}, target found {} correct {}')
    check(self.multi_contexts, expected_multi,
          'Cannot get the targets for text {}, targets found {} correct {}')
def _read(self, file_path):
    """
    Lazily reads a JSON-lines file and yields one instance per line.

    Every non-blank line is parsed as a JSON object. The `target` and
    `sentiment` keys are read from it, as is the optional `epoch_number`
    key. The left and right text around the first target occurrence are
    obtained through `context`, with `self.incl_target` deciding whether
    the target is part of those texts, and the sentiment is converted
    through `self.sentiment_mapper`.

    :param file_path: Path of the file to read, one JSON object per line.
    :returns: A generator over the values returned by
              `self.text_to_instance`.
    :raises KeyError: If a line lacks a required key or its sentiment is
                      not in `self.sentiment_mapper`.
    """
    # JSON is UTF-8 by specification, so do not rely on the platform's
    # default encoding when decoding the file.
    with open(file_path, "r", encoding="utf-8") as data_file:
        logger.info("Reading instances from lines in file at: %s", file_path)
        for line in data_file:
            # Blank and whitespace-only lines carry no data; skip them
            # instead of handing them to the JSON parser.
            line = line.strip()
            if not line:
                continue

            target_data = json.loads(line)
            target = target_data['target']
            # Only the context of the first target occurrence is used.
            left_text = context(target_data, 'left',
                                inc_target=self.incl_target)[0]
            right_text = context(target_data, 'right',
                                 inc_target=self.incl_target)[0]

            epoch_numbers = None
            if 'epoch_number' in target_data:
                epoch_numbers = list(target_data['epoch_number'])

            sentiment = self.sentiment_mapper[target_data['sentiment']]
            yield self.text_to_instance(left_text, right_text, target,
                                        sentiment, epoch_numbers)
def transform(self, target_dicts):
    '''
    Given a list of target dictionaries containing the spans of the targets
    and the texts that are about the targets, returns for each dictionary
    the contexts produced by `contexts.context` for the context type stored
    in `self.context`, passing `self.inc_target` along.

    :param target_dicts: list of dictionaries containing at least `spans` and \
    `text` keys.
    :type target_dicts: list
    :returns: a list with one entry per target dictionary, each entry being \
    the list of contexts returned for that dictionary.
    :rtype: list
    '''
    return [
        list(contexts.context(target_dict, self.context, self.inc_target))
        for target_dict in target_dicts
    ]