def add_numeric_values_to_questions(interaction):
    """Normalizes each question's text and annotates it with numeric spans.

    For every entry of `interaction.questions`, the normalized form of
    `original_text` is stored in `text`, and `annotations` is overwritten
    (via `CopyFrom`) with a `NumericValueSpans` message holding the spans
    parsed from that normalized text.

    Args:
      interaction: Object exposing a `questions` collection; it is modified
        in place and nothing is returned.
    """
    for q in interaction.questions:
        q.text = text_utils.normalize_for_match(q.original_text)
        # Parse the text that was just stored so the span offsets refer to
        # the same string that ends up in `q.text`.
        parsed_spans = number_utils.parse_text(q.text)
        numeric_spans = interaction_pb2.NumericValueSpans(spans=parsed_spans)
        q.annotations.CopyFrom(numeric_spans)
def _add_numeric_reference_from_cell(
    cell,
    references,
    row_index,
    column_index,
):
    """Registers number and date references for a fully numeric cell.

    The cell text is normalized and parsed. Only spans whose length equals
    the length of the whole normalized text are used; every value of such a
    span, except values that are numerically one, is converted to an
    identifier and added to `references`.

    Args:
      cell: Object with a `text` attribute.
      references: Collection handed to `_add_identifier` and updated by it.
      row_index: Row position forwarded to `_add_identifier`.
      column_index: Column position forwarded to `_add_identifier`.
    """
    normalized = text_utils.normalize_for_match(cell.text)
    full_length = len(normalized)
    for span in number_utils.parse_text(normalized):
        # A span that covers only part of the cell is ignored.
        if span.end_index - span.begin_index != full_length:
            continue
        for value in span.values:
            # One is special because of singular/plural and the pronoun,
            # so it never becomes a reference.
            if not _is_numerically_one(value):
                identifier, reference_type = _to_identifier(normalized, value)
                _add_identifier(
                    identifier,
                    reference_type,
                    cell.text,
                    references,
                    row_index,
                    column_index,
                )
def test_parse_range(self):
    """Checks that both ends of a year range are parsed as separate spans."""
    # Each end of the range is expected both as a float and as a year date.
    expected_spans = """ spans { begin_index: 0 end_index: 4 values { float_value: 2005. } values { date { year: 2005 } } } spans { begin_index: 5 end_index: 9 values { float_value: 2010. } values { date { year: 2010 } } } """
    expected = _get_spans(expected_spans)
    actual = number_utils.parse_text('2005-2010')
    self.assertEqual(expected, actual)
def test_dont_parse_weird_examples(self):
    """Checks that malformed numbers are never parsed as one whole number."""
    weird_texts = ('1....', '1,,,,,,', '10000,..', '-.1,,,,00,20')
    for text in weird_texts:
        parsed = number_utils.parse_text(text)
        # Some part of the text must still be recognized ...
        self.assertNotEmpty(parsed)
        # ... but no single span may stretch over the entire text.
        whole_text = (0, len(text))
        for span in parsed:
            covered = (span.begin_index, span.end_index)
            self.assertNotEqual(covered, whole_text, text)
def test_parse_complete_dates(self, text, day, month, year):
    """Checks that `text` parses to one span holding a full date.

    The single expected span covers the whole of `text` and carries exactly
    one value: a date built from `year`, `month` and `day`.
    NOTE(review): the arguments are presumably supplied by a parameterizing
    decorator that is not visible here.
    """
    expected_date = interaction_pb2.Date(year=year, month=month, day=day)
    expected_value = interaction_pb2.NumericValue(date=expected_date)
    expected_span = interaction_pb2.NumericValueSpan(
        begin_index=0,
        end_index=len(text),
        values=[expected_value],
    )
    self.assertEqual([expected_span], number_utils.parse_text(text))
def test_parse_text(self):
    """Checks span offsets and values for a sentence mixing numbers and dates.

    The input holds two comma-grouped numbers (the second one preceded by a
    dollar sign that is not part of its span), two month-year dates and a
    bare year that is expected both as a float and as a date.
    """
    expected_spans = """ spans { begin_index: 0 end_index: 9 values { float_value: 1000000.0 } } spans { begin_index: 12 end_index: 18 values { float_value: 10000.0 } } spans { begin_index: 19 end_index: 32 values { date { year: 1846 month: 11 } } } spans { begin_index: 36 end_index: 49 values { date { year: 1847 month: 2 } } } spans { begin_index: 53 end_index: 57 values { float_value: 1908.0 } values { date { year: 1908 } } }"""
    expected = _get_spans(expected_spans)
    sentence = '1,000,000, $10,000 November 1846 to February 1847 in 1908'
    actual = number_utils.parse_text(sentence)
    self.assertEqual(expected, actual)
def _get_question_references(question):
    """Converts numeric and entity annotations in question to references.

    Numeric references come from parsing the normalized form of
    `question.original_text`; every value of every span, except values that
    are numerically one, is turned into an identifier. Entity references come
    from the `annotated_question_ext` extension of the question.

    Args:
      question: Object with an `original_text` attribute and an `Extensions`
        mapping that contains the annotated-text extension.

    Returns:
      The dict of references filled in by `_add_identifier`.
    """
    references = {}

    # Span offsets index into the string that was parsed, i.e. the normalized
    # text. The matched text is therefore sliced from that same string;
    # slicing `original_text` with these offsets yields the wrong substring
    # whenever normalization shifts character positions.
    normalized_text = text_utils.normalize_for_match(question.original_text)
    for span in number_utils.parse_text(normalized_text):
        text = normalized_text[span.begin_index:span.end_index]
        for value in span.values:
            if _is_numerically_one(value):
                # One is special because of singular/plural and the pronoun.
                continue
            identifier, reference_type = _to_identifier(text, value)
            _add_identifier(
                identifier,
                reference_type,
                text,
                references,
                span.begin_index,
                span.end_index,
            )

    annotated_text = question.Extensions[
        annotated_text_pb2.AnnotatedText.annotated_question_ext]
    for annotation in annotated_text.annotations:
        # NOTE(review): these fields are named as byte offsets but are used
        # to slice a str; confirm they agree for non-ASCII questions.
        begin_index = annotation.begin_byte_index
        end_index = annotation.end_byte_index
        _add_identifier(
            annotation.identifier,
            ReferenceType.ENTITY,
            question.original_text[begin_index:end_index],
            references,
            begin_index,
            end_index,
        )
    return references
def _get_numeric_values(text):
    """Parses `text` and returns an iterator over every numeric value found.

    The `values` of all spans returned by `number_utils.parse_text` are
    chained together in span order.
    """
    # The per-span value collections are gathered up front and then unpacked,
    # so the spans are walked once before the iterator is returned.
    value_groups = [span.values for span in number_utils.parse_text(text)]
    return itertools.chain(*value_groups)
def test_parse_numerals(self, text):
    """Checks that `text` parses to one whole-text span with the value 12.

    NOTE(review): `text` is presumably supplied by a parameterizing decorator
    that is not visible here.
    """
    twelve = interaction_pb2.NumericValue(float_value=12)
    expected_span = interaction_pb2.NumericValueSpan(
        begin_index=0,
        end_index=len(text),
        values=[twelve],
    )
    self.assertEqual([expected_span], number_utils.parse_text(text))
def test_ignore_nans(self):
    """Checks that 'inf' and 'nan'-like words produce no numeric spans."""
    # The last entry is a name that merely starts with "Nan".
    for text in ('inf', 'nan', 'Nan Hayworth'):
        self.assertEmpty(number_utils.parse_text(text))