def _detect_number_from_words(self, number_list=None, original_list=None):
        """
        Detect numbers written as words, for example - "two thousand", "One hundred twenty two".

        How it works?
            self.processed_text is split on '-' and ':' and every piece is passed to the number word
            detector (get_number_from_number_word), which returns the detected values together with
            the substrings they were detected from. When self.unit_type is set, each substring is
            also passed through self._get_unit_from_text, which returns the unit and the (possibly
            updated) original substring.

        Args:
            number_list (list): list containing detected numeric text
            original_list (list): list containing original numeral text
        Returns:
            number_list (list): list containing updated detected numeric text
            original_list (list): list containing updated original numeral text

        Examples:
            [In]  >>  self.processed_text = "one hundred two"
            [In]  >>  _detect_number_from_words()
            [Out] >> ([{'value': '102', 'unit': None}], ['one hundred two'])

            [In]  >>  self.processed_text = "two hundred - three hundred"
            [In]  >>  _detect_number_from_words()
            [Out] >> ([{'value': '200', 'unit': None}, {'value': '300', 'unit': None}],
                      ['two hundred', 'three hundred'])

            [In]  >>  self.processed_text = "one two three"
            [In]  >>  _detect_number_from_words()
            [Out] >> ([{'value': '1', 'unit': None}, {'value': '2', 'unit': None}, {'value': '3', 'unit': None}],
                      ['one', 'two', 'three'])

            *Notes*
                Some Limitations:
                i) Cannot detect decimals without the integer part. E.g. .25, .5, etc
                ii) Cannot detect one with "a/an". E.g. I want an apple
                iii) Detects wrong for multiple scales mentioned consecutively E.g. three hundred thousand,
                     hundred thousand
        """
        number_list = number_list or []
        original_list = original_list or []

        # Splitting text based on "-" and ":",  as in case of text "two thousand-three thousand", simple splitting
        # will give list as [two, thousand-three, thousand], result in number word detector giving wrong result,
        # hence we need to separate them into [two thousand, three thousand] using '-' or ':' as split char
        numeral_text_list = re.split(r'[\-\:]', self.processed_text)
        for numeral_text in numeral_text_list:
            numbers, original_texts = get_number_from_number_word(
                numeral_text, self.numbers_word_map)
            for number, original_text in zip(numbers, original_texts):
                unit = None
                if self.unit_type:
                    unit, original_text = self._get_unit_from_text(
                        original_text, numeral_text)
                # Tag only the occurrence that was just consumed. Replacing every occurrence
                # would also wipe a later repeat of the same number word (e.g. "two ... two ...")
                # before its own unit lookup has had a chance to see it.
                numeral_text = numeral_text.replace(original_text, self.tag, 1)
                number_list.append({
                    NUMBER_DETECTION_RETURN_DICT_VALUE:
                    str(number),
                    NUMBER_DETECTION_RETURN_DICT_UNIT:
                    unit
                })
                original_list.append(original_text)
        return number_list, original_list
 def test_get_number_with_scale_and_unit_in_number_word(self):
     """
     A number word mixing scales and units is detected as a single number:
     'one thousand two hundred five' -> 1205
     """
     sentence = 'haptik get one thousand two hundred five messages daily'
     expected_pair = (1205, 'one thousand two hundred five')

     values, sources = get_number_from_number_word(sentence, self.number_words_map)
     detected_pairs = list(zip(values, sources))

     # Exactly one detection, and it is the expected (value, original text) pair.
     self.assertEqual(len(detected_pairs), 1)
     self.assertIn(expected_pair, detected_pairs)
 def test_get_number_with_only_scale_in_number_word(self):
     """
     A bare scale word with no leading unit is detected on its own: 'hundred' -> 100
     """
     sentence = 'need hundred change'
     expected_pair = (100, 'hundred')

     values, sources = get_number_from_number_word(sentence, self.number_words_map)
     detected_pairs = list(zip(values, sources))

     # Exactly one detection, and it is the expected (value, original text) pair.
     self.assertEqual(len(detected_pairs), 1)
     self.assertIn(expected_pair, detected_pairs)
 def test_get_number_with_only_unit_in_number_word(self):
     """
     A lone unit word inside a sentence is detected: 'one' -> 1
     """
     sentence = 'I want to book for one passenger'
     expected_pair = (1, 'one')

     values, sources = get_number_from_number_word(sentence, self.number_words_map)
     detected_pairs = list(zip(values, sources))

     # Exactly one detection, and it is the expected (value, original text) pair.
     self.assertEqual(len(detected_pairs), 1)
     self.assertIn(expected_pair, detected_pairs)
 def test_get_number_with_multiple_spaces_in_unit_scale_number_word(self):
     """
     Runs of spaces between number words do not break detection, and the original text is
     returned with its spacing intact: 'one thousand   one   hundred two' -> 1102
     """
     sentence = 'there are one thousand   one   hundred two students attending placement drive'
     expected_pair = (1102, 'one thousand   one   hundred two')

     values, sources = get_number_from_number_word(sentence, self.number_words_map)
     detected_pairs = list(zip(values, sources))

     # Exactly one detection, and it is the expected (value, original text) pair.
     self.assertEqual(len(detected_pairs), 1)
     self.assertIn(expected_pair, detected_pairs)
 def test_get_number_with_multiple_unit_scale_number_word(self):
     """
     Two unit-scale numbers separated by a non-number word are detected separately:
     'one thousand to one hundred two' -> 1000 and 102
     """
     sentence = 'one thousand to one hundred two'
     expected_pairs = (
         (1000, 'one thousand'),
         (102, 'one hundred two'),
     )

     values, sources = get_number_from_number_word(sentence, self.number_words_map)
     detected_pairs = list(zip(values, sources))

     self.assertEqual(len(detected_pairs), 2)
     for expected_pair in expected_pairs:
         self.assertIn(expected_pair, detected_pairs)
 def test_get_number_with_multiple_unit_number_word(self):
     """
     Consecutive unit words are detected as separate numbers: 'one two three' -> 1, 2 and 3
     """
     sentence = 'one two three'
     expected_pairs = (
         (1, 'one'),
         (2, 'two'),
         (3, 'three'),
     )

     values, sources = get_number_from_number_word(sentence, self.number_words_map)
     detected_pairs = list(zip(values, sources))

     self.assertEqual(len(detected_pairs), 3)
     for expected_pair in expected_pairs:
         self.assertIn(expected_pair, detected_pairs)
 def test_get_number_with_combination_of_unit_scale_and_unit_number_word(
         self):
     """
     A unit-scale number followed by plain unit words yields one combined number and then the
     individual units: 'one thousand one two three' -> 1001, 2 and 3
     """
     sentence = 'one thousand one two three'
     expected_pairs = (
         (1001, 'one thousand one'),
         (2, 'two'),
         (3, 'three'),
     )

     values, sources = get_number_from_number_word(sentence, self.number_words_map)
     detected_pairs = list(zip(values, sources))

     self.assertEqual(len(detected_pairs), 3)
     for expected_pair in expected_pairs:
         self.assertIn(expected_pair, detected_pairs)
Пример #9
0
    def _detect_number_from_words(self, number_list=None, original_list=None):
        """
        Detect numbers written as words, for example - "two thousand", "One hundred twenty two",
        and record where each detected text sits in self.text.

        How it works?
            self.processed_text is split on '-' and ':' and every piece is passed to the number word
            detector (get_number_from_number_word), which returns the detected values together with
            the substrings they were detected from. Each substring is then located in self.text,
            scanning left to right, to compute its span. When self.unit_type is set, each substring
            is also passed through self._get_unit_from_text, which returns the unit and the (possibly
            updated) original substring. A detection is kept only if its text is still present in
            the piece at a boundary defined by self._SPAN_BOUNDARY_TEMPLATE.

        Args:
            number_list (list): list containing detected numeric text
            original_list (list): list containing original numeral text
        Returns:
            number_list (list): list containing updated detected numeric text; every dict carries
                the value (as str), the unit and the span. The span is a (start, end) tuple, or
                None when the detected text could not be located in self.text.
            original_list (list): list containing updated original numeral text

        Examples (span entries omitted for brevity):
            [In]  >>  self.processed_text = "one hundred two"
            [In]  >>  _detect_number_from_words()
            [Out] >> ([{'value': '102', 'unit': None}], ['one hundred two'])

            [In]  >>  self.processed_text = "two hundred - three hundred"
            [In]  >>  _detect_number_from_words()
            [Out] >> ([{'value': '200', 'unit': None}, {'value': '300', 'unit': None}],
                      ['two hundred', 'three hundred'])

            [In]  >>  self.processed_text = "one two three"
            [In]  >>  _detect_number_from_words()
            [Out] >> ([{'value': '1', 'unit': None}, {'value': '2', 'unit': None}, {'value': '3', 'unit': None}],
                      ['one', 'two', 'three'])

            *Notes*
                Some Limitations:
                i) Cannot detect decimals without the integer part. E.g. .25, .5, etc
                ii) Cannot detect one with "a/an". E.g. I want an apple
                iii) Detects wrong for multiple scales mentioned consecutively E.g. three hundred thousand,
                     hundred thousand
        """
        number_list = number_list or []
        original_list = original_list or []
        # NOTE(review): the running offset starts at -1, so every span is shifted one character to
        # the left of the match position in self.text. Presumably this compensates for a padding
        # character at the start of self.text -- confirm against the caller.
        end_span = -1
        spanned_text = self.text

        # Splitting text based on "-" and ":",  as in case of text "two thousand-three thousand", simple splitting
        # will give list as [two, thousand-three, thousand], result in number word detector giving wrong result,
        # hence we need to separate them into [two thousand, three thousand] using '-' or ':' as split char
        numeral_text_list = re.split(r'[\-\:]', self.processed_text)
        for numeral_text in numeral_text_list:
            numbers, original_texts = get_number_from_number_word(
                numeral_text, self.numbers_word_map)

            # Spans are collected per piece. A single list shared across pieces would be zipped
            # from its start every time, pairing later pieces with the spans of earlier ones.
            spans = []
            for original in original_texts:
                # Literal search: the detected text is data, not a regular expression.
                position = spanned_text.find(original)
                if position < 0:
                    # Text not present in the remaining part of self.text: keep the detection
                    # but report no span instead of failing.
                    spans.append(None)
                    continue
                match_end = position + len(original)
                spans.append((end_span + position, end_span + match_end))
                end_span += match_end
                # Continue scanning after this match so repeated words map to later positions.
                spanned_text = spanned_text[match_end:]

            # Detections are processed in detection order.
            for number, original_text, span in zip(numbers, original_texts, spans):
                unit = None
                if self.unit_type:
                    unit, original_text = self._get_unit_from_text(
                        original_text, numeral_text)
                _pattern = re.compile(self._SPAN_BOUNDARY_TEMPLATE.format(
                    re.escape(original_text)),
                                      flags=_re_flags)
                if _pattern.search(numeral_text):
                    # Tag only the first occurrence so a later repeat can still be matched.
                    numeral_text = _pattern.sub(self.tag, numeral_text, 1)
                    number_list.append({
                        NUMBER_DETECTION_RETURN_DICT_VALUE:
                        str(number),
                        NUMBER_DETECTION_RETURN_DICT_UNIT:
                        unit,
                        NUMBER_DETECTION_RETURN_DICT_SPAN:
                        span
                    })
                    original_list.append(original_text)
        return number_list, original_list