예제 #1
0
 def _normalize(self):
     new_dict = {}
     conflicting_keys = []
     for key, value in self._dictionary.items():
         normalized = normalize_unicode(key)
         if key != normalized and normalized in self._dictionary:
             conflicting_keys.append(key)
         else:
             new_dict[normalized] = value
     for key in conflicting_keys:
         normalized = normalize_unicode(key)
         if key in (self.info.get('skip', []) + self.info.get('pertain', [])):
             new_dict[normalized] = self._dictionary[key]
     self._dictionary = new_dict
예제 #2
0
    def _generate_simplifications(self, normalize=False):
        simplifications = []
        for simplification in self.info.get('simplifications', []):
            c_simplification = {}
            key, value = list(simplification.items())[0]
            if normalize:
                key = normalize_unicode(key)

            if isinstance(value, int):
                c_simplification[key] = str(value)
            else:
                c_simplification[key] = normalize_unicode(value) if normalize else value

            simplifications.append(c_simplification)
        return simplifications
예제 #3
0
    def _generate_simplifications(self, normalize=False):
        simplifications = []
        for simplification in self.info.get('simplifications', []):
            c_simplification = {}
            key, value = list(simplification.items())[0]
            if normalize:
                key = normalize_unicode(key)

            if isinstance(value, int):
                c_simplification[key] = str(value)
            else:
                c_simplification[key] = normalize_unicode(value) if normalize else value

            simplifications.append(c_simplification)
        return simplifications
예제 #4
0
    def get_date_data(self, date_string, date_formats=None):
        """
        Parse a string holding a date and/or time written in a recognizable localized format.
        Multiple languages and timezones are supported.

        :param date_string:
            A string representing date and/or time in a recognizably valid format.
        :type date_string: str|unicode
        :param date_formats:
            A list of format strings using directives as given
            `here <https://docs.python.org/2/library/datetime.html#strftime-and-strptime-behavior>`_.
            The parser applies formats one by one, taking into account the detected languages.
        :type date_formats: list

        :return: a dict mapping keys to :mod:`datetime.datetime` object and *period*. For example:
            {'date_obj': datetime.datetime(2015, 6, 1, 0, 0), 'period': u'day'}
            When no applicable language yields a result, ``date_obj`` is ``None``
            and ``period`` is ``'day'``.

        :raises: ValueError - Unknown Language
        :raises: TypeError - ``date_string`` has no ``strip`` method (not a string)

        .. note:: *Period* values can be a 'day' (default), 'week', 'month', 'year'.

        *Period* represents the granularity of date parsed from the given string.

        In the example below, since no day information is present, the day is assumed to be current
        day ``16`` from *current date* (which is June 16, 2015, at the moment of writing this).
        Hence, the level of precision is ``month``:

            >>> DateDataParser().get_date_data(u'March 2015')
            {'date_obj': datetime.datetime(2015, 3, 16, 0, 0), 'period': u'month'}

        Similarly, for date strings with no day and month information present, level of precision
        is ``year`` and day ``16`` and month ``6`` are from *current_date*.

            >>> DateDataParser().get_date_data(u'2014')
            {'date_obj': datetime.datetime(2014, 6, 16, 0, 0), 'period': u'year'}

        Dates with time zone indications or UTC offsets are returned in UTC time unless
        specified using `Settings`_.

            >>> DateDataParser().get_date_data(u'23 March 2000, 1:21 PM CET')
            {'date_obj': datetime.datetime(2000, 3, 23, 14, 21), 'period': 'day'}

        """
        try:
            text = date_string.strip()
        except AttributeError:
            raise TypeError('Input type must be str or unicode')

        if self._settings.NORMALIZE:
            text = normalize_unicode(text)
        text = sanitize_date(text)

        candidates = self.language_detector.iterate_applicable_languages(
            text, modify=True, settings=self._settings)
        for language in candidates:
            result = _DateLanguageParser.parse(
                language, text, date_formats, settings=self._settings)
            if result:
                return result

        # No applicable language produced a result.
        return {'date_obj': None, 'period': 'day'}
예제 #5
0
 def test_dates_parsing_with_normalization(self, date_string, expected):
     # Parses a pre-normalized date string with NORMALIZE switched on and a
     # fixed RELATIVE_BASE, then runs the then_* checks with period "day"
     # and the expected date object.
     self.given_local_tz_offset(0)
     parser_settings = {
         "NORMALIZE": True,
         "RELATIVE_BASE": datetime(2012, 11, 13),
     }
     self.given_parser(settings=parser_settings)
     normalized_input = normalize_unicode(date_string)
     self.when_date_is_parsed(normalized_input)
     self.then_date_was_parsed_by_date_parser()
     self.then_period_is("day")
     self.then_date_obj_exactly_is(expected)
예제 #6
0
    def is_applicable(self, date_string, strip_timezone=False, settings=None):
        """
        Tell whether this locale can be used to translate the date string.

        The string has its numerals translated, is optionally unicode-normalized
        (``settings.NORMALIZE``) and simplified, then split by the locale
        dictionary; the resulting tokens are validated by that dictionary.

        :param date_string:
            A string representing date and/or time in a recognizably valid format.
        :type date_string: str

        :param strip_timezone:
            If True, timezone is stripped from date string.
        :type strip_timezone: bool

        :param settings:
            Settings object; ``settings.NORMALIZE`` is read unconditionally, so
            ``None`` is presumably replaced by the caller or a decorator
            before this body runs (TODO confirm).

        :return: boolean value representing if the locale is applicable for the date string or not.
        """
        if strip_timezone:
            date_string, _offset = pop_tz_offset_from_string(
                date_string, as_offset=False)

        text = self._translate_numerals(date_string)
        if settings.NORMALIZE:
            text = normalize_unicode(text)
        text = self._simplify(text, settings=settings)

        locale_dictionary = self._get_dictionary(settings)
        return locale_dictionary.are_tokens_valid(locale_dictionary.split(text))
예제 #7
0
 def when_all_languages_are_detected(self, date_strings, modify=False):
     """
     Run language detection on each date string and store the last result.

     Every string is unicode-normalized first when ``settings.NORMALIZE`` is
     set.  ``self.detected_languages`` ends up holding the languages yielded
     for the LAST string only (earlier results are overwritten), or an empty
     list when ``date_strings`` is empty.

     :param date_strings: iterable of date strings; must not be a single string.
     :param modify: forwarded to ``self.parser.iterate_applicable_languages``.
     """
     assert not isinstance(date_strings, six.string_types)
     # Start with an empty result so that an empty ``date_strings`` does not
     # fail with UnboundLocalError on the final assignment.
     detected_languages = []
     for date_string in date_strings:
         if settings.NORMALIZE:
             date_string = normalize_unicode(date_string)
         detected_languages = list(self.parser.iterate_applicable_languages(
             date_string, modify=modify, settings=settings))
     self.detected_languages = detected_languages
예제 #8
0
 def when_all_languages_are_detected(self, date_strings, modify=False):
     """
     Detect applicable languages for each date string, keeping the last result.

     When ``settings.NORMALIZE`` is set each string is passed through
     ``normalize_unicode`` before detection.  The languages yielded for the
     final string are stored in ``self.detected_languages``; results for
     earlier strings are overwritten.  An empty ``date_strings`` stores ``[]``.

     :param date_strings: iterable of date strings; must not be a single string.
     :param modify: forwarded to ``self.parser.iterate_applicable_languages``.
     """
     assert not isinstance(date_strings, six.string_types)
     # Default result: without it an empty iterable would leave the name
     # unbound and raise UnboundLocalError below.
     detected_languages = []
     for date_string in date_strings:
         if settings.NORMALIZE:
             date_string = normalize_unicode(date_string)
         language_iterator = self.parser.iterate_applicable_languages(
             date_string, modify=modify, settings=settings)
         detected_languages = list(language_iterator)
     self.detected_languages = detected_languages
예제 #9
0
 def test_dates_parsing_with_normalization(self, date_string, expected):
     # With NORMALIZE enabled and RELATIVE_BASE pinned to 2012-11-13, parse
     # the unicode-normalized input and run the then_* checks for period
     # 'day' and the expected date object.
     self.given_local_tz_offset(0)
     parser_settings = {
         'NORMALIZE': True,
         'RELATIVE_BASE': datetime(2012, 11, 13),
     }
     self.given_parser(settings=parser_settings)
     normalized_input = normalize_unicode(date_string)
     self.when_date_is_parsed(normalized_input)
     self.then_date_was_parsed_by_date_parser()
     self.then_period_is('day')
     self.then_date_obj_exactly_is(expected)
예제 #10
0
 def test_dates_parsing_with_normalization(self, date_string, expected):
     # Passes 2012-11-13 to given_utcnow, enables NORMALIZE, parses the
     # unicode-normalized input and runs the then_* checks for period 'day'
     # and the expected date object.
     frozen_now = datetime(2012, 11, 13)  # Tuesday
     self.given_utcnow(frozen_now)
     self.given_local_tz_offset(0)
     parser_settings = {'NORMALIZE': True}
     self.given_parser(settings=parser_settings)
     normalized_input = normalize_unicode(date_string)
     self.when_date_is_parsed(normalized_input)
     self.then_date_was_parsed_by_date_parser()
     self.then_period_is('day')
     self.then_date_obj_exactly_is(expected)
예제 #11
0
    def _simplify_split_align(self, original, settings):
        """
        Split a string and its simplified form into two token lists of equal length.

        ``original`` is word-split as is; a second token list is obtained by
        word-splitting the result of ``self._simplify`` applied to the
        unicode-normalized ``original``.  When the two lists differ in length,
        empty strings are inserted into the shorter one, and surplus empty
        strings are removed afterwards, until both have the same length.

        :param original: the string to split.
        :param settings: forwarded to ``self._word_split`` and ``self._simplify``.

        :return: tuple ``(original_tokens, simplified_tokens)``.
        """
        # TODO: Switch to new split method.
        original_tokens = self._word_split(original, settings=settings)
        simplified_tokens = self._word_split(self._simplify(
            normalize_unicode(original), settings=settings),
                                             settings=settings)
        # Already aligned: nothing to pad.
        if len(original_tokens) == len(simplified_tokens):
            return original_tokens, simplified_tokens

        elif len(original_tokens) < len(simplified_tokens):
            # Fewer original tokens: walk the simplified tokens and pad
            # original_tokens with '' where the two lists stop matching.
            # add_empty makes the first mismatch after a match only set the
            # flag; following mismatches insert '' until a match resets it.
            add_empty = False
            for i, token in enumerate(simplified_tokens):
                # original_tokens grows while this loop runs, so the bound is
                # checked against its current length.
                if i < len(original_tokens):
                    if token == normalize_unicode(original_tokens[i].lower()):
                        add_empty = False
                    else:
                        if not add_empty:
                            add_empty = True
                            continue
                        else:
                            original_tokens.insert(i, '')
                else:
                    # Past the end of original_tokens: pad at this position.
                    original_tokens.insert(i, '')
        else:
            # Mirror case: more original tokens, so simplified_tokens is padded
            # while walking the original tokens.
            add_empty = False
            for i, token in enumerate(original_tokens):
                if i < len(simplified_tokens):
                    if normalize_unicode(
                            token.lower()) == simplified_tokens[i]:
                        add_empty = False
                    else:
                        if not add_empty:
                            add_empty = True
                            continue
                        else:
                            simplified_tokens.insert(i, '')
                else:
                    # Past the end of simplified_tokens: pad at this position.
                    simplified_tokens.insert(i, '')

        # Padding can overshoot: drop '' entries (first occurrence each time)
        # from the longer list until the lengths agree.
        # NOTE(review): list.remove raises ValueError if the longer list holds
        # no '' at that point -- confirm this cannot happen.
        while len(original_tokens) != len(simplified_tokens):
            if len(original_tokens) > len(simplified_tokens):
                original_tokens.remove('')
            else:
                simplified_tokens.remove('')
        return original_tokens, simplified_tokens
 def test_normalized_relative_dates(self, date_string, ago, period):
     # Feeds the unicode-normalized date string to a parser created with
     # NORMALIZE enabled, then runs the then_* checks with ``ago`` and
     # ``period``.
     normalized_input = normalize_unicode(date_string)
     parser_settings = {'NORMALIZE': True}
     self.given_parser(settings=parser_settings)
     self.given_date_string(normalized_input)
     self.when_date_is_parsed()
     self.then_error_was_not_raised()
     self.then_date_was_parsed_by_freshness_parser()
     self.then_date_obj_is_exactly_this_time_ago(ago)
     self.then_period_is(period)
 def test_normalized_relative_dates(self, date_string, ago, period):
     # The date string is unicode-normalized up front and handed to a parser
     # created with NORMALIZE enabled; the then_* checks receive ``ago`` and
     # ``period``.
     normalized_text = normalize_unicode(date_string)
     self.given_parser(settings=dict([('NORMALIZE', True)]))
     self.given_date_string(normalized_text)
     self.when_date_is_parsed()
     self.then_error_was_not_raised()
     self.then_date_was_parsed_by_freshness_parser()
     self.then_date_obj_is_exactly_this_time_ago(ago)
     self.then_period_is(period)
예제 #14
0
    def is_applicable(self, date_string, strip_timezone=False, settings=None):
        """
        Check whether this language can be applied to the date string.

        The string is optionally unicode-normalized (``settings.NORMALIZE``),
        optionally stripped of its timezone offset, simplified and split.

        :param date_string: the date string to check.
        :param strip_timezone:
            If True, the timezone offset is popped from the string first.
        :param settings:
            Settings object; ``settings.NORMALIZE`` is read unconditionally, so
            ``None`` is presumably replaced before this body runs (TODO confirm).

        :return:
            True when ``_is_date_consists_of_digits_only`` accepts the tokens,
            otherwise the result of ``_are_all_words_in_the_dictionary``.
        """
        text = date_string
        if settings.NORMALIZE:
            text = normalize_unicode(text)
        if strip_timezone:
            text, _offset = pop_tz_offset_from_string(text, as_offset=False)

        simplified = self._simplify(text, settings=settings)
        tokens = self._split(simplified, keep_formatting=False, settings=settings)

        # Guard clause: an all-digit date needs no dictionary lookup.
        if self._is_date_consists_of_digits_only(tokens):
            return True
        return self._are_all_words_in_the_dictionary(tokens, settings)
예제 #15
0
    def translate(self, date_string, keep_formatting=False, settings=None):
        """
        Replace every word of the date string found in the language dictionary.

        The string is optionally unicode-normalized (``settings.NORMALIZE``),
        simplified and split; each lower-cased token that is a dictionary entry
        is replaced by its dictionary value (or ``''`` when that value is
        falsy).  Empty tokens are dropped before joining.

        :param date_string: the date string to translate.
        :param keep_formatting:
            Forwarded to ``_split``; also selects ``""`` instead of ``" "`` as
            the join separator.
        :param settings:
            Settings object; ``settings.NORMALIZE`` is read unconditionally, so
            ``None`` is presumably replaced before this body runs (TODO confirm).

        :return: the result of ``self._join`` over the remaining tokens.
        """
        text = date_string
        if settings.NORMALIZE:
            text = normalize_unicode(text)
        text = self._simplify(text, settings=settings)
        words = self._split(text, keep_formatting, settings=settings)

        dictionary = self._get_dictionary(settings)
        for index, token in enumerate(words):
            lowered = token.lower()
            if lowered in dictionary:
                # Falsy dictionary values turn into '' and are filtered below.
                words[index] = dictionary[lowered] or ''

        if keep_formatting:
            separator = ""
        else:
            separator = " "
        kept_words = list(filter(bool, words))
        return self._join(kept_words, separator=separator, settings=settings)
예제 #16
0
    def translate(self, date_string, keep_formatting=False, settings=None):
        """
        Translate the words of the date string through the language dictionary.

        After optional unicode normalization (``settings.NORMALIZE``) and
        simplification, the string is split; tokens whose lower-cased form is a
        dictionary entry are replaced by the dictionary value, or by ``''``
        when that value is falsy.  Empty tokens are removed and the rest joined.

        :param date_string: the date string to translate.
        :param keep_formatting:
            Forwarded to ``_split``; when True the tokens are joined with ``""``
            rather than ``" "``.
        :param settings:
            Settings object; ``settings.NORMALIZE`` is read unconditionally, so
            ``None`` is presumably replaced before this body runs (TODO confirm).

        :return: the result of ``self._join`` over the non-empty tokens.
        """
        source = normalize_unicode(date_string) if settings.NORMALIZE else date_string
        source = self._simplify(source, settings=settings)
        words = self._split(source, keep_formatting, settings=settings)

        dictionary = self._get_dictionary(settings)
        for position, raw_word in enumerate(words):
            candidate = raw_word.lower()
            if candidate not in dictionary:
                continue
            words[position] = dictionary[candidate] or ''

        non_empty = list(filter(bool, words))
        separator = "" if keep_formatting else " "
        return self._join(non_empty, separator=separator, settings=settings)
예제 #17
0
    def is_applicable(self, date_string, strip_timezone=False, settings=None):
        """
        Decide whether this language applies to the given date string.

        Steps: optional unicode normalization (``settings.NORMALIZE``), optional
        removal of the timezone offset, simplification and splitting.

        :param date_string: the date string to check.
        :param strip_timezone:
            If True, the timezone offset is popped from the string first.
        :param settings:
            Settings object; ``settings.NORMALIZE`` is read unconditionally, so
            ``None`` is presumably replaced before this body runs (TODO confirm).

        :return:
            True when ``_is_date_consists_of_digits_only`` accepts the tokens,
            otherwise the result of ``_are_all_words_in_the_dictionary``.
        """
        candidate = normalize_unicode(date_string) if settings.NORMALIZE else date_string
        if strip_timezone:
            candidate, _tz = pop_tz_offset_from_string(candidate, as_offset=False)

        candidate = self._simplify(candidate, settings=settings)
        tokens = self._split(candidate, keep_formatting=False, settings=settings)

        if not self._is_date_consists_of_digits_only(tokens):
            return self._are_all_words_in_the_dictionary(tokens, settings)
        return True
예제 #18
0
 def _best_language(self, date_string,  settings=None):
     """
     Return the shortname of the language that fits the date string best.

     ``self.character_check`` is called first; the string is then lower-cased
     and unicode-normalized.  With a single language that language is returned
     directly.  Otherwise ``count_applicability`` is asked for every language,
     first without stripping the timezone and, only if both counts are zero,
     again with the timezone stripped.  Languages with at least one positive
     count compete; the winner has the largest ``(count[0], count[1])`` pair.

     :param date_string: the date string to analyse.
     :param settings: forwarded to ``character_check`` and ``count_applicability``.

     :return: the winning language shortname, or None when no language applies.
     """
     self.character_check(date_string, settings)
     lowered = normalize_unicode(date_string.lower())
     if len(self.languages) == 1:
         return self.languages[0].shortname

     scored = []
     for language in self.languages:
         # Try with the timezone kept first; strip it only as a fallback.
         for strip in (False, True):
             counts = language.count_applicability(
                 lowered, strip_timezone=strip, settings=settings)
             if counts[0] > 0 or counts[1] > 0:
                 scored.append((language.shortname, counts))
                 break

     if not scored:
         return None
     best = max(scored, key=lambda p: (p[1][0], p[1][1]))
     return best[0]
예제 #19
0
 def _best_language(self, date_string, settings=None):
     """
     Pick the language whose applicability counts are highest for the string.

     After ``self.character_check`` the string is lower-cased and
     unicode-normalized.  A single configured language is returned right away.
     Otherwise each language is scored with ``count_applicability`` (timezone
     kept; retried with the timezone stripped only when both counts are zero)
     and kept when at least one count is positive.  The language with the
     largest ``(count[0], count[1])`` pair wins.

     :param date_string: the date string to analyse.
     :param settings: forwarded to ``character_check`` and ``count_applicability``.

     :return: the winning language shortname, or None when no language applies.
     """
     def has_hits(counts):
         return counts[0] > 0 or counts[1] > 0

     self.character_check(date_string, settings)
     prepared = normalize_unicode(date_string.lower())
     if len(self.languages) == 1:
         return self.languages[0].shortname

     candidates = []
     for language in self.languages:
         counts = language.count_applicability(
             prepared, strip_timezone=False, settings=settings)
         if not has_hits(counts):
             # Fallback: score again with the timezone stripped.
             counts = language.count_applicability(
                 prepared, strip_timezone=True, settings=settings)
             if not has_hits(counts):
                 continue
         candidates.append((language.shortname, counts))

     if candidates:
         return max(candidates, key=lambda p: (p[1][0], p[1][1]))[0]
     return None
예제 #20
0
    def translate(self, date_string, keep_formatting=False, settings=None):
        """
        Translate the date string to its English equivalent.

        Numerals are translated first; the string is then optionally
        unicode-normalized (``settings.NORMALIZE``), simplified and split by the
        locale dictionary.  Each lower-cased token is run through the relative
        translation patterns and then through the dictionary.

        :param date_string:
            A string representing date and/or time in a recognizably valid format.
        :type date_string: str

        :param keep_formatting:
            If True, retain formatting of the date string after translation.
        :type keep_formatting: bool

        :param settings:
            Settings object; ``settings.NORMALIZE`` is read unconditionally, so
            ``None`` is presumably replaced before this body runs (TODO confirm).

        :return: translated date string.
        """
        text = self._translate_numerals(date_string)
        if settings.NORMALIZE:
            text = normalize_unicode(text)
        text = self._simplify(text, settings=settings)
        dictionary = self._get_dictionary(settings)
        tokens = dictionary.split(text, keep_formatting)

        relative_translations = self._get_relative_translations(
            settings=settings)

        for position, token in enumerate(tokens):
            lowered = token.lower()
            for pattern, replacement in relative_translations.items():
                if pattern.match(lowered):
                    tokens[position] = pattern.sub(replacement, lowered)
            # The pattern loop never breaks, so the dictionary lookup always
            # runs and takes precedence when the word is a dictionary entry.
            if lowered in dictionary:
                tokens[position] = dictionary[lowered] or ''

        if "in" in tokens:
            tokens = self._clear_future_words(tokens)

        separator = "" if keep_formatting else " "
        non_empty = list(filter(bool, tokens))
        return self._join(non_empty, separator=separator, settings=settings)
예제 #21
0
    def get_date_data(self, date_string, date_formats=None):
        """
        Parse a date and/or time string given in a recognizable localized format.
        Handles multiple languages and timezones.

        :param date_string:
            A string representing date and/or time in a recognizably valid format.
        :type date_string: str|unicode
        :param date_formats:
            A list of format strings using directives as given
            `here <https://docs.python.org/2/library/datetime.html#strftime-and-strptime-behavior>`_.
            The parser applies formats one by one, taking into account the detected languages.
        :type date_formats: list

        :return: a dict mapping keys to :mod:`datetime.datetime` object and *period*. For example:
            {'date_obj': datetime.datetime(2015, 6, 1, 0, 0), 'period': u'day'}
            If none of the applicable languages produces a result, ``date_obj``
            is ``None`` and ``period`` is ``'day'``.

        :raises: ValueError - Unknown Language
        :raises: TypeError - ``date_string`` has no ``strip`` method (not a string)

        .. note:: *Period* values can be a 'day' (default), 'week', 'month', 'year'.

        *Period* represents the granularity of date parsed from the given string.

        In the example below, since no day information is present, the day is assumed to be current
        day ``16`` from *current date* (which is June 16, 2015, at the moment of writing this).
        Hence, the level of precision is ``month``:

            >>> DateDataParser().get_date_data(u'March 2015')
            {'date_obj': datetime.datetime(2015, 3, 16, 0, 0), 'period': u'month'}

        Similarly, for date strings with no day and month information present, level of precision
        is ``year`` and day ``16`` and month ``6`` are from *current_date*.

            >>> DateDataParser().get_date_data(u'2014')
            {'date_obj': datetime.datetime(2014, 6, 16, 0, 0), 'period': u'year'}

        Dates with time zone indications or UTC offsets are returned in UTC time unless
        specified using `Settings`_.

            >>> DateDataParser().get_date_data(u'23 March 2000, 1:21 PM CET')
            {'date_obj': datetime.datetime(2000, 3, 23, 14, 21), 'period': 'day'}

        """
        try:
            cleaned = date_string.strip()
        except AttributeError:
            raise TypeError('Input type must be str or unicode')

        if self._settings.NORMALIZE:
            cleaned = normalize_unicode(cleaned)
        cleaned = sanitize_date(cleaned)

        for language in self.language_detector.iterate_applicable_languages(
                cleaned, modify=True, settings=self._settings):
            outcome = _DateLanguageParser.parse(language,
                                                cleaned,
                                                date_formats,
                                                settings=self._settings)
            if not outcome:
                continue
            return outcome

        # Every applicable language was tried without success.
        return {'date_obj': None, 'period': 'day'}
예제 #22
0
 def given_string(self, datetime_string):
     """Store the input on ``self.datetime_string``, unicode-normalized when ``settings.NORMALIZE`` is set."""
     self.datetime_string = (
         normalize_unicode(datetime_string)
         if settings.NORMALIZE
         else datetime_string
     )
예제 #23
0
 def given_string(self, datetime_string):
     """Remember the string under test, applying ``normalize_unicode`` if ``settings.NORMALIZE`` is set."""
     if not settings.NORMALIZE:
         self.datetime_string = datetime_string
         return
     self.datetime_string = normalize_unicode(datetime_string)