Example #1
    def test_normalize_extra(self):
        tests = [
            ('TeSt', 'test'),
            ('Côte-d\'Azur', 'cote d azur')
        ]

        for inp, out in tests:
            self.assertEqual(out, normalize(inp, Level.HARD))
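These tests rely on a `normalize(inp, level)` helper and a `Level` enum that are not shown in this listing. A minimal sketch consistent with the expected outputs here and in Example #3 could look as follows; the real implementation may differ, and the `Level.SOFT` behaviour in particular is a guess:

    import re
    from enum import Enum

    from unidecode import unidecode

    class Level(Enum):
        SOFT = 0    # keep case, diacritics and punctuation (assumption)
        NORMAL = 1  # transliterate diacritics and ligatures, keep case and punctuation
        HARD = 2    # transliterate, lowercase, turn punctuation into spaces

    def normalize(inp: str, level: Level) -> str:
        if level == Level.SOFT:
            return inp
        out = unidecode(inp)  # 'Côte' -> 'Cote', 'œ' -> 'oe', 'ħ' -> 'h'
        if level == Level.HARD:
            out = re.sub(r'[^\w\s]', ' ', out).lower()
        return out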
Example #2
    def match_capitalization_and_diacritic(abbrv: str, original: str) -> str:
        """Matches the capitalization and diacritics of the `original` word, as long as they are similar
        """

        abbrv = list(normalize(abbrv, Level.SOFT))
        for i, c in enumerate(abbrv):
            if i >= len(original):
                # e.g. a trailing '.' in the pattern has no counterpart in the word
                break
            unided = unidecode(original[i])
            if unidecode(c) in [unided.lower(), unided.upper()]:
                abbrv[i] = original[i]

        return ''.join(abbrv)
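As a quick illustration (calling the function as if it were module-level; in the library it is presumably a static method), the normalized abbreviation pattern gets the case and accents of the original word copied back onto it, character by character:

    match_capitalization_and_diacritic('reun.', 'Réunion')  # -> 'Réun.'
    match_capitalization_and_diacritic('cote', 'Côte')      # -> 'Côte'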
Example #3
    def test_normalize(self):
        tests = [
            ('test', 'test'),
            ('abbréviation', 'abbreviation'),
            ('ačaruli', 'acaruli'),
            ('għall', 'ghall'),
            ('chrysopœia', 'chrysopoeia'),
            ('Côte-d\'Azur', 'Cote-d\'Azur')
        ]

        for inp, out in tests:
            self.assertEqual(out, normalize(inp, Level.NORMAL))
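Assuming the `normalize`/`Level` sketch given under Example #1, the three levels compare as follows on the same input:

    assert normalize("Côte-d'Azur", Level.SOFT) == "Côte-d'Azur"    # untouched (assumed)
    assert normalize("Côte-d'Azur", Level.NORMAL) == "Cote-d'Azur"  # transliterated only
    assert normalize("Côte-d'Azur", Level.HARD) == 'cote d azur'    # lowercased, punctuation stripped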
Example #4
    def normalize(inp: str) -> str:
        """Normalization for the patterns (and keys): unidecode + lowercasing
        """

        # calls the module-level normalize(); this method shadows it inside
        # its class, so the call is not recursive
        return normalize(inp, Level.NORMAL).lower()
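Hypothetical usage (`Pattern` being the enclosing class, as the `Pattern.normalize(title)` call in Example #5 suggests): patterns and keys are compared without diacritics and case-insensitively, e.g. when looking up LTWA entries:

    Pattern.normalize("Côte-d'Azur")   # -> "cote-d'azur"
    Pattern.normalize('Abbréviation')  # -> 'abbreviation'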
Example #5
    def __call__(self,
                 title: str,
                 remove_part: bool = True,
                 langs: Optional[List[str]] = None) -> str:
        """Abbreviate a title according to the rules of Section 7 in the ISSN manual
        (https://www.issn.org/understanding-the-issn/assignment-rules/issn-manual/)

        TODO:
        - Section 7.1.2 (one word + qualifying information)
        - Section 7.1.3 (one word + supplement)
        - Section 7.1.7 (keep prepositions in expressions like "in vivo" and
          keep place/personal name intact, such as "Los Alamos")
        - Section 7.1.8 (acronyms and initialism)
        - Section 7.1.11 is unclear on whether PART should be kept or not
        """

        result = ''
        is_first = True

        title_soft_normalized = normalize(title, Level.SOFT)
        title_normalized = Pattern.normalize(title)

        lexer = Lexer(title_soft_normalized, self.stopwords)
        tokens = []
        prev_article = None

        # filter tokens
        for token in lexer.tokenize():
            # Remove all articles, as per Section 7.1.7
            if token.type == TokenType.ARTICLE:
                prev_article = token
                continue
            # Remove stopwords, unless the token comes first, as per Section 7.1.7
            elif token.type == TokenType.STOPWORD and not is_first:
                continue

            elif token.type == TokenType.SYMBOLS:
                # Omit commas and replace full stops with commas, as per Section 7.1.6
                # (the last replace removes what used to be an ellipsis)
                token.value = token.value.replace(',', '').replace(
                    '.', ',').replace(',,,', '')

                # remove & and + when they are used as "and", as per Section 7.1.10
                if token.value == '&':
                    continue

            # remove PART, as suggested by Section 7.1.11 (kept optional, since the rule is unclear)
            elif (token.type == TokenType.ORDINAL and remove_part
                    and tokens and tokens[-1].type == TokenType.PART):
                tokens = tokens[:-1]

            # re-add the previous article if it is followed by a symbol or by
            # nothing: in that case it was actually an ORDINAL (e.g. a series "A")
            if prev_article is not None:
                if token.type in [TokenType.SYMBOLS, TokenType.EOS]:
                    tokens.append(prev_article)
                prev_article = None

            # keep the token only if it actually contains something
            if token.type != TokenType.EOS and token.value != '':
                tokens.append(token)

            is_first = False

        # do not abbreviate a title that consists of a single word (as per Section 7.1.1)
        if len(tokens) == 1:
            result = tokens[0].value
        # a single word preceded by an initial preposition is not abbreviated either (as per Section 7.1.1)
        elif len(tokens) == 2 and tokens[0].type == TokenType.STOPWORD:
            result = '{} {}'.format(tokens[0].value, tokens[1].value)
        # a single word followed by a final symbol is not abbreviated either (as per Section 7.1.1?)
        elif len(tokens) == 2 and tokens[1].type == TokenType.SYMBOLS:
            result = '{}{}'.format(tokens[0].value, tokens[1].value)
        # otherwise, abbreviate WORD and PART according to LTWA
        else:
            is_hyphenated = False
            no_space = False
            next_position = 0
            ligatures_shift = 0

            for token in tokens:
                abbrv = token.value

                if token.type == TokenType.HYPHEN:
                    is_hyphenated = True
                elif token.type in [TokenType.WORD, TokenType.PART]:
                    if token.position >= next_position:
                        # abbreviate() returns the abbreviation and the length of
                        # the text it matched (a pattern may span several words)
                        abbrv, len_ = self.abbreviate(
                            title_normalized[token.position +
                                             ligatures_shift:], token.value,
                            title_soft_normalized[token.position:], langs)
                        next_position = token.position + len_
                    else:
                        # this word was already consumed by a multi-word pattern
                        abbrv = ''
                        no_space = True
                elif token.type == TokenType.SYMBOLS:  # HYPHEN is handled above
                    no_space = True

                result += '{}{}'.format(
                    ' ' if not (len(result) == 0 or is_hyphenated or no_space)
                    else '', abbrv)

                # unidecode expands ligatures (e.g. 'œ' -> 'oe'), which shifts
                # positions in title_normalized relative to the soft version
                ligatures_shift += number_of_ligatures(token.value)
                no_space = False
                if token.type != TokenType.HYPHEN:
                    is_hyphenated = False

        return result
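This `__call__` appears to belong to pyiso4's `Abbreviate` class; if so, its README constructs the abbreviator via `Abbreviate.create()`. Treat the import path and factory below as assumptions:

    # hypothetical usage; the expected output follows the LTWA
    from pyiso4.ltwa import Abbreviate

    abbreviator = Abbreviate.create()
    print(abbreviator('Journal of Physical Chemistry'))  # e.g. 'J. Phys. Chem.'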