def test_normalize_extra(self):
    """Check HARD-level normalization: lowercase + diacritics and punctuation folded."""
    cases = (
        ('TeSt', 'test'),
        ('Côte-d\'Azur', 'cote d azur'),
    )
    for raw, expected in cases:
        self.assertEqual(expected, normalize(raw, Level.HARD))
def match_capitalization_and_diacritic(abbrv: str, original: str) -> str:
    """Match the capitalization and diacritics of the `original` word,
    as long as they are similar.

    Each character of the soft-normalized abbreviation is compared (via
    unidecode) against the character at the same position in `original`;
    when they are the same letter modulo case/diacritics, the original
    character is kept instead.
    """
    restored = []
    for position, char in enumerate(normalize(abbrv, Level.SOFT)):
        reference = unidecode(original[position])
        # same base letter (either case) -> take the original form verbatim
        if unidecode(char) in (reference.lower(), reference.upper()):
            restored.append(original[position])
        else:
            restored.append(char)
    return ''.join(restored)
def test_normalize(self):
    """Check NORMAL-level normalization: diacritics/ligatures folded, case and punctuation kept."""
    cases = (
        ('test', 'test'),
        ('abbréviation', 'abbreviation'),
        ('ačaruli', 'acaruli'),
        ('għall', 'ghall'),
        ('chrysopœia', 'chrysopoeia'),
        ('Côte-d\'Azur', 'Cote-d\'Azur'),
    )
    for raw, expected in cases:
        self.assertEqual(expected, normalize(raw, Level.NORMAL))
def normalize(inp: str) -> str:
    """Normalization for the patterns (and keys): unidecode + lowering.

    NOTE(review): the inner `normalize` call resolves to the module-level
    two-argument `normalize(inp, level)` function, not to this one —
    presumably this is a staticmethod on a class (see `Pattern.normalize`
    usage elsewhere); confirm against the enclosing class.
    """
    folded = normalize(inp, Level.NORMAL)
    return folded.lower()
def __call__(self, title: str, remove_part: bool = True, langs: List[str] = None) -> str:
    """Abbreviate a title according to the rules of Section 7 in the ISSN manual
    (https://www.issn.org/understanding-the-issn/assignment-rules/issn-manual/)

    :param title: the full journal title to abbreviate
    :param remove_part: drop a PART token that precedes an ORDINAL (Section 7.1.11 is unclear)
    :param langs: optional list of language codes forwarded to `self.abbreviate`
    :return: the abbreviated title as a string

    TODO:
    - Section 7.1.2 (one word + qualifying information)
    - Section 7.1.3 (one word + supplement)
    - Section 7.1.7 (keep prepositions in expressions like "in vivo" and keep
      place/personal name intact, such as "Los Alamos")
    - Section 7.1.8 (acronyms and initialism)
    - Section 7.1.11 is unclear on whether PART should be kept or not
    """
    result = ''
    is_first = True
    # two normalizations of the same title: SOFT keeps case/diacritics for output,
    # Pattern.normalize (unidecode + lower) is what the LTWA patterns are matched against
    title_soft_normalized = normalize(title, Level.SOFT)
    title_normalized = Pattern.normalize(title)
    lexer = Lexer(title_soft_normalized, self.stopwords)
    tokens = []
    # article held back until we know what follows it (see below)
    prev_article = None
    # filter tokens
    for token in lexer.tokenize():
        # Remove all articles, as per Section 7.1.7
        if token.type == TokenType.ARTICLE:
            prev_article = token
            continue
        # Remove stopwords, except if it is first, as per Section 7.1.7
        elif token.type == TokenType.STOPWORD and not is_first:
            continue
        elif token.type == TokenType.SYMBOLS:
            # Omit comma, replace point by comma, as per Section 7.1.6 (also remove
            # ellipsis: '...' -> ',,,' after the point substitution, then dropped)
            token.value = token.value.replace(',', '').replace(
                '.', ',').replace(',,,', '')
            # remove & and + when they are used as "and", as per Section 7.1.10
            # NOTE(review): only '&' is actually checked here; '+' is not — confirm intent
            if token.value == '&':
                continue
        # remove part, as suggested per Section 7.1.11 (but keep that optional, since the rule is unclear)
        # NOTE(review): `tokens[-1]` assumes at least one token was kept before an
        # ORDINAL; an ORDINAL-first title would raise IndexError — confirm lexer guarantees
        elif token.type == TokenType.ORDINAL and tokens[
                -1].type == TokenType.PART and remove_part:
            tokens = tokens[:-1]
        # add previous article if followed by a symbol or nothing (was actually an ORDINAL!)
        if prev_article is not None:
            if token.type in [TokenType.SYMBOLS, TokenType.EOS]:
                tokens.append(prev_article)
            prev_article = None
        # keep the token only it contains something
        if token.type != TokenType.EOS and token.value != '':
            tokens.append(token)
            is_first = False
    # do not abbreviate title which consists of one word (as per Section 7.1.1)
    if len(tokens) == 1:
        result = tokens[0].value
    # when the title is one word with an initial preposition, it is not abbreviated (as per Section 7.1.1)
    elif len(tokens) == 2 and tokens[0].type == TokenType.STOPWORD:
        result = '{} {}'.format(tokens[0].value, tokens[1].value)
    # when the title is one word and a final symbol, it is not abbreviated (as per Section 7.1.1?)
    elif len(tokens) == 2 and tokens[1].type == TokenType.SYMBOLS:
        result = '{}{}'.format(tokens[0].value, tokens[1].value)
    # otherwise, abbreviate WORD and PART according to LTWA
    else:
        is_hyphenated = False       # suppress the space after a hyphen
        no_space = False            # suppress the space before symbols / swallowed words
        next_position = 0           # title offset already consumed by a multi-word pattern
        ligatures_shift = 0         # unidecode expands ligatures (e.g. œ -> oe), shifting offsets
        for token in tokens:
            abbrv = token.value
            if token.type == TokenType.HYPHEN:
                is_hyphenated = True
            elif token.type in [TokenType.WORD, TokenType.PART]:
                if token.position >= next_position:
                    abbrv, len_ = self.abbreviate(
                        title_normalized[token.position + ligatures_shift:],
                        token.value,
                        title_soft_normalized[token.position:], langs)
                    next_position = token.position + len_
                else:
                    # word already covered by a previous multi-word abbreviation: drop it
                    abbrv = ''
                    no_space = True
            # NOTE(review): HYPHEN here is unreachable — it is caught by the first
            # branch above; only SYMBOLS ever reaches this arm
            elif token.type in [TokenType.SYMBOLS, TokenType.HYPHEN]:
                no_space = True
            result += '{}{}'.format(
                ' ' if not (len(result) == 0 or is_hyphenated or no_space)
                else '', abbrv)
            ligatures_shift += number_of_ligatures(token.value)
            no_space = False
            if token.type != TokenType.HYPHEN:
                is_hyphenated = False
    return result