def _fixup_range(start, end, mapping):
    """Build the text of a character range ``start-end`` plus extras.

    For every code point between `start` and `end` (both inclusive) whose
    character is a key of `mapping`, the escaped replacement characters
    from `mapping` are put in front of the escaped ``start-end`` range.

    Args:
        start: first code point of the range (an integer)
        end: last code point of the range (an integer, inclusive)
        mapping: maps a character to an iterable of strings which get
            joined and added to the result
    Returns:
        The text ``<extras><start>-<end>``; presumably meant to be placed
        inside a regex character class by the caller.
    """

    chars_in_range = (unichr(code) for code in xrange(start, end + 1))
    replacements = [
        re_escape(u"".join(mapping[char]))
        for char in chars_in_range
        if char in mapping
    ]

    first = re_escape(unichr(start))
    last = re_escape(unichr(end))
    return u"%s%s-%s" % ("".join(replacements), first, last)
def diacritic_for_letters(regenerate=False):
    """Returns a mapping from a sequence of combining diacritic marks to
    the base characters it can be combined with to form a single unicode
    character.

    Only characters from the Lu/Ll/Lt categories whose NFKD decomposition
    is a base character followed purely by combining marks are considered.
    Each value is a string containing the base characters in sorted order.

    Since this is quite expensive to compute, the cached version is
    returned unless `regenerate` is true. regenerate=True is used by the
    unit tests to validate the cache.
    """

    if not regenerate:
        return _DIACRITIC_CACHE

    letter_categories = ("Lu", "Ll", "Lt")
    bases_by_marks = {}
    for code in xrange(sys.maxunicode):
        char = unichr(code)
        decomposed = unicodedata.normalize("NFKD", char)
        # Nothing to learn from characters which don't decompose
        if len(decomposed) <= 1:
            continue
        if unicodedata.category(char) not in letter_categories:
            continue
        base, marks = decomposed[0], decomposed[1:]
        # Everything after the base character has to be a combining mark
        if not all(unicodedata.combining(mark) for mark in marks):
            continue
        bases_by_marks.setdefault(marks, set()).add(base)

    return {
        marks: u"".join(sorted(bases))
        for marks, bases in bases_by_marks.items()
    }
def _fixup_literal(literal, in_seq, mapping):
    """Return the escaped text for a literal code point and its
    alternatives.

    Args:
        literal: the code point of the literal (an integer)
        in_seq: true if the result ends up in an existing ``[...]``
            sequence, in which case no brackets get added
        mapping: maps a character to an iterable of strings which should
            be accepted in addition to the character itself
    Returns:
        The escaped character followed by its escaped alternatives. If
        there is more than one character and `in_seq` is false the result
        is wrapped in ``[...]``.
    """

    char = unichr(literal)
    candidates = char + u"".join(mapping[char]) if char in mapping else char
    escaped = re_escape(candidates)

    # The length has to be checked before escaping, since escaping can
    # make the text longer
    if len(candidates) > 1 and not in_seq:
        return u"[%s]" % escaped
    return escaped
def get_punctuation_mapping(regenerate=False):
    """This takes the unicode confusables set and extracts punctuation
    which looks similar to one or more ASCII punctuation.

    e.g. U+2019 (RIGHT SINGLE QUOTATION MARK) --> '

    Args:
        regenerate: if false (the default) the cached mapping is
            returned. If true the confusables file gets downloaded and
            parsed again (needs network access).
    Returns:
        A dict mapping a string of ASCII punctuation to a string
        containing all characters which are confusable with it.
    """

    if not regenerate:
        return _PUNCT_CONFUSABLES_CACHE

    # The helpers are defined up front because the post-processing step
    # below needs them even if not a single line got parsed.
    def to_uni(x):
        return unichr(int(x, 16))

    def is_ascii(char):
        try:
            char.encode("ascii")
        except UnicodeEncodeError:
            return False
        return True

    def is_punct(char):
        return unicodedata.category(char).startswith("P")

    h = urlopen(
        "http://www.unicode.org/Public/security/9.0.0/confusables.txt")
    try:
        data = h.read()
    finally:
        # Don't leak the connection, even if reading fails
        h.close()

    mapping = {}
    for line in data.decode("utf-8-sig").splitlines():
        line = line.strip()
        if not line or line.startswith(u"#"):
            continue

        # Only the first two fields are used: the source code point and
        # the code points it can be confused with
        char, repls = line.split(";", 2)[:2]
        char = to_uni(char.strip())
        repls = [to_uni(r) for r in repls.split()]

        if all(is_ascii(c) and is_punct(c) for c in repls) and char:
            repls = u"".join(repls)
            mapping[repls] = mapping.get(repls, u"") + char

    # if any of the equal chars is also ascii + punct we can replace
    # it as well.
    # Iterate over a snapshot since new keys get added in the loop, which
    # would otherwise raise a RuntimeError under Python 3.
    for uni in list(mapping.values()):
        also_ascii = [c for c in uni if is_ascii(c) and is_punct(c)]
        for c in also_ascii:
            mapping[c] = uni.replace(c, u"")

    return mapping
def get_decomps_mapping(regenerate=False):
    """This takes the decomps.txt file of the Unicode UCA and gives us
    the cases where a letter can be decomposed for collation and that
    mapping isn't in NFKD.

    Args:
        regenerate: if false (the default) the cached mapping is
            returned. If true decomps.txt gets downloaded and parsed
            again (needs network access).
    Returns:
        A dict mapping the letters of a decomposition to a string of all
        the letters which decompose to them.
    """

    if not regenerate:
        return _UCA_DECOMPS_CACHE

    def to_uni(x):
        return unichr(int(x, 16))

    def is_letter(x):
        return unicodedata.category(x) in ("Lu", "Ll", "Lt")

    h = urlopen("http://unicode.org/Public/UCA/8.0.0/decomps.txt")
    try:
        data = h.read()
    finally:
        # Don't leak the connection, even if reading fails
        h.close()

    mapping = {}
    # The response is bytes; decode it so the text operations below work
    # the same under Python 2 and 3.
    # NOTE(review): assumes the file is UTF-8 (optionally with a BOM),
    # like the other Unicode data file fetched in this module - confirm
    for line in data.decode("utf-8-sig").splitlines():
        line = line.strip()
        if not line or line.startswith(u"#"):
            continue

        # Layout used here: "<code point>;<tag>;<decomposition> # comment"
        cp, rest = line.split(u";", 1)
        _tag, rest = rest.split(u";", 1)
        decomp = [to_uni(x) for x in rest.split(u"#", 1)[0].split()]
        cp = to_uni(cp)
        if not is_letter(cp):
            continue

        simple = u"".join(c for c in decomp if is_letter(c))
        if not simple:
            continue

        # skip anything we get from normalization
        if unicodedata.normalize("NFKD", cp)[0] == simple:
            continue

        mapping[simple] = mapping.get(simple, u"") + cp

    return mapping
def _fixup_not_literal(literal, mapping):
    """Return a negated character class for a literal code point.

    Args:
        literal: the code point of the literal (an integer)
        mapping: maps a character to an iterable of strings which should
            be excluded in addition to the character itself
    Returns:
        ``[^...]`` containing the escaped character and its escaped
        alternatives from `mapping` (if there are any).
    """

    char = unichr(literal)
    excluded = char + u"".join(mapping.get(char, []))
    escaped = re_escape(excluded)
    return u"[^%s]" % u"".join(escaped)
def _remove_punctuation_trans():
    """Build a table mapping every Unicode punctuation code point to None.

    A code point counts as punctuation if its Unicode category starts
    with "P". All code points below `sys.maxunicode` are checked.
    Mapping a code point to None is how a translate() table marks a
    character for deletion.
    """

    return {
        code: None
        for code in xrange(sys.maxunicode)
        if unicodedata.category(unichr(code)).startswith('P')
    }
def _remove_punctuation_trans():
    """Build a table mapping every Unicode punctuation code point to None.

    A code point counts as punctuation if its Unicode category starts
    with "P". All code points below `sys.maxunicode` are checked.
    Mapping a code point to None is how a translate() table marks a
    character for deletion.
    """

    return {
        code: None
        for code in xrange(sys.maxunicode)
        if unicodedata.category(unichr(code)).startswith('P')
    }
class Duplicates(SongsMenuPlugin, PluginConfigMixin):
    """Songs menu plugin which groups songs by a configurable key and
    displays the groups containing more than one song in a dialog.

    The key is built from a tag expression and can optionally be
    normalised (diacritics, case, punctuation, whitespace) before songs
    get compared.
    """

    PLUGIN_ID = 'Duplicates'
    PLUGIN_NAME = _('Duplicates Browser')
    PLUGIN_DESC = _('Finds and displays similarly tagged versions of songs.')
    PLUGIN_ICON = Icons.EDIT_SELECT_ALL

    # Groups with fewer songs than this are not displayed
    MIN_GROUP_SIZE = 2

    # Config key and fallback value for the grouping expression
    _CFG_KEY_KEY = "key_expression"
    __DEFAULT_KEY_VALUE = "~artist~title~version"

    # Config keys of the boolean matching options
    _CFG_REMOVE_WHITESPACE = 'remove_whitespace'
    _CFG_REMOVE_DIACRITICS = 'remove_diacritics'
    _CFG_REMOVE_PUNCTUATION = 'remove_punctuation'
    _CFG_CASE_INSENSITIVE = 'case_insensitive'

    plugin_handles = any_song(is_finite)

    # Cached values
    key_expression = None
    __cfg_cache = {}

    # Translation table mapping all Unicode punctuation code points to
    # None, so that translate() removes them. Built once, when the class
    # body gets executed.
    __remove_punctuation_trans = tbl = {
        code: None
        for code in xrange(sys.maxunicode)
        if unicodedata.category(unichr(code)).startswith('P')
    }

    @classmethod
    def get_key_expression(cls):
        """Returns the grouping expression, reading it from the config
        (and caching it on the class) if there is no cached value.
        """

        expression = cls.key_expression
        if not expression:
            expression = cls.config_get(cls._CFG_KEY_KEY,
                                        cls.__DEFAULT_KEY_VALUE)
            cls.key_expression = expression
        return expression

    @classmethod
    def PluginPreferences(cls, window):
        """Builds and returns the preferences widget: an entry for the
        grouping expression and one check button per matching option.

        `window` is not used.
        """

        def on_key_changed(entry):
            # Drop the cached expression so it gets read from the config
            # again the next time it is needed
            cls.key_expression = None
            cls.config_set(cls._CFG_KEY_KEY, entry.get_text().strip())

        container = Gtk.VBox(spacing=10)
        container.set_border_width(0)

        key_row = Gtk.HBox(spacing=6)
        # TODO: construct a decent validator and use ValidatingEntry
        key_entry = UndoEntry()
        key_entry.set_text(cls.get_key_expression())
        key_entry.connect("changed", on_key_changed)
        key_entry.set_tooltip_markup(
            _("Accepts QL tag expressions like "
              "<tt>~artist~title</tt> or <tt>musicbrainz_track_id</tt>"))
        key_label = Gtk.Label(label=_("_Group duplicates by:"))
        key_label.set_mnemonic_widget(key_entry)
        key_label.set_use_underline(True)
        key_row.pack_start(key_label, False, True, 0)
        key_row.pack_start(key_entry, True, True, 0)
        key_frame = qltk.Frame(label=_("Duplicate Key"), child=key_row)
        container.pack_start(key_frame, True, True, 0)

        # Matching Option
        options = [
            (cls._CFG_REMOVE_WHITESPACE, _("Remove _Whitespace")),
            (cls._CFG_REMOVE_DIACRITICS, _("Remove _Diacritics")),
            (cls._CFG_REMOVE_PUNCTUATION, _("Remove _Punctuation")),
            (cls._CFG_CASE_INSENSITIVE, _("Case _Insensitive")),
        ]
        options_box = Gtk.VBox(spacing=6)
        for option, text in options:
            button = ConfigCheckButton(
                text, 'plugins', cls._config_key(option))
            button.set_active(cls.config_get_bool(option))
            options_box.pack_start(button, True, True, 0)

        options_frame = qltk.Frame(
            label=_("Matching options"), child=options_box)
        container.pack_start(options_frame, False, True, 0)

        container.show_all()
        return container

    @staticmethod
    def remove_accents(s):
        """Returns `s` as text, NFKD normalised and with all combining
        characters removed.
        """

        decomposed = unicodedata.normalize('NFKD', text_type(s))
        kept = [ch for ch in decomposed if not unicodedata.combining(ch)]
        return "".join(kept)

    @classmethod
    def get_key(cls, song):
        """Returns the grouping key for `song`.

        The configured expression gets evaluated by calling `song` with
        it. Afterwards every enabled matching option is applied, always
        in the same order: diacritics, case, punctuation, whitespace.
        """

        # Note that the whitespace option doesn't drop whitespace, it
        # replaces each run of whitespace with a single underscore.
        normalisers = (
            (cls._CFG_REMOVE_DIACRITICS, cls.remove_accents),
            (cls._CFG_CASE_INSENSITIVE, lambda text: text.lower()),
            (cls._CFG_REMOVE_PUNCTUATION,
             lambda text: text.translate(cls.__remove_punctuation_trans)),
            (cls._CFG_REMOVE_WHITESPACE,
             lambda text: "_".join(text.split())),
        )

        key = song(cls.get_key_expression())
        for option, normalise in normalisers:
            if cls.config_get_bool(option):
                key = normalise(key)
        return key

    def plugin_songs(self, songs):
        """Groups `songs` by key, adds all library songs which share a
        key with one of them and shows the groups containing at least
        MIN_GROUP_SIZE songs in a DuplicateDialog.
        """

        model = DuplicatesTreeModel()
        self.__cfg_cache = {}

        # Index all songs by our custom key
        # TODO: make this cache-friendly
        print_d("Calculating duplicates for %d song(s)..." % len(songs))
        groups = {}
        for song in songs:
            key = self.get_key(song)
            # Songs which produce an empty key can't be grouped
            if not key:
                continue
            # NOTE(review): song._song presumably is the library song
            # wrapped by the plugin's song wrapper - confirm
            if key in groups:
                print_d("Found duplicate based on '%s'" % key)
                groups[key].add(song._song)
            else:
                groups[key] = {song._song}

        # Pull in everything from the library which matches a known key
        for song in app.library:
            key = self.get_key(song)
            if key in groups:
                groups[key].add(song)

        # Now display the grouped duplicates
        for key, children in groups.items():
            if len(children) >= self.MIN_GROUP_SIZE:
                # The parent (group) label
                model.add_group(key, children)

        dialog = DuplicateDialog(model)
        dialog.show()