def endslike(s, l):
    """Return True if `s` ends with `l` at a word boundary, ignoring case,
    accents, and trailing punctuation on `s`.

    Both strings are transliterated to ASCII and lowercased; `s` is
    additionally stripped of trailing punctuation. A match is rejected when
    the character immediately before the suffix is alphanumeric (the suffix
    would split a word).
    """
    norm_s = unidecode.unidecode_expect_ascii(s).rstrip(string.punctuation).lower()
    norm_l = unidecode.unidecode_expect_ascii(l).lower()
    if not norm_s.endswith(norm_l):
        return False
    # Text remaining before the suffix. Reused below instead of re-slicing
    # (the original recomputed norm_s[:-len(norm_l)] a second time).
    # Note: when norm_l is empty, trunc is '' and the result is True,
    # matching the original behavior.
    trunc = norm_s[:-len(norm_l)]
    return (not trunc) or (not trunc[-1].isalnum())
def sanitize_for_path(s):
    """ Sanitize a string to be FAT/NTFS friendly when used in file path. """
    # Map path separators and wildcards to harmless replacements.
    translation = str.maketrans("/\\|*", "---x")
    ascii_form = unidecode.unidecode_expect_ascii(s.translate(translation))
    kept = "".join(c for c in ascii_form if c in VALID_PATH_CHARS)
    # Trailing dots are rejected by FAT on Android.
    return kept.strip().rstrip(".")
def lclean(self, s):
    """ Remove garbage at left of string.

    Strips unwanted leading characters, then transliterates to ASCII and
    strips again (transliteration can expose new leading garbage).
    """
    r = s.lstrip(__class__.LCLEAN_CHARS)
    # The original compared the ASCII result against `r` and kept `r` when
    # they were equal; equal strings are interchangeable, so returning the
    # ASCII form unconditionally is behaviorally identical and simpler.
    return unidecode.unidecode_expect_ascii(r).lstrip(__class__.LCLEAN_CHARS)
def _pre_clean(s: str) -> str:
    """Minimal possible pre-clean

    Still, e.g., allowing nickname extraction.
    """
    # Coerce to str, transliterate to ASCII, lowercase — in one pass.
    return unidecode_expect_ascii(str(s)).lower()
def _count(word_counter, text):
    """Tokenize `text` and update `word_counter` with per-word counts.

    Each word is transliterated to ASCII; stop words are skipped.
    """
    text = preprocess_text(text, is_stem=False)
    for word in text.split():
        # to remove accented unicode chars
        word = unidecode.unidecode_expect_ascii(word)
        if word in STOPWORDS:
            continue
        # .get() avoids a KeyError on words not seen before (the original
        # indexed directly, which only works with a defaultdict/Counter).
        word_counter[word] = word_counter.get(word, 0) + 1
    # NOTE(review): debug output — placement after the loop assumed from the
    # collapsed source; consider replacing with logging.
    print(word_counter)
def namesafe(argument):
    """Replace filesystem-unsafe characters with dashes and ASCII-fold."""
    illegal = r'<.>\|?*:"/'
    # Substitute every illegal character with a dash in a single pass.
    legal = ''.join('-' if ch in illegal else ch for ch in argument)
    return unidecode.unidecode_expect_ascii(legal)
def clean_name(raw, remove_spaces=False):
    """Clean each input name down to lowercase ascii letters and spaces"""
    # Strip unicode down to ascii (e.g. Ë becomes E; ñ becomes n) and lowercase.
    lowered = unidecode.unidecode_expect_ascii(raw).lower()
    # Keep only lowercase letters, plus spaces unless asked to drop them.
    keep = string.ascii_lowercase if remove_spaces else string.ascii_lowercase + " "
    return "".join(ch for ch in lowered if ch in keep)
def issue_30_check_fingerprint(entry):
    """ This helps ensure that we aren't losing random name parts. """
    raw = entry["raw"]
    rawish = unidecode_expect_ascii(raw).lower()
    parsed_finger = fingerprint(str(Name(raw)))
    # Accept either the literal raw fingerprint or one with "junior"
    # normalized to "jr".
    candidates = (fingerprint(rawish), fingerprint(rawish.replace("junior", "jr")))
    assert parsed_finger in candidates
def _count(word_counter, text):
    """ Updates dictionary with word->count """
    text = preprocess_text(text, min_len=0, is_stem=False)
    for word in text.split():
        # to remove accented unicode chars
        word = unidecode.unidecode_expect_ascii(word)
        if word in STOPWORDS:
            continue
        # .get() avoids a KeyError on words not seen before (the original
        # indexed directly, which only works with a defaultdict/Counter).
        word_counter[word] = word_counter.get(word, 0) + 1
    # NOTE(review): debug output — placement after the loop assumed from the
    # collapsed source; consider replacing with logging.
    print(word_counter)
def find_right_track(df, artist):
    '''
    This function takes as arguments a dataframe generated by df_from_search
    and a string containing an artist name separated by single spaces. It
    returns the uri for the first row where the artist_name matches the
    artist argument passed to the function. It is case-insensitive.
    Ampersands are replaced with "and" and unicode characters are converted
    to ASCII using the unidecode library.
    '''
    # Normalize BOTH sides identically: ASCII, lowercase, "&" -> "and".
    # (The original replaced "&" only on the dataframe side, so an `artist`
    # argument containing "&" could never match — contrary to the docstring.)
    artist = unidecode.unidecode_expect_ascii(artist).lower().replace("&", "and")
    matches = [
        i for i in df.index
        if unidecode.unidecode_expect_ascii(
            df.loc[i]["artist_name"]).lower().replace("&", "and") == artist
    ]
    try:
        return df.loc[matches].reset_index(drop=True).loc[0, "uri"]
    except (KeyError, IndexError):
        # No matching row: the empty selection has no row 0.
        return pd.NA
def isplural(word):
    """Checks if word is plural against WordNet.

    Returns (is_plural, lemma) where lemma's case matches `word`.
    """
    if word.lower() in extra_plural:
        return True, match_case(extra_plural[word.lower()], word)
    import unidecode
    # `word` is already a str on Python 3; the old `unicode(word, 'utf8')`
    # call was a Python-2 leftover that raised NameError, so it is dropped.
    word_ = unidecode.unidecode_expect_ascii(word)
    lemma = str(wnl.lemmatize(word_.lower(), 'n'))
    # Plural iff lemmatization changed the word.
    plural = word.lower() != lemma
    lemma = match_case(lemma, word)
    return plural, lemma
def normalize(text: str):
    """Normalize free text: ASCII-fold, collapse whitespace, deduplicate
    repeated articles, map placeholder persons x/y/z to proper names, and
    ensure the result ends with a period. Returns '' for the text 'none'.
    """
    text = unidecode.unidecode_expect_ascii(text.lower())
    if text == 'none':
        return ''
    text = text.replace('___', '_')
    # Ordered substitution rules; order matters (whitespace collapse first).
    rules = (
        (r'\s+', ' '),
        (r'\b(a )+', 'a '),
        (r'\b(an )+', 'an '),
        (r'\b(the )+', 'the '),
        (r'\b(person( |)|)x(?!-)\b', 'Alex'),
        (r'\b(person( |)|)y(?!-)\b', 'Bob'),
        (r'\b(person( |)|)z(?!-)\b', 'Charlie'),
    )
    for pattern, replacement in rules:
        text = re.sub(pattern, replacement, text)
    return text if text.endswith('.') else text + '.'
def token_to_char_ids(token, max_len=MAX_WORD_LEN):
    """Encode `token` as a one-hot (max_len, MAX_CHARS) character matrix.

    Columns: letters, then digits, then CHARS_MAP symbols; the last column
    is shared by all unknown characters.
    """
    encoded = np.zeros((max_len, MAX_CHARS))
    token = unidecode.unidecode_expect_ascii(token).lower()
    for pos, ch in enumerate(token[:max_len]):
        letter_idx = ord(ch) - ord('a')
        digit_idx = ord(ch) - ord('0')
        if 0 <= letter_idx < LETTER_COUNT:
            col = letter_idx
        elif 0 <= digit_idx < DIGIT_COUNT:
            col = LETTER_COUNT + digit_idx
        elif ch in CHARS_MAP:
            col = LETTER_COUNT + DIGIT_COUNT + CHARS_MAP[ch]
        else:
            col = -1  # unknown characters all share the final column
        encoded[pos, col] = 1.0
    return encoded
def modify_filename(filename, prefix=None, suffix=None):
    """Make `filename` filesystem-safe and optionally attach prefix/suffix."""
    safe = unidecode.unidecode_expect_ascii(filename)
    # Anything outside word characters and '. _-' becomes '---'.
    safe = re.sub(r'[^\w. _-]', '---', safe)
    stem, ext = os.path.splitext(safe)
    # Windows API has limit of 260 characters
    stem = stem[:75]
    if prefix is not None:
        stem = '{}_{}'.format(prefix, stem)
    if suffix is not None:
        stem = '{}_{}'.format(stem, suffix)
    rebuilt = '{}{}'.format(stem, ext)
    # Collapse consecutive dots into a single dot.
    return re.compile(r'\.{2,}').sub('.', rebuilt)
def get_normalize_title(self, title):
    """Slugify `title`: ASCII-fold, split on whitespace and dashes, rejoin
    with single dashes, and replace slashes with dashes."""
    ascii_title = unidecode.unidecode_expect_ascii(title).replace("-", " ")
    collapsed = ' '.join(ascii_title.split())
    return collapsed.replace(" ", "-").replace("/", "-")
def rmprefix(self, s, e):
    """ Remove string prefix. """
    # Prefix length is measured on the ASCII transliteration of `e`.
    cut = len(unidecode.unidecode_expect_ascii(e))
    return s.lstrip(string.punctuation)[cut:]
def rmsuffix(self, s, e):
    """ Remove string suffix.

    Suffix length is measured on the ASCII transliteration of `e`.
    """
    stripped = s.rstrip(string.punctuation)
    n = len(unidecode.unidecode_expect_ascii(e))
    # Guard n == 0: `stripped[:-0]` would wrongly return the empty string
    # instead of leaving `stripped` unchanged.
    return stripped[:-n] if n else stripped
def lnorm(self, s):
    """Normalize for comparison: ASCII-fold, drop leading punctuation, lowercase."""
    ascii_s = unidecode.unidecode_expect_ascii(s)
    return ascii_s.lstrip(string.punctuation).lower()
def ignore_non_ascii(text: str) -> str:
    """Transliterate `text` to plain ASCII and return it lowercased."""
    ascii_text = unidecode_expect_ascii(text)
    return ascii_text.lower()
def rmprefix(s, e):
    """Remove a prefix from `s` after dropping its leading punctuation;
    the prefix length is measured on the ASCII transliteration of `e`."""
    cut = len(unidecode.unidecode_expect_ascii(e))
    return s.lstrip(string.punctuation)[cut:]
def rmsuffix(s, e):
    """Remove a suffix from `s` after dropping its trailing punctuation;
    the suffix length is measured on the ASCII transliteration of `e`."""
    stripped = s.rstrip(string.punctuation)
    n = len(unidecode.unidecode_expect_ascii(e))
    # Guard n == 0: `stripped[:-0]` would wrongly return the empty string
    # instead of leaving `stripped` unchanged.
    return stripped[:-n] if n else stripped
def lclean(s):
    """Remove garbage at the left of `s`.

    Strips unwanted leading characters, then transliterates to ASCII and
    strips again (transliteration can expose new leading garbage).
    """
    r = s.lstrip(lclean_chars)
    # The original compared the ASCII result against `r` and kept `r` when
    # they were equal; equal strings are interchangeable, so returning the
    # ASCII form unconditionally is behaviorally identical and simpler.
    return unidecode.unidecode_expect_ascii(r).lstrip(lclean_chars)
def startslike(s, l):
    """True when ASCII-folded, lowercased `s` (without leading punctuation)
    starts with ASCII-folded, lowercased `l` (without trailing punctuation)."""
    haystack = unidecode.unidecode_expect_ascii(s).lstrip(string.punctuation).lower()
    needle = unidecode.unidecode_expect_ascii(l).rstrip(string.punctuation).lower()
    return haystack.startswith(needle)