Example #1
def endslike(s, l):
    """ Check if s ends with l, ignoring case, accents and trailing punctuation. """
    norm_s = unidecode.unidecode_expect_ascii(s).rstrip(
        string.punctuation).lower()
    norm_l = unidecode.unidecode_expect_ascii(l).lower()
    trunc = norm_s[:-len(norm_l)]
    # require a word boundary: the character just before the suffix must not be alphanumeric
    return (norm_s.endswith(norm_l)
            and ((not trunc) or (not trunc[-1].isalnum())))
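A minimal usage sketch, assuming endslike is exposed as a plain function with import string and import unidecode in scope:

import string
import unidecode

# the accented suffix matches at a word boundary...
print(endslike("Live in Zürich", "zurich"))   # True
# ...but not mid-word: the 'o' before "torhead" is alphanumeric
print(endslike("Motörhead", "torhead"))       # False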
Example #2
def sanitize_for_path(s):
  """ Sanitize a string to be FAT/NTFS friendly when used in file path. """
  s = s.translate(str.maketrans("/\\|*", "---x"))
  s = "".join(c for c in unidecode.unidecode_expect_ascii(s) if c in VALID_PATH_CHARS)
  s = s.strip()
  s = s.rstrip(".")  # this if for FAT on Android
  return s
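VALID_PATH_CHARS is defined elsewhere in that project; for illustration, a hypothetical run assuming a set like the one below:

import string
import unidecode

VALID_PATH_CHARS = frozenset(string.ascii_letters + string.digits + " !#$%&'()+,-.=@[]^_{}~")

print(sanitize_for_path("AC/DC – Back in Black?"))  # 'AC-DC - Back in Black'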
Example #3
def lclean(self, s):
  """ Remove garbage at the left of a string. """
  r = s.lstrip(__class__.LCLEAN_CHARS)
  # transliterate to ASCII, then strip again to catch non-ASCII junk
  return unidecode.unidecode_expect_ascii(r).lstrip(__class__.LCLEAN_CHARS)
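LCLEAN_CHARS is a class constant not shown here; assuming something like string.punctuation + string.whitespace, the second pass is what catches non-ASCII junk that the first lstrip leaves behind:

import string
import unidecode

LCLEAN_CHARS = string.punctuation + string.whitespace

r = "¡¡Hola!!".lstrip(LCLEAN_CHARS)                           # '¡¡Hola!!' (¡ is not ASCII punctuation)
c = unidecode.unidecode_expect_ascii(r).lstrip(LCLEAN_CHARS)  # '!!Hola!!' stripped to 'Hola!!'
print(c)  # 'Hola!!'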
Example #4
def sanitize_for_path(s):
  """ Sanitize a string to be FAT/NTFS friendly when used in file path. """
  s = s.translate(str.maketrans("/\\|*", "---x"))
  s = "".join(c for c in unidecode.unidecode_expect_ascii(s) if c in VALID_PATH_CHARS)
  s = s.strip()
  s = s.rstrip(".")  # this if for FAT on Android
  return s
Example #5
    def _pre_clean(s: str) -> str:
        """Minimal possible pre-clean

        Still, e.g., allowing nickname extraction.
        """
        s = str(s)
        s = unidecode_expect_ascii(s)
        s = s.lower()
        return s
Example #6
def _count(word_counter, text):
    text = preprocess_text(text, is_stem=False)
    words = text.split()
    for word in words:
        # to remove accented unicode chars
        word = unidecode.unidecode_expect_ascii(word)
        if word in STOPWORDS:
            continue
        word_counter[word] = word_counter[word] + 1
    print(word_counter)
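word_counter[word] + 1 presupposes a mapping with default values for unseen keys, so callers presumably pass a collections.defaultdict(int) (or a collections.Counter). A runnable sketch with stand-ins for the project's preprocess_text and STOPWORDS:

import collections
import unidecode

STOPWORDS = {"the", "a", "an"}        # stand-in for the project's stopword set

def preprocess_text(text, min_len=0, is_stem=False):
    return text.lower()               # stand-in for the project's preprocessing

word_counter = collections.defaultdict(int)
_count(word_counter, "The café, the CAFE")
# prints: defaultdict(<class 'int'>, {'cafe,': 1, 'cafe': 1})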
Example #7
def namesafe(argument):
    illegal = r'<.>\|?*:"/'
    legallist = []
    for character in argument:
        if character not in illegal:
            legallist.append(character)
        else:
            legallist.append('-')
    legal = ''.join(legallist)
    return unidecode.unidecode_expect_ascii(legal)
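For reference, a sample call, plus a behavior-equivalent one-pass variant using str.translate (namesafe_translate is an illustrative name, not part of the original):

import unidecode

print(namesafe('AC/DC: "Ride On"'))  # 'AC-DC- -Ride On-'

def namesafe_translate(argument):
    # hypothetical alternative: do the substitution in one pass
    table = str.maketrans({c: '-' for c in r'<.>\|?*:"/'})
    return unidecode.unidecode_expect_ascii(argument.translate(table))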
Example #8
def clean_name(raw, remove_spaces=False):
    """Clean each input name down to lowercase ascii letters and spaces"""
    working = raw
    # Strip unicode down to ascii (e.g. Ë becomes E; ñ becomes n)
    working = unidecode.unidecode_expect_ascii(working)
    # Make all lowercase
    working = working.lower()
    # Remove absolutely everything except the lowercase letters and spaces
    acceptable = string.ascii_lowercase
    if not remove_spaces:
        acceptable = acceptable + " "
    return "".join(c for c in working if c in acceptable)
Example #9
def issue_30_check_fingerprint(entry):
    """
    This helps ensure that we aren't losing random name parts.
    """
    name = Name(entry["raw"])
    rawish = unidecode_expect_ascii(entry["raw"]).lower()

    string_finger = fingerprint(str(name))
    raw_finger = fingerprint(rawish)
    alt_finger = fingerprint(rawish.replace("junior", "jr"))

    assert string_finger in (raw_finger, alt_finger)
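Name and fingerprint come from the package under test and aren't shown; fingerprint presumably behaves like the classic key-collision fingerprint (ASCII-fold, lowercase, drop punctuation, join the sorted set of tokens). A minimal sketch of that idea, not the library's actual implementation:

import string
from unidecode import unidecode_expect_ascii

def fingerprint(s):
    # key-collision fingerprint: word order and punctuation don't matter
    s = unidecode_expect_ascii(s).lower()
    s = s.translate(str.maketrans('', '', string.punctuation))
    return ' '.join(sorted(set(s.split())))

print(fingerprint("Dupont, Jean") == fingerprint("Jean DUPONT"))  # True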
Example #10
def _count(word_counter, text):
    """
    Updates dictionary with word->count
    """
    text = preprocess_text(text, min_len=0, is_stem=False)
    words = text.split()
    for word in words:
        # to remove accented unicode chars
        word = unidecode.unidecode_expect_ascii(word)
        if word in STOPWORDS:
            continue
        word_counter[word] = word_counter[word] + 1
    print(word_counter)
Example #11
def find_right_track(df, artist):
    '''
    Takes as arguments a dataframe generated by df_from_search and a string
    containing an artist name separated by single spaces.

    Returns the uri for the first row where the artist_name matches the artist
    argument passed to the function.

    Matching is case-insensitive: ampersands are replaced with "and" and
    unicode characters are converted to ASCII using the unidecode library.
    '''
    artist = unidecode.unidecode_expect_ascii(artist).lower()

    mapper = []

    for i in df.index:
        if unidecode.unidecode_expect_ascii(
                df.loc[i]["artist_name"]).lower().replace("&",
                                                          "and") == artist:
            mapper.append(i)

    try:
        return df.loc[mapper].reset_index(drop=True).loc[0, "uri"]
    except KeyError:
        # no matching row
        return pd.NA
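A toy run, assuming a dataframe shaped like the output of df_from_search (the uri values are invented):

import pandas as pd
import unidecode

df = pd.DataFrame({
    "artist_name": ["Simon & Garfunkel", "Sigur Rós"],
    "uri": ["spotify:track:simon1", "spotify:track:sigur1"],
})

print(find_right_track(df, "simon and garfunkel"))  # 'spotify:track:simon1'
print(find_right_track(df, "sigur ros"))            # 'spotify:track:sigur1'
print(find_right_track(df, "nobody"))               # <NA>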
Example #12
def isplural(word):
    """Checks if word is plural against WordNet."""
    if word.lower() in extra_plural:
        return True, match_case(extra_plural[word.lower()], word)

    import unidecode

    # Python 3 strings are already unicode; just fold accents to ASCII
    word_ = unidecode.unidecode_expect_ascii(word)

    lemma = str(wnl.lemmatize(word_.lower(), 'n'))
    plural = word.lower() != lemma
    lemma = match_case(lemma, word)

    return plural, lemma
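wnl, extra_plural and match_case are module-level names not shown here; one hypothetical context in which the function runs (match_case semantics assumed):

from nltk.stem import WordNetLemmatizer

wnl = WordNetLemmatizer()            # needs the WordNet corpus: nltk.download('wordnet')
extra_plural = {"people": "person"}  # irregulars WordNet handles poorly

def match_case(lemma, word):
    # hypothetical helper: mirror the casing of the original word
    return lemma.capitalize() if word[:1].isupper() else lemma

print(isplural("churches"))  # (True, 'church')
print(isplural("church"))    # (False, 'church')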
Example #13
def normalize(text: str):
    text = text.lower()
    text = unidecode.unidecode_expect_ascii(text)
    if text == 'none':
        return ''
    text = text.replace('___', '_')
    # collapse whitespace and repeated articles ("a a" -> "a", etc.)
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'\b(a )+', 'a ', text)
    text = re.sub(r'\b(an )+', 'an ', text)
    text = re.sub(r'\b(the )+', 'the ', text)
    # replace "person x"/"personx"/"x" placeholders with proper names
    text = re.sub(r'\b(person( |)|)x(?!-)\b', 'Alex', text)
    text = re.sub(r'\b(person( |)|)y(?!-)\b', 'Bob', text)
    text = re.sub(r'\b(person( |)|)z(?!-)\b', 'Charlie', text)
    if not text.endswith('.'):
        text += '.'
    return text
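The placeholder rewrites are easiest to see on an example; the input is lowercased first, so both the "PersonX" and "person y" forms match:

import re
import unidecode

print(normalize("PersonX gives person y a a gift"))
# 'Alex gives Bob a gift.'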
Example #14
def token_to_char_ids(token, max_len=MAX_WORD_LEN):
    # one-hot encode each character of the token into a (max_len, MAX_CHARS) matrix
    result = np.zeros((max_len, MAX_CHARS))
    token = unidecode.unidecode_expect_ascii(token).lower()

    for i in range(min(len(token), max_len)):
        ch = token[i]
        ch_num = ord(ch) - ord('a')
        dig_num = ord(ch) - ord('0')
        if 0 <= ch_num < LETTER_COUNT:
            # letters a-z occupy the first LETTER_COUNT columns
            result[i, ch_num] = 1.0
        elif 0 <= dig_num < DIGIT_COUNT:
            # digits 0-9 occupy the next DIGIT_COUNT columns
            result[i, dig_num + LETTER_COUNT] = 1.0
        elif ch in CHARS_MAP:
            # known punctuation gets its own column
            result[i, LETTER_COUNT + DIGIT_COUNT + CHARS_MAP[ch]] = 1.0
        else:
            # everything else lands in the last, "unknown" column
            result[i, -1] = 1.0

    return result
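The constants live elsewhere in that project; under one plausible configuration (values assumed here) the encoding looks like this:

import numpy as np
import unidecode

LETTER_COUNT = 26
DIGIT_COUNT = 10
CHARS_MAP = {'-': 0, "'": 1}                                  # assumed punctuation columns
MAX_CHARS = LETTER_COUNT + DIGIT_COUNT + len(CHARS_MAP) + 1   # +1 "unknown" column
MAX_WORD_LEN = 20

ids = token_to_char_ids("Čapek-3")
print(ids.shape)        # (20, 39)
print(ids[0].argmax())  # 2  -> 'c' (Č is transliterated to 'c')
print(ids[5].argmax())  # 36 -> '-' column (26 letters + 10 digits + CHARS_MAP['-'])
print(ids[6].argmax())  # 29 -> digit '3' (26 + 3)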
Example #15
def modify_filename(filename, prefix=None, suffix=None):
    filename = unidecode.unidecode_expect_ascii(filename)

    whitelist = '. _-'
    filename = re.sub(r'[^\w' + whitelist + ']', '---', filename)

    fname, fext = os.path.splitext(filename)

    # Windows API has limit of 260 characters
    fname = fname[:75]

    if prefix is not None:
        fname = '{}_{}'.format(prefix, fname)
    if suffix is not None:
        fname = '{}_{}'.format(fname, suffix)

    filename = '{}{}'.format(fname, fext)

    # Remove consecutive dots
    pattern = re.compile(r'\.{2,}')
    filename = pattern.sub('.', filename)

    return filename
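Two sample runs, assuming os, re and unidecode are imported at module level (the filenames are invented):

import os
import re
import unidecode

print(modify_filename("Čivava – fotka?.jpg", prefix="v2"))
# 'v2_Civava - fotka---.jpg'
print(modify_filename("draft..final.txt"))
# 'draft.final.txt'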
Example #16
def get_normalize_title(self, title):
    # transliterate, collapse whitespace and hyphens, then join with hyphens
    return (' '.join(
        unidecode.unidecode_expect_ascii(title).replace(
            "-", " ").split())).replace(" ", "-").replace("/", "-")
Example #17
def rmprefix(self, s, e):
  """ Remove string prefix. """
  # strip leading punctuation, then cut off len(ASCII-transliterated e) characters
  return s.lstrip(string.punctuation)[len(unidecode.unidecode_expect_ascii(e)):]
Example #18
def rmsuffix(self, s, e):
  """ Remove string suffix. """
  # strip trailing punctuation, then cut off len(ASCII-transliterated e) characters
  return s.rstrip(string.punctuation)[:-len(unidecode.unidecode_expect_ascii(e))]
Example #19
def lnorm(self, s):
  """ Transliterate to ASCII, strip leading punctuation, lowercase. """
  return unidecode.unidecode_expect_ascii(s).lstrip(string.punctuation).lower()
Example #20
def ignore_non_ascii(text: str) -> str:
    return unidecode_expect_ascii(text).lower()
Example #21
def rmprefix(s, e):
    return s.lstrip(
        string.punctuation)[len(unidecode.unidecode_expect_ascii(e)):]
Example #22
def rmsuffix(s, e):
    return s.rstrip(
        string.punctuation)[:-len(unidecode.unidecode_expect_ascii(e))]
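Both rmprefix and rmsuffix (here and in Examples #17/#18) cut by the length of the transliterated affix, which quietly assumes unidecode did not change its length; it can ("œ" becomes "oe"). With ASCII affixes the behavior is straightforward:

import string
import unidecode

print(rmprefix('"The Best of Youth', 'The'))  # ' Best of Youth'
print(rmsuffix('Best of Youth!!!', 'Youth'))  # 'Best of '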
Example #23
def lclean(s):
    r = s.lstrip(lclean_chars)
    # transliterate to ASCII, then strip again to catch non-ASCII junk
    return unidecode.unidecode_expect_ascii(r).lstrip(lclean_chars)
Example #24
def startslike(s, l):
    # case-, accent- and punctuation-insensitive prefix test
    return unidecode.unidecode_expect_ascii(s).lstrip(
        string.punctuation).lower().startswith(
            unidecode.unidecode_expect_ascii(l).rstrip(
                string.punctuation).lower())