예제 #1
0
    def regex_or_list_maker(verb_list):
        """Expand verbs into their inflections, spellings and contractions.

        Every entry of ``verb_list`` is first widened with the counterpart
        spelling found in ``usa_convert`` (looked up in both directions). Each
        resulting word is inflected with ``pattern.en.lexeme``; "n't" and
        " not" are stripped from the returned forms. ``be`` and ``have`` also
        contribute their contracted forms. Finally the collected forms are
        widened with counterpart spellings once more.

        Returns the sorted, de-duplicated list of forms when the ``regex``
        flag of the enclosing scope is falsy, otherwise a case-insensitive,
        word-bounded alternation pattern built from that list.
        """
        from dictionaries.word_transforms import usa_convert
        from pattern.en import lexeme
        uk_convert = {v: k for k, v in usa_convert.items()}

        def with_both_spellings(words):
            """Return ``words`` plus their counterpart spellings as a set."""
            # materialise once so that any iterable is accepted
            words = list(words)
            variants = set(words)
            for table in (usa_convert, uk_convert):
                # test the dict itself: ``.keys()`` builds a list on Python 2
                variants.update(table[w] for w in words if w in table)
            return variants

        contractions = {
            'be': [r"'m", r"'re", r"'s"],
            'have': [r"'d", r"'s", r"'ve"],
        }

        verbforms = set()
        for w in sorted(with_both_spellings(verb_list)):
            verbforms.update(
                form.replace("n't", "").replace(" not", "")
                for form in lexeme(w))
            verbforms.update(contractions.get(w, ()))

        verbforms = sorted(with_both_spellings(verbforms))

        # Ensure text: decode byte strings. On Python 2 this is equivalent to
        # ``unicode(w, 'utf-8', errors='ignore')``; on Python 3, where the
        # ``unicode`` name does not exist, ``str`` values pass through as-is.
        verbforms = [
            w.decode('utf-8', 'ignore') if isinstance(w, bytes) else w
            for w in verbforms
        ]

        if not regex:
            return verbforms
        # NOTE(review): forms are joined unescaped, exactly as before
        return r'(?i)\b(' + r'|'.join(verbforms) + r')\b'
예제 #2
0
def get_both_spellings(verb_list):
    """Return ``verb_list`` extended with alternative spellings.

    Each word is looked up in ``usa_convert`` and in its inverse mapping;
    whenever a counterpart exists it is added to the result.

    :param verb_list: iterable of words
    :returns: sorted list of the unique original words and counterparts
    """
    from dictionaries.word_transforms import usa_convert
    uk_convert = {v: k for k, v in usa_convert.items()}
    # materialise once so tuples, sets and generators are accepted as well
    words = list(verb_list)
    variants = set(words)
    for table in (usa_convert, uk_convert):
        # membership on the dict itself is a hash lookup; ``.keys()`` would
        # build (and linearly scan) a list on Python 2
        variants.update(table[w] for w in words if w in table)
    return sorted(variants)
예제 #3
0
def get_both_spellings(verb_list):
    """Add alternative spellings to ``verb_list``.

    A word gains a counterpart when it appears as a key of ``usa_convert``
    or as a key of the inverted mapping. Only the words passed in are looked
    up; counterparts are not expanded again.

    :param verb_list: iterable of words
    :returns: new sorted list without duplicates; the input is not modified
    """
    from dictionaries.word_transforms import usa_convert
    uk_convert = {v: k for k, v in usa_convert.items()}
    # copy into a list so that non-list iterables can be concatenated below
    originals = list(verb_list)
    # look words up in the dicts directly instead of scanning ``.keys()``
    forward = [usa_convert[w] for w in originals if w in usa_convert]
    backward = [uk_convert[w] for w in originals if w in uk_convert]
    return sorted(set(originals + forward + backward))
예제 #4
0
파일: editor.py 프로젝트: muranava/corpkit
 def convert_spell(df, convert_to = 'US', print_info = print_info):
     """Rename the columns of ``df`` to the requested spelling.

     Column labels found in ``usa_convert`` are replaced by the mapped
     value; with ``convert_to == 'UK'`` the mapping is inverted first.
     Labels without an entry are kept unchanged.

     :param df: object with an assignable ``columns`` attribute; it is
         modified in place
     :param convert_to: ``'UK'`` inverts the mapping, any other value uses
         ``usa_convert`` as it is
     :param print_info: when truthy, a progress message is printed
     :returns: the same ``df`` object
     """
     from dictionaries.word_transforms import usa_convert
     if print_info:
         # call form prints the same text on Python 2 and Python 3
         print('Converting spelling ... \n')
     if convert_to == 'UK':
         usa_convert = {v: k for k, v in usa_convert.items()}
     # ``get`` with a default keeps unmapped labels without a bare
     # ``except`` that would also hide unrelated errors
     df.columns = [usa_convert.get(val, val) for val in list(df.columns)]
     return df
예제 #5
0
 def convert_spell(df, convert_to='US', print_info=print_info):
     """Turn the column labels of a dataframe into US or UK spelling.

     Labels present in the spelling table are swapped for their mapped
     value; all other labels are left as they are.

     :param df: dataframe whose ``columns`` are rewritten in place
     :param convert_to: ``'UK'`` uses the inverse of ``usa_convert``; any
         other value uses ``usa_convert`` directly
     :param print_info: print a progress message when truthy
     :returns: ``df`` itself, after its columns were reassigned
     """
     from dictionaries.word_transforms import usa_convert
     if print_info:
         print('Converting spelling ... \n')
     mapping = usa_convert
     if convert_to == 'UK':
         mapping = {v: k for k, v in usa_convert.items()}
     # a missing label simply maps to itself; no exception handling needed,
     # so genuine errors are no longer silenced by a bare ``except``
     df.columns = [mapping.get(label, label) for label in df.columns]
     return df
예제 #6
0
 def correct_spelling(a_string):
     """Respell each '/'-separated piece of ``a_string``.

     Reads ``spelling`` and ``preserve_case`` from the enclosing scope.
     When ``spelling`` is falsy the input is returned untouched. Otherwise
     every piece is looked up (lower-cased) in ``usa_convert`` - inverted
     when ``spelling`` is 'uk' in any letter case - and then re-cased.
     """
     if not spelling:
         return a_string
     from dictionaries.word_transforms import usa_convert
     table = usa_convert
     if spelling.lower() == 'uk':
         table = dict((v, k) for k, v in usa_convert.items())

     def respell(piece):
         """Convert one piece and apply the casing rules."""
         word = table.get(piece.lower(), piece)
         if piece.islower() or preserve_case is False:
             return word.lower()
         if piece.isupper() and preserve_case:
             return word.upper()
         if piece.istitle() and preserve_case:
             return word.title()
         # mixed-case piece: keep whatever the lookup produced
         return word

     return '/'.join([respell(piece) for piece in a_string.split('/')])
예제 #7
0
 def correct_spelling(a_string):
     """Apply the configured spelling to a '/'-delimited string.

     ``spelling`` and ``preserve_case`` come from the enclosing scope. A
     falsy ``spelling`` returns ``a_string`` as given. Otherwise each
     segment is replaced by its entry in ``usa_convert`` (inverted for
     'uk'), falling back to the segment itself, and its case is adjusted.
     """
     if not spelling:
         return a_string
     from dictionaries.word_transforms import usa_convert
     if spelling.lower() == 'uk':
         usa_convert = {v: k for k, v in list(usa_convert.items())}
     respelled = []
     for token in a_string.split('/'):
         candidate = usa_convert.get(token.lower(), token)
         if token.islower() or preserve_case is False:
             candidate = candidate.lower()
         elif preserve_case:
             # mirror the case pattern of the original token
             if token.isupper():
                 candidate = candidate.upper()
             elif token.istitle():
                 candidate = candidate.title()
         respelled.append(candidate)
     return '/'.join(respelled)
예제 #8
0
def add_verb_inflections(verb_list):
    """Add verb inflections to ``verb_list``.

    For every word the inflections are taken from the mapping returned by
    ``load_verb_data()``. A word missing from that mapping is kept as it is
    when it is itself a listed inflection or contains an apostrophe;
    otherwise its forms are guessed with ``find_lexeme``. "n't" and " not"
    are stripped from every form, ``be`` and ``have`` also contribute their
    contractions, and finally each collected form gains its counterpart
    spelling from ``usa_convert`` (looked up in both directions).

    :param verb_list: iterable of verbs
    :returns: sorted list of unique forms, byte strings decoded to text
    """
    from dictionaries.word_transforms import usa_convert
    uk_convert = {v: k for k, v in usa_convert.items()}
    from dictionaries.process_types import find_lexeme

    # get lexemes
    lexemes = load_verb_data()

    # all verbs in all inflections; a set keeps the per-word membership
    # test in the loop below constant-time instead of scanning a list
    allverbs = set(v for lst in lexemes.values() for v in lst if v)

    contractions = {
        'be': [r"'m", r"'re", r"'s"],
        'have': [r"'d", r"'s", r"'ve"],
    }

    verbforms = set()
    for w in verb_list:
        verbforms.add(w)
        try:
            # use dict first
            wforms = lexemes[w]
        except KeyError:
            # an inflected form or a contraction is kept but not expanded
            if w in allverbs or "'" in w:
                continue
            # if it's a coinage, guess
            wforms = find_lexeme(w)
        verbforms.update(
            form.replace("n't", "").replace(" not", "")
            for form in wforms if form)
        # deal with contractions
        verbforms.update(contractions.get(w, ()))

    # go over again, and add both possible spellings; collected separately
    # so that the set is not modified while it is being iterated
    respelled = set()
    for table in (usa_convert, uk_convert):
        respelled.update(table[w] for w in verbforms if w in table)
    verbforms = sorted(verbforms | respelled)

    # Ensure text: on Python 2 this matches
    # ``unicode(w, 'utf-8', errors='ignore')``; on Python 3, where the
    # ``unicode`` name is undefined, ``str`` values pass through unchanged.
    return [
        w.decode('utf-8', 'ignore') if isinstance(w, bytes) else w
        for w in verbforms
    ]
예제 #9
0
def add_verb_inflections(verb_list):
    """Add verb inflections to ``verb_list``.

    Inflections are read from the mapping returned by ``load_verb_data()``.
    A word that is not a key of that mapping is only kept as it is when it
    already is one of the listed inflections or contains an apostrophe; any
    other unknown word has its forms guessed by ``find_lexeme``. "n't" and
    " not" are removed from the forms, ``be`` and ``have`` add their
    contractions, and every collected form gains the counterpart spelling
    found in ``usa_convert`` or its inverse.

    :param verb_list: iterable of verbs
    :returns: sorted list of the unique resulting forms
    """
    from dictionaries.word_transforms import usa_convert
    uk_convert = {v: k for k, v in usa_convert.items()}
    from dictionaries.process_types import find_lexeme

    # get lexemes
    lexemes = load_verb_data()

    # every non-empty inflection of every verb, as a set so that the lookup
    # inside the loop does not rescan a list for each word
    allverbs = {v for lst in lexemes.values() for v in lst if v}

    contractions = {
        'be': [r"'m", r"'re", r"'s"],
        'have': [r"'d", r"'s", r"'ve"],
    }

    verbforms = set()
    for w in verb_list:
        verbforms.add(w)
        wforms = lexemes.get(w)
        if wforms is None and w not in lexemes:
            # if not in dict: an inflection or contraction is not expanded
            if w in allverbs or "'" in w:
                continue
            # if it's a coinage, guess
            wforms = find_lexeme(w)
        verbforms.update(
            form.replace("n't", "").replace(" not", "")
            for form in wforms if form)
        # deal with contractions
        verbforms.update(contractions.get(w, ()))

    # go over again, and add both possible spellings; gathered in a separate
    # set because ``verbforms`` must not change while it is iterated
    respelled = set()
    for table in (usa_convert, uk_convert):
        respelled.update(table[w] for w in verbforms if w in table)

    # ``sorted`` already returns a fresh list, no extra copy is needed
    return sorted(verbforms | respelled)