예제 #1
0
def seed_features(item, context):
    """Return the feature vector for one token, mirroring the seedset rules.

    Args:
        item: An ``(index, (token, tag))`` tuple; only the token string
            (``item[1][0]``) is used.
        context: An indexable whose element ``[1]`` is passed to
            ``is_digbased`` for the measurement and am/pm/AD/BC features.
            NOTE(review): presumably the token preceding ``item`` -- confirm
            against ``gen_frame``.

    Returns:
        A list of 16 feature values, one per rule, in a fixed order.  Most
        are plain booleans; the entries that end in ``is_digbased(...)``
        carry whatever truthy/falsy value that helper returns.
    """
    nsw = item[1][0]
    # Both of these are needed by more than one feature below.
    acronym = acr_pattern.match(nsw)
    plural_upper = nsw.endswith('s') and nsw[:-1].isupper()
    out = [
        # Courtesy titles.
        nsw in ('Mr.', 'Mrs.', 'Mr', 'Mrs'),
        # Latin abbreviations.
        nsw in ('i.e.', 'ie.', 'e.g.', 'eg.'),
        # Title-cased token ending in a full stop that is not an acronym.
        nsw.endswith('.') and nsw.istitle() and not acronym,
        # Upper-case, passes is_cons, not a measurement after a number,
        # and not an acronym.
        (nsw.isupper() and is_cons(nsw)
         and not (nsw in meas_dict and is_digbased(context[1]))
         and not acronym),
        # Measurement unit (singular or plural) with a digit-based context.
        (nsw in meas_dict or nsw in meas_dict_pl) and is_digbased(context[1]),
        # am/pm or AD/BC marker with a digit-based context.
        (nsw in ampm or nsw in adbc) and is_digbased(context[1]),
        # Longer title-cased alphabetic word that fails is_cons.
        (nsw.istitle() and nsw.isalpha() and len(nsw) > 3
         and not is_cons(nsw)),
        # O'/D' prefixed names and possessives of title-cased words.
        ((nsw.startswith(("O'", "D'")) and nsw[2:].istitle())
         or (nsw.endswith("s'") and nsw[:-2].istitle())
         or (nsw.endswith("'s") and nsw[:-2].istitle())),
        # Known word (or its plural) that is not upper-case and not am/pm.
        (not (nsw.isupper() or plural_upper)
         and (nsw.lower() in wordlist
              or (nsw[:-1].lower() in wordlist and nsw.endswith('s')))
         and nsw not in ampm),
        triple_rep(nsw) and len(nsw) > 3,
        # Acronym that is not also a measurement unit.
        bool(acronym and nsw not in meas_dict),
        nsw.isalpha() and nsw.islower() and len(nsw) > 3,
        # Pluralised upper-case token, e.g. an upper-case stem plus 's'.
        plural_upper,
        nsw in element_dict,
        # ``isalpha`` must be *called*: the bare method object is always
        # truthy, which silently turned this into "lower-case and len > 2".
        nsw.isalpha() and nsw.islower() and len(nsw) > 2,
        nsw.lower() in abbrev_dict or nsw in ('St.', 'st.', 'St'),
    ]
    return out
예제 #2
0
def tokenize_basic(text):
    """Split *text* into rough tokens, peeling off brackets and punctuation.

    The text is split on single spaces and on newlines; empty pieces are
    skipped.  Every piece except the last is then handled as follows (first
    matching rule wins):

    * purely alphabetic pieces are kept whole;
    * a leading ``(``, ``[`` or ``{`` becomes its own token, and if the
      piece also ends in a closing bracket that is split off as well;
    * a trailing ``)``, ``]`` or ``}`` becomes its own token;
    * a trailing ``!`` or ``?`` after an alphabetic word is split off;
    * a trailing ``.`` after an alphabetic word is split off only when the
      next piece is title-cased and in ``wordlist``; if that next piece is
      also in ``names_lower`` the stop is split only when the word itself
      is in ``wordlist``;
    * a trailing ``,``, ``:`` or ``;`` is split off;
    * anything else is kept whole.

    The last piece has no successor, so it uses a reduced rule set: no
    bracket handling, and a trailing ``.`` is split off when the stem is in
    ``wordlist`` or ``is_digbased`` accepts it.

    Args:
        text: The string to tokenize.

    Returns:
        A list of token strings.
    """
    openers = ('(', '[', '{')
    closers = (')', ']', '}')
    guess = [d for w in text.split(' ') for d in w.split('\n')]
    out = []
    for i, tok in enumerate(guess[:-1]):
        if not tok:
            continue
        if tok.isalpha():
            out.append(tok)
        elif tok[0] in openers:
            if len(tok) == 1:
                # A bare opening bracket: nothing to peel off.  (Indexing
                # the second character here used to raise IndexError.)
                out.append(tok)
            elif tok[-1] in closers:
                # Check the LAST character for the closing bracket so that
                # "(word)" yields "(", "word", ")".
                out.extend([tok[0], tok[1:-1], tok[-1]])
            else:
                out.extend([tok[0], tok[1:]])
        elif tok[-1] in closers:
            out.extend([tok[:-1], tok[-1]])
        elif tok[-1] in ('!', '?') and tok[:-1].isalpha():
            out.extend([tok[:-1], tok[-1]])
        elif tok[-1] == '.' and tok[:-1].isalpha():
            stem = tok[:-1]
            following = guess[i + 1]
            if following.istitle() and following.lower() in wordlist:
                if following.lower() in names_lower:
                    # Next piece could be a name ("Dr. Smith"), so only
                    # treat the stop as sentence-final for known words.
                    if stem in wordlist:
                        out.extend([stem, '.'])
                    else:
                        out.append(tok)
                else:
                    out.extend([stem, '.'])
            # NOTE(review): the stem is alphabetic on this path, so this
            # branch looks unreachable unless is_digbased accepts letters;
            # it may have been meant for the outer chain.  Kept as-is.
            elif is_digbased(stem):
                out.extend([stem, '.'])
            else:
                out.append(tok)
        elif tok.endswith((',', ':', ';')):
            out.extend([tok[:-1], tok[-1]])
        else:
            out.append(tok)

    # The final piece cannot look ahead, so it gets its own simpler rules.
    last = guess[-1]
    if not last:
        pass
    elif last.isalpha():
        out.append(last)
    elif last[-1] in ('!', '?') and last[:-1].isalpha():
        out.extend([last[:-1], last[-1]])
    elif last[-1] == '.' and last[:-1] in wordlist:
        out.extend([last[:-1], '.'])
    elif last[-1] == '.' and is_digbased(last[:-1]):
        out.extend([last[:-1], '.'])
    elif last.endswith((',', ':', ';')):
        out.extend([last[:-1], last[-1]])
    else:
        out.append(last)
    return out
예제 #3
0
def seed(dict_tup, text):
    """Assign a seedset label to the input tuple.

    Generate seeds for the seedset by assigning integer labels to obvious
    cases.  The rules are tried in order and the first match wins; where
    there is no obvious case, ``-1`` is returned.

    Args:
        dict_tup: An ``(index, (token, tag))`` tuple.
        text: The token collection the tuple comes from; it is only passed
            on to ``gen_frame`` to build the context frame.

    Returns:
        ``1``, ``2`` or ``3`` for a recognised case, ``-1`` otherwise.
        NOTE(review): the meaning of each label is not visible in this
        function -- confirm against the classifier that consumes them.
    """
    ind, nsw, tag = dict_tup[0], dict_tup[1][0], dict_tup[1][1]
    context = gen_frame((ind, (nsw, tag)), text)
    if nsw in ['Mr.', 'Mrs.', 'Mr', 'Mrs']:
        return 3
    elif nsw in ['i.e.', 'ie.', 'e.g.', 'eg.']:
        return 2
    elif nsw.endswith('.') and nsw.istitle() and not acr_pattern.match(nsw):
        return 1
    elif nsw.lower() in abbrev_dict or nsw in ['St.', 'st.', 'St']:
        return 1
    elif (nsw.isupper() and is_cons(nsw)
          and not (nsw in meas_dict and is_digbased(context[1]))):
        return 2
    elif nsw.endswith('s') and nsw[:-1].isupper():
        return 2
    elif (nsw in meas_dict or nsw in meas_dict_pl) and is_digbased(context[1]):
        return 1
    elif (nsw in ampm or nsw in adbc) and is_digbased(context[1]):
        return 2
    elif nsw.istitle() and nsw.isalpha() and len(nsw) > 3 and not is_cons(nsw):
        return 3
    elif ((nsw.startswith(("O'", "D'")) and nsw[2:].istitle())
          or (nsw.endswith("s'") and nsw[:-2].istitle())):
        return 3
    elif nsw in element_dict:
        return 1
    elif (not (nsw.isupper() or nsw.endswith('s') and nsw[:-1].isupper())
          and (nsw.lower() in wordlist
               or (nsw[:-1].lower() in wordlist and nsw.endswith('s')))
          and nsw not in ampm):
        return 3
    elif triple_rep(nsw) and len(nsw) > 3:
        return 3
    elif nsw.isalpha() and nsw.islower() and len(nsw) > 3:
        return 3
    elif acr_pattern.match(nsw) and nsw not in meas_dict:
        return 2
    elif len(nsw) == 1:
        return 2
    # ``isalpha`` must be *called*: the bare method object is always truthy,
    # which let any lower-case token (digits, hyphens, ...) through here.
    elif nsw.isalpha() and nsw.islower() and len(nsw) > 2:
        return 3
    else:
        return -1
예제 #4
0
def retagify(dic, verbose=True):
    """Retag each part of a SPLT token by appending a category to its tag.

    For every ``index -> (token, tag)`` entry, one of ``'NUMB'``,
    ``'ALPHA'`` or ``'MISC'`` is appended to ``tag`` (so a ``'SPLT-'`` tag
    becomes ``'SPLT-NUMB'``, ``'SPLT-ALPHA'`` or ``'SPLT-MISC'``).  The
    first matching rule wins:

    1. tokens longer than 100 characters -> ``'MISC'``;
    2. ``is_digbased(token)`` -> ``'NUMB'``;
    3. ``only_alpha(token)`` and the token is not mixed-case, or is at most
       3 characters, or is a non-mixed-case stem plus ``'s'`` -> ``'ALPHA'``;
    4. token is a key of ``meas_dict`` -> ``'ALPHA'``;
    5. anything else -> ``'MISC'``.

    Args:
        dic: Mapping of index to ``(token, tag)``.
        verbose: When true, write a ``"N of M retagged"`` progress line to
            stdout (rewritten in place with ``\\r``).

    Returns:
        A new dict with the same keys and ``(token, new_tag)`` values.
    """
    out = {}
    total = len(dic)
    for ind, (it, tag) in dic.items():
        if verbose:
            sys.stdout.write("\r{} of {} retagged".format(len(out), total))
            sys.stdout.flush()
        if len(it) > 100:
            # Over-long tokens are MISC outright.  This must be part of the
            # elif chain: as a stand-alone ``if`` its result was always
            # overwritten by the rules below.
            suffix = 'MISC'
        elif is_digbased(it):
            suffix = 'NUMB'
        elif (only_alpha(it)
              and (not mixedcase_pattern.match(it) or len(it) <= 3 or
                   (it[-1] == 's' and not mixedcase_pattern.match(it[:-1])))):
            suffix = 'ALPHA'
        elif it in meas_dict:
            suffix = 'ALPHA'
        else:
            suffix = 'MISC'
        out[ind] = (it, tag + suffix)
    if verbose:
        sys.stdout.write("\r{} of {} retagged".format(len(out), total))
        sys.stdout.flush()
        print("\n")
    return out
예제 #5
0
def mixedalnum_split(nsw):
    """Split a token on transitions from letters to numbers or numbers to
    letters.

    The token is scanned left to right while tracking the category of the
    current run (``'num'``, ``'let'`` or, for a token that starts with
    neither, ``'punc'``).  A new part starts whenever a digit follows a
    letter run or a letter follows a digit run.  Non-alphanumeric
    characters never start a new part: inside the token they stay attached
    to the current run, and leading ones stay attached to the first
    alphanumeric run that follows them.

    Two special cases:

    * a ``'°'`` directly after a number and followed by exactly ``C``,
      ``F``, ``K`` or ``Re`` is split off together with its unit
      (``'25°C'`` -> ``['25', '°C']``);
    * a three-part result ``[number, letters, digits]`` whose last two
      parts together form a key of ``meas_dict`` is merged into
      ``[number, letters + digits]``.

    Args:
        nsw: The token string to split.

    Returns:
        A list of parts.  If anything goes wrong (for example an empty
        ``nsw``), the input is returned unchanged -- as the original
        string, not wrapped in a list.
    """
    try:
        out = []
        ind = 0
        if nsw[0] in digits:
            cat = 'num'
        elif nsw[0].isalpha():
            # ``isalpha`` must be called; the bare method object is always
            # truthy and made the 'punc' state unreachable.
            cat = 'let'
        else:
            cat = 'punc'
        for i in range(1, len(nsw)):
            ch = nsw[i]
            if ch in digits:
                if cat == 'let':
                    out.append(nsw[ind:i])
                    ind = i
                # Also leaves the initial 'punc' state without splitting.
                cat = 'num'
            elif ch.isalpha():
                if cat == 'num':
                    out.append(nsw[ind:i])
                    ind = i
                cat = 'let'
            elif (ch == '°' and cat == 'num'
                  and nsw[i + 1:] in ['C', 'F', 'K', 'Re']):
                # Temperature: keep the degree sign with its unit and stop.
                out.append(nsw[ind:i])
                ind = i
                break
        out.append(nsw[ind:])
        if (len(out) == 3 and is_digbased(out[0]) and out[2].isdigit()
                and out[1] + out[2] in meas_dict):
            out = [out[0], out[1] + out[2]]
        return out
    except Exception:
        # Best effort: hand the token back untouched.  KeyboardInterrupt
        # and SystemExit are not Exception subclasses, so they propagate.
        return nsw
예제 #6
0
def expand_EXPN(nsw, i, text, user_abbrevs=None):
    """Expand an abbreviation to its best possible match.

    Resolution order:

    1. ``St.``/``st.``/``St`` is resolved to ``'Saint'`` or ``'street'``
       from the neighbouring tokens.
    2. A key of ``meas_dict`` (optionally with a trailing ``'.'``) after a
       digit-based token is expanded to its singular form when that token
       is ``'1'`` and to its plural form otherwise.
    3. Otherwise the token (minus a trailing ``'.'`` if that makes it a
       known abbreviation) is looked up in the abbreviation table.
       Candidates are filtered by the tag from ``abbrev_tag``, falling
       back to ``abbrev_tag_univ``, then ranked by ``overlap`` with the
       context and finally by their ``brown_common`` value.
    4. A plural of a known abbreviation is expanded via its stem and gets
       ``'s'`` appended.
    5. Anything else goes through ``maximum_overlap``.

    Args:
        nsw: The abbreviation token.
        i: Index of the token in ``text``.  A non-``int`` index is
            truncated with ``int(i)`` in the measurement branch.
            NOTE(review): presumably sub-token indices of split tokens --
            confirm with the caller.
        text: The indexable collection of tokens that ``i`` refers to.
        user_abbrevs: Optional user-supplied abbreviations; when non-empty
            they are passed through ``create_user_abbrevs`` and replace the
            default table.

    Returns:
        The expansion, or ``nsw`` itself when no expansion is found or an
        unexpected (non-lookup) error occurs.

    Raises:
        LookupError: Deliberately propagated (this includes ``KeyError``
            and ``IndexError`` raised by the lookups below).
    """
    try:
        if user_abbrevs:
            abbrevs = create_user_abbrevs(user_abbrevs)
        else:
            abbrevs = abbrevs_orig
        if nsw in ['St.', 'st.', 'St']:
            if i < len(text):
                # Only look ahead when a following token exists: an
                # unguarded text[i + 1] on the last token raised IndexError,
                # which the LookupError clause below re-raises to the caller.
                has_next = i + 1 < len(text)
                following = text[i + 1] if has_next else ''
                if has_next and following.lower() in names_lower:
                    return 'Saint'
                elif has_next and following.endswith("'s"):
                    if following[:-2].lower() in names_lower:
                        return 'Saint'
                elif text[i - 1].istitle():
                    return 'street'
                elif has_next and following.istitle():
                    return 'Saint'
        elif nsw in meas_dict:
            if isinstance(i, int):
                if is_digbased(text[i - 1]):
                    if text[i - 1] == '1':
                        return meas_dict[nsw]
                    else:
                        return meas_dict_pl[nsw]
            else:
                full = text[int(i)]
                index = full.find(nsw)
                if index == 0:
                    # Unit starts the full token: the quantity, if any, is
                    # the previous token.
                    if is_digbased(text[int(i) - 1]):
                        if text[int(i) - 1] == '1':
                            return meas_dict[nsw]
                        else:
                            return meas_dict_pl[nsw]
                else:
                    # Unit is preceded by something inside the same token.
                    if is_digbased(full[:index]):
                        if text[int(i) - 1] == '1':
                            return meas_dict[nsw]
                        else:
                            return meas_dict_pl[nsw]
        elif (nsw.endswith('.') and nsw[:-1] in meas_dict
              and is_digbased(text[i - 1])):
            if text[i - 1] == '1':
                return meas_dict[nsw[:-1]]
            else:
                return meas_dict_pl[nsw[:-1]]

        # No special case returned: fall back to the abbreviation table.
        if nsw.endswith('.') and nsw[:-1].lower() in abbrevs:
            w = nsw[:-1]
        else:
            w = nsw
        if w.lower() in abbrevs:
            cands = abbrevs[w.lower()]
            true_tag = abbrev_tag(i, text)
            true_tag_univ = abbrev_tag_univ(i, text)
            if len(cands) == 1:
                cand = cands[0]
                if pos_tag_dict_univ[cand.lower()] in [true_tag_univ, tuple()]:
                    return cand
            # Prefer candidates matching the fine-grained tag, then fall
            # back to the universal tag.
            matches = [cand for cand in cands
                       if true_tag in pos_tag_dict[cand.lower()]]
            if not matches:
                matches = [cand for cand in cands
                           if true_tag_univ in pos_tag_dict_univ[cand.lower()]]
            if matches:
                if len(matches) == 1:
                    return matches[0]
                # Keep the candidates with the highest context overlap.
                best = 0
                current = []
                for cand in matches:
                    olap = overlap(i, cand, text)
                    if olap > best and cand in brown_common:
                        best = olap
                        current = [cand]
                    elif olap == best and best != 0:
                        current.append(cand)
                    elif cand in states.values() and not current:
                        current.append(cand)
                # Break ties by the larger brown_common value.  The
                # comparison must be ">": with ``best`` starting at 0,
                # "<" could never succeed and no candidate was ever chosen.
                best = 0
                exp = ''
                for c in current:
                    if c in states.values():
                        return c
                    elif c in brown_common:
                        freq = brown_common[c]
                    else:
                        freq = 0
                    if freq > best:
                        best = freq
                        exp = c
            else:
                exp = maximum_overlap(w, i, text)
        # Plural forms: expand the stem with the same abbreviation table
        # that was just consulted, then re-attach the 's'.
        elif w.lower().endswith('s.') and w.lower()[:-2] in abbrevs:
            return expand_EXPN(w.lower()[:-2], i, text, user_abbrevs) + 's'
        elif w.lower().endswith('s') and w.lower()[:-1] in abbrevs:
            return expand_EXPN(w.lower()[:-1], i, text, user_abbrevs) + 's'
        else:
            exp = maximum_overlap(w, i, text)
        if exp == '':
            return nsw
        else:
            return exp
    except LookupError:
        raise
    except Exception:
        # Best effort: any other failure leaves the token unexpanded.
        # KeyboardInterrupt and SystemExit are not Exception subclasses,
        # so they propagate untouched.
        return nsw