def seed_features(item, context):
    """Return a list of features equivalent to those used in the seedset.

    Each entry is the truth value of one hand-written rule about the token.
    The position of every feature in the returned list is significant, so
    the order below must not be changed.

    Args:
        item: ``(index, (token, tag))`` tuple; only the token is inspected.
        context: indexable frame for the token. Only ``context[1]`` is read,
            and only to test it with ``is_digbased``.

    Returns:
        A list of 16 truth values, one per rule.
    """
    nsw = item[1][0]
    return [
        # Titles.
        nsw in ('Mr.', 'Mrs.', 'Mr', 'Mrs'),
        # Latin abbreviations.
        nsw in ('i.e.', 'ie.', 'e.g.', 'eg.'),
        # Title-cased token ending in a period that is not an acronym.
        nsw.endswith('.') and nsw.istitle() and not acr_pattern.match(nsw),
        # Upper-case, is_cons() token that is neither a measurement
        # attached to a number nor an acronym.
        (nsw.isupper() and is_cons(nsw)
         and not (nsw in meas_dict and is_digbased(context[1]))
         and not acr_pattern.match(nsw)),
        # Measurement unit attached to a number.
        (nsw in meas_dict or nsw in meas_dict_pl) and is_digbased(context[1]),
        # am/pm or AD/BC marker attached to a number.
        (nsw in ampm or nsw in adbc) and is_digbased(context[1]),
        # Longer title-cased alphabetic token for which is_cons() is false.
        (nsw.istitle() and nsw.isalpha() and len(nsw) > 3
         and not is_cons(nsw)),
        # O'/D' prefixed names and possessives of title-cased words.
        ((nsw.startswith(("O'", "D'")) and nsw[2:].istitle())
         or (nsw.endswith("s'") and nsw[:-2].istitle())
         or (nsw.endswith("'s") and nsw[:-2].istitle())),
        # Known word (or its plural) that is not upper-case and not am/pm.
        (not (nsw.isupper() or nsw.endswith('s') and nsw[:-1].isupper())
         and (nsw.lower() in wordlist
              or (nsw[:-1].lower() in wordlist and nsw.endswith('s')))
         and nsw not in ampm),
        triple_rep(nsw) and len(nsw) > 3,
        # Acronym pattern that is not a measurement unit.
        bool(acr_pattern.match(nsw) and nsw not in meas_dict),
        nsw.isalpha() and nsw.islower() and len(nsw) > 3,
        # Plural of an upper-case token.
        nsw.endswith('s') and nsw[:-1].isupper(),
        nsw in element_dict,
        # isalpha must be *called*: the bare bound method is always truthy,
        # which silently dropped the alphabetic requirement of this rule.
        nsw.isalpha() and nsw.islower() and len(nsw) > 2,
        nsw.lower() in abbrev_dict or nsw in ('St.', 'st.', 'St'),
    ]
def tokenize_basic(text):
    """Split raw text into a flat list of tokens.

    The text is first cut on spaces and newlines; empty pieces are skipped.
    Each piece is then inspected and punctuation is split off as follows:

    * a leading ``(``, ``[`` or ``{`` becomes its own token, and so does a
      matching trailing closer on the same piece;
    * a trailing ``)``, ``]`` or ``}`` becomes its own token;
    * a trailing ``!`` or ``?`` after an alphabetic word is split off;
    * a trailing ``.`` after an alphabetic word is split off only when the
      next piece is a title-cased entry of ``wordlist`` (with an extra
      ``names_lower`` check) or the word is digit-based;
    * a trailing ``,``, ``:`` or ``;`` is split off.

    The final piece has no successor, so it is handled separately with a
    reduced rule set (no bracket handling; a trailing ``.`` is split off
    when the word is in ``wordlist`` or is digit-based).

    Args:
        text: the string to tokenize.

    Returns:
        List of token strings in order of appearance.
    """
    openers = ('(', '[', '{')
    closers = (')', ']', '}')

    guess = [piece for chunk in text.split(' ') for piece in chunk.split('\n')]
    out = []

    # Pair every piece except the last with the piece that follows it.
    for tok, following in zip(guess, guess[1:]):
        if not tok:
            continue
        if tok.isalpha():
            out.append(tok)
        elif tok[0] in openers:
            # The closer has to be looked for at the END of the piece.
            # Testing index 1 raised IndexError on a lone opening bracket
            # and left "(word)" as ['(', 'word)'].
            if len(tok) > 1 and tok[-1] in closers:
                parts = [tok[0], tok[1:-1], tok[-1]]
            else:
                parts = [tok[0], tok[1:]]
            # A lone "(" or an empty "()" has no inner text to emit.
            out.extend(part for part in parts if part)
        elif tok[-1] in closers:
            out.extend([tok[:-1], tok[-1]])
        elif tok[-1] in ('!', '?') and tok[:-1].isalpha():
            out.extend([tok[:-1], tok[-1]])
        elif tok[-1] == '.' and tok[:-1].isalpha():
            stem = tok[:-1]
            if following.istitle() and following.lower() in wordlist:
                # When the next word could also be a name, only split if
                # the current word itself is a known word.
                split_period = (following.lower() not in names_lower
                                or stem in wordlist)
            else:
                split_period = is_digbased(stem)
            if split_period:
                out.extend([stem, '.'])
            else:
                out.append(tok)
        elif tok.endswith((',', ':', ';')):
            out.extend([tok[:-1], tok[-1]])
        else:
            out.append(tok)

    # The last piece has no following piece to consult.
    last = guess[-1]
    if not last:
        pass
    elif last.isalpha():
        out.append(last)
    elif last[-1] in ('!', '?') and last[:-1].isalpha():
        out.extend([last[:-1], last[-1]])
    elif last[-1] == '.' and last[:-1] in wordlist:
        out.extend([last[:-1], '.'])
    elif last[-1] == '.' and is_digbased(last[:-1]):
        out.extend([last[:-1], '.'])
    elif last.endswith((',', ':', ';')):
        out.extend([last[:-1], last[-1]])
    else:
        out.append(last)
    return out
def seed(dict_tup, text):
    """Assign a seedset label to the input tuple.

    Generate seeds for the seedset by assigning integer labels to obvious
    cases. Where there is no obvious case, -1 is returned.

    Args:
        dict_tup: ``(index, (token, tag))`` tuple describing one token.
        text: passed to ``gen_frame`` together with the token to build the
            context frame; only element ``1`` of that frame is read here.

    Returns:
        1, 2 or 3 from the first rule that matches (rules are tried in the
        order written, so earlier rules win), otherwise -1.
    """
    ind, nsw, tag = dict_tup[0], dict_tup[1][0], dict_tup[1][1]
    context = gen_frame((ind, (nsw, tag)), text)

    if nsw in ('Mr.', 'Mrs.', 'Mr', 'Mrs'):
        return 3
    if nsw in ('i.e.', 'ie.', 'e.g.', 'eg.'):
        return 2
    if nsw.endswith('.') and nsw.istitle() and not acr_pattern.match(nsw):
        return 1
    if nsw.lower() in abbrev_dict or nsw in ('St.', 'st.', 'St'):
        return 1
    if (nsw.isupper() and is_cons(nsw)
            and not (nsw in meas_dict and is_digbased(context[1]))):
        return 2
    if nsw.endswith('s') and nsw[:-1].isupper():
        return 2
    if (nsw in meas_dict or nsw in meas_dict_pl) and is_digbased(context[1]):
        return 1
    if (nsw in ampm or nsw in adbc) and is_digbased(context[1]):
        return 2
    if (nsw.istitle() and nsw.isalpha() and len(nsw) > 3
            and not is_cons(nsw)):
        return 3
    if ((nsw.startswith(("O'", "D'")) and nsw[2:].istitle())
            or (nsw.endswith("s'") and nsw[:-2].istitle())):
        return 3
    if nsw in element_dict:
        return 1
    if (not (nsw.isupper() or nsw.endswith('s') and nsw[:-1].isupper())
            and (nsw.lower() in wordlist
                 or (nsw[:-1].lower() in wordlist and nsw.endswith('s')))
            and nsw not in ampm):
        return 3
    if triple_rep(nsw) and len(nsw) > 3:
        return 3
    if nsw.isalpha() and nsw.islower() and len(nsw) > 3:
        return 3
    if acr_pattern.match(nsw) and nsw not in meas_dict:
        return 2
    if len(nsw) == 1:
        return 2
    # isalpha must be *called*: the bare bound method is always truthy, so
    # any lower-case token longer than two characters (including ones with
    # digits or punctuation) used to be labelled 3 here.
    if nsw.isalpha() and nsw.islower() and len(nsw) > 2:
        return 3
    return -1
def retagify(dic, verbose=True):
    """Retag each part of a SPLT token as 'SPLT-ALPHA', 'SPLT-NUMB' or
    'SPLT-MISC'.

    The suffix is chosen by the first rule that applies:

    1. longer than 100 characters -> 'MISC';
    2. ``is_digbased`` -> 'NUMB';
    3. ``only_alpha`` and either not mixed-case, at most 3 characters long,
       or a not-mixed-case stem followed by a final 's' -> 'ALPHA';
    4. listed in ``meas_dict`` -> 'ALPHA';
    5. anything else -> 'MISC'.

    Args:
        dic: mapping of index -> ``(token, tag)``; the suffix is appended
            to the existing tag string.
        verbose: when true, write a progress counter to stdout.

    Returns:
        New dict with the same keys and ``(token, tag + suffix)`` values.
    """
    out = {}
    total = len(dic)
    for ind, (it, tag) in dic.items():
        if verbose:
            sys.stdout.write("\r{} of {} retagged".format(len(out), total))
            sys.stdout.flush()
        # The length check must be the head of the same if/elif chain as
        # the other rules; as a separate `if`, its 'MISC' result was always
        # overwritten by whichever later rule matched.
        if len(it) > 100:
            suffix = 'MISC'
        elif is_digbased(it):
            suffix = 'NUMB'
        elif (only_alpha(it)
              and (not mixedcase_pattern.match(it)
                   or len(it) <= 3
                   or (it[-1] == 's'
                       and not mixedcase_pattern.match(it[:-1])))):
            suffix = 'ALPHA'
        elif it in meas_dict:
            suffix = 'ALPHA'
        else:
            suffix = 'MISC'
        out[ind] = (it, tag + suffix)
    if verbose:
        sys.stdout.write("\r{} of {} retagged".format(len(out), total))
        sys.stdout.flush()
        print("\n")
    return out
def mixedalnum_split(nsw):
    """Split tokens on transitions from letters to numbers or numbers to
    letters.

    Characters that are neither digits nor letters never trigger a split:
    they stay attached to the run they appear in. A degree sign that follows
    a number and is followed by exactly 'C', 'F', 'K' or 'Re' starts a final
    part of its own (e.g. '20' + '°C').

    After splitting, a three-part result whose first part is digit-based,
    whose last part is all digits and whose last two parts joined are a
    ``meas_dict`` entry is merged back into two parts.

    Args:
        nsw: the token string to split.

    Returns:
        List of parts. If anything goes wrong (for example an empty token),
        the input is returned unchanged instead of a list.
    """
    try:
        out = []
        start = 0

        # Category of the run currently being scanned.
        first = nsw[0]
        if first in digits:
            cat = 'num'
        elif first.isalpha():
            # isalpha has to be called; the bare method object is always
            # truthy, which made the 'punc' category unreachable and split
            # leading punctuation away from a following number.
            cat = 'let'
        else:
            cat = 'punc'

        for i in range(1, len(nsw)):
            ch = nsw[i]
            if ch in digits:
                new_cat = 'num'
            elif ch.isalpha():
                new_cat = 'let'
            elif (ch == '°' and cat == 'num'
                  and nsw[i + 1:] in ['C', 'F', 'K', 'Re']):
                out.append(nsw[start:i])
                start = i
                break
            else:
                # Other characters leave the current run untouched.
                continue
            # Only a letter<->number change is a boundary; leaving the
            # initial punctuation run just updates the category.
            if cat != 'punc' and cat != new_cat:
                out.append(nsw[start:i])
                start = i
            cat = new_cat

        out.append(nsw[start:])

        # Re-join units that themselves end in a digit.
        if (len(out) == 3 and is_digbased(out[0]) and out[2].isdigit()
                and out[1] + out[2] in meas_dict):
            out = [out[0], out[1] + out[2]]
        return out
    except Exception:
        # Best-effort fallback kept from the original behaviour;
        # KeyboardInterrupt and SystemExit are not Exception subclasses and
        # therefore still propagate.
        return nsw
def expand_EXPN(nsw, i, text, user_abbrevs=None):
    """Expand abbreviations to best possible match. If no close matches,
    return nsw.

    Resolution order:

    1. 'St.'/'st.'/'St' is resolved to 'Saint' or 'street' from the
       neighbouring tokens.
    2. A ``meas_dict`` unit (optionally written with a trailing period)
       that follows a digit-based quantity is expanded to its singular
       form when the quantity is exactly '1', otherwise to its plural.
    3. Otherwise the token (minus a trailing period) is looked up in the
       abbreviation table and the candidates are filtered by POS tag, then
       by ``overlap`` with the text, then by ``brown_common`` frequency.
       Plural forms ('...s', '...s.') are expanded via their stem.
    4. Failing all that, ``maximum_overlap`` is used.

    Args:
        nsw: the abbreviation token.
        i: position of the token in ``text``. May be a non-int value
            (presumably the index of a split sub-token -- confirm against
            callers), in which case ``int(i)`` is used to fetch the full
            token.
        text: indexable sequence of tokens.
        user_abbrevs: optional user abbreviations; when non-empty they are
            passed to ``create_user_abbrevs`` and used instead of
            ``abbrevs_orig``.

    Returns:
        The expansion, or ``nsw`` itself when nothing suitable is found.

    Raises:
        LookupError: deliberately propagated (this includes IndexError and
            KeyError). Any other Exception results in ``nsw`` being
            returned.
    """
    try:
        if user_abbrevs:
            abbrevs = create_user_abbrevs(user_abbrevs)
        else:
            abbrevs = abbrevs_orig

        if nsw in ('St.', 'st.', 'St'):
            # The look-ahead reads text[i + 1], so that is the index that
            # must be in range; `i < len(text)` let a sentence-final "St."
            # raise IndexError.
            if i + 1 < len(text):
                following = text[i + 1]
                if following.lower() in names_lower:
                    return 'Saint'
                elif following.endswith("'s"):
                    if following[:-2].lower() in names_lower:
                        return 'Saint'
                elif text[i - 1].istitle():
                    return 'street'
                elif following.istitle():
                    return 'Saint'
        elif nsw in meas_dict:
            if isinstance(i, int):
                quantity = text[i - 1]
            else:
                full = text[int(i)]
                index = full.find(nsw)
                if index == 0:
                    # Unit starts the full token: quantity is the token
                    # before it.
                    quantity = text[int(i) - 1]
                else:
                    # Quantity is the prefix of the same token, and it is
                    # this prefix (not the previous token) that decides
                    # between singular and plural.
                    quantity = full[:index]
            if is_digbased(quantity):
                if quantity == '1':
                    return meas_dict[nsw]
                else:
                    return meas_dict_pl[nsw]
        elif (nsw.endswith('.') and nsw[:-1] in meas_dict
              and is_digbased(text[i - 1])):
            if text[i - 1] == '1':
                return meas_dict[nsw[:-1]]
            else:
                return meas_dict_pl[nsw[:-1]]

        # Drop a trailing period only when the bare form is a known
        # abbreviation.
        if nsw.endswith('.') and nsw[:-1].lower() in abbrevs:
            w = nsw[:-1]
        else:
            w = nsw
        key = w.lower()

        if key in abbrevs:
            cands = abbrevs[key]
            true_tag = abbrev_tag(i, text)
            true_tag_univ = abbrev_tag_univ(i, text)
            if len(cands) == 1:
                cand = cands[0]
                if pos_tag_dict_univ[cand.lower()] in [true_tag_univ, tuple()]:
                    return cand

            # Prefer candidates matching the fine-grained tag; fall back to
            # the universal tag when none do.
            matches = [c for c in cands
                       if true_tag in pos_tag_dict[c.lower()]]
            if not matches:
                matches = [c for c in cands
                           if true_tag_univ in pos_tag_dict_univ[c.lower()]]

            if matches:
                if len(matches) == 1:
                    return matches[0]

                # Keep the candidates with the highest overlap score.
                best = 0
                current = []
                for cand in matches:
                    olap = overlap(i, cand, text)
                    if olap > best and cand in brown_common:
                        best = olap
                        current = [cand]
                    elif olap == best and best != 0:
                        current.append(cand)
                    elif cand in states.values() and not current:
                        current.append(cand)

                # Among those, a state name wins outright; otherwise take
                # the most frequent one. The comparison must be `>`: with
                # `<` against a starting value of 0 no candidate could ever
                # be selected.
                best = 0
                exp = ''
                for c in current:
                    if c in states.values():
                        return c
                    elif c in brown_common:
                        freq = brown_common[c]
                    else:
                        freq = 0
                    if freq > best:
                        best = freq
                        exp = c
            else:
                exp = maximum_overlap(w, i, text)
        elif key.endswith('s.') and key[:-2] in abbrevs:
            # Forward user_abbrevs so the stem is resolved against the same
            # table it was just found in.
            return expand_EXPN(key[:-2], i, text, user_abbrevs) + 's'
        elif key.endswith('s') and key[:-1] in abbrevs:
            return expand_EXPN(key[:-1], i, text, user_abbrevs) + 's'
        else:
            exp = maximum_overlap(w, i, text)

        if exp == '':
            return nsw
        else:
            return exp
    except LookupError:
        raise
    except Exception:
        # Best-effort: any other failure leaves the token unexpanded.
        # KeyboardInterrupt and SystemExit are not Exception subclasses and
        # still propagate.
        return nsw