def abbrev_tag_univ(i, text):
    """Return POS tag for the abbreviation using universal tagset.

    Args:
        i: Position of the abbreviation. An ``int`` indexes ``text``
            directly; any other value is used as a key into the result of
            ``split`` applied to the token at ``text[int(i)]``.
        text: Indexable sequence of tokens.

    Returns:
        The tag paired with the first candidate yielded by
        ``tag_sent_univ(i, text)`` that equals the abbreviation, or ``None``
        when no candidate matches.
    """
    tagged = tag_sent_univ(i, text)

    # The abbreviation is the same for every candidate, so resolve it once
    # instead of re-running split() on each loop iteration.
    if isinstance(i, int):
        abbrev = text[i]
    else:
        abbrev = split({int(i): (text[int(i)], 'SPLT')}, verbose=False)[i][0]

    for cand, tag in tagged:
        if abbrev == cand:
            return tag
    return None
def gen_context(i, text):
    """Generate context for the abbreviation.

    The context is a window of at most nine tokens (four either side of the
    abbreviation) that never crosses a '.', '!' or '?' token. When the
    abbreviation sits too close to one edge of its sentence, the window is
    shifted towards the other edge; when the sentence has fewer than nine
    tokens, the whole sentence is returned.

    Args:
        i: Position of the abbreviation. If it is not an ``int``, the token
            at ``int(i)`` is replaced (in a copy of ``text``) by the parts
            returned from ``split`` and the window is centred on ``int(i)``.
        text: Sequence of tokens; the caller's object is not modified.

    Returns:
        A new list holding the context tokens.
    """
    terminators = ('.', '!', '?')
    tokens = text[:]  # work on a copy so the caller's sequence is untouched
    centre = i

    if not isinstance(i, int):
        centre = int(i)
        whole = tokens.pop(centre)
        pieces = split({centre: (whole, 'SPLT')}, verbose=False)
        # Splice the parts in ascending key order at the removed position.
        tokens[centre:centre] = [pieces[key][0] for key in sorted(pieces)]

    # Walk left to the first token after the previous sentence terminator.
    left = centre
    while left > 0 and tokens[left - 1] not in terminators:
        left -= 1

    # Walk right to the next sentence terminator (exclusive bound).
    right = centre + 1
    while right < len(tokens) and tokens[right] not in terminators:
        right += 1

    sentence_is_long = right - left >= 9
    if centre - left < 4:
        # Too close to the sentence start: anchor the window on the left.
        window = tokens[left:left + 9] if sentence_is_long else tokens[left:right]
    elif right - centre < 5:
        # Too close to the sentence end: anchor the window on the right.
        window = tokens[right - 9:right] if sentence_is_long else tokens[left:right]
    else:
        window = tokens[centre - 4:centre + 5]
    return list(window)
def tag_matches(i, text):
    """Return candidate expansions whose POS tag matches the POS tag of the abbreviation.

    Candidates from ``tag_cands`` are compared against ``abbrev_tag`` first.
    If none match, candidates from ``tag_cands_univ`` are compared against
    ``abbrev_tag_univ``. If there is still no match and the universal lookup
    produced exactly one candidate whose tag tuple is empty, that candidate
    is returned on its own.

    Args:
        i: Position of the abbreviation. An ``int`` indexes ``text``
            directly; any other value is used as a key into the result of
            ``split`` applied to the token at ``text[int(i)]``.
        text: Indexable sequence of tokens.

    Returns:
        A list of at most ten matching candidates (possibly empty).
    """
    if isinstance(i, int):
        abbrev = text[i]
    else:
        abbrev = split({int(i): (text[int(i)], 'SPLT')}, verbose=False)[i][0]

    true_tag = abbrev_tag(i, text)
    matches = [cand for cand, tags in tag_cands(abbrev) if true_tag in tags]

    if not matches:
        true_tag_univ = abbrev_tag_univ(i, text)
        # Look the universal candidates up once and reuse the result, rather
        # than calling tag_cands_univ() again for every check below.
        univ_cands = tag_cands_univ(abbrev)
        matches = [cand for cand, tags in univ_cands if true_tag_univ in tags]
        if not matches and len(univ_cands) == 1 and univ_cands[0][1] == ():
            return [univ_cands[0][0]]

    # Slicing caps the result at ten entries and is a no-op for shorter lists.
    return matches[:10]
if __name__ == "__main__": # Store all NUMB tags from training data in NUMB_list, including SPLT-NUMB. tagged = tagify(NSWs, verbose=False) NUMB_dict = { ind: (nsw, tag) for ind, (nsw, tag) in tagged.items() if tag == 'NUMB' } SPLT_dict = { ind: (nsw, tag) for ind, (nsw, tag) in tagged.items() if tag == 'SPLT' } splitted = split(SPLT_dict, verbose=False) retagged = retagify(splitted, verbose=False) retagged_NUMB_dict = { ind: (nsw, tag) for ind, (nsw, tag) in retagged.items() if tag == 'SPLT-NUMB' } NUMB_dict.update(retagged_NUMB_dict) curr_list = ['£', '$', '€', 'Y'] ampm = ['am', 'pm', 'AM', 'PM', 'a.m.', 'p.m.', 'A.M.', 'P.M.', 'pm.', 'am.'] months = [ "January", "Jan", "Jan.", "February", "Feb", "Feb.", "March", "Mar", "Mar.", "April", "Apr", "Apr.", "May", "June", "Jun", "Jun.", "July", "Jul", "Jul.", "August", "Aug", "Aug.", "September", "Sept", "Sept.", "October", "Oct", "Oct.", "November", "Nov", "Nov.", "December", "Dec", "Dec."
def list_NSWs(text, verbose=True, variety='BrE', user_abbrevs=None):
    """Find, tag, classify and expand the NSWs in ``text``.

    Stages, in order: ``create_NSW_dict`` -> ``tagify`` -> ``split`` and
    ``retagify`` for entries tagged 'SPLT' -> ``run_clfALPHA`` /
    ``run_clfNUMB`` / ``tag_MISC`` -> ``expand_all`` for each class.

    Args:
        text: Input passed to ``create_NSW_dict``, the classifiers and
            ``expand_all``.
        verbose: When true, print a banner before each stage. Also forwarded
            to every helper as ``verbose``.
        variety: Forwarded to ``expand_all``.
        user_abbrevs: Forwarded to ``run_clfALPHA`` and ``expand_all``.
            ``None`` (the default) means a fresh empty dict for this call.

    Returns:
        A 3-tuple ``(expanded_ALPHA, expanded_NUMB, expanded_MISC)`` holding
        the ``expand_all`` results for each class.
    """
    # A `{}` default would be one dict shared by every call (and by every
    # helper it is forwarded to); build a new one per call instead.
    if user_abbrevs is None:
        user_abbrevs = {}

    def announce(*lines):
        """Print each banner line, but only in verbose mode."""
        if verbose:
            for line in lines:
                print(line)

    announce("\nCREATING NSW DICTIONARY", "-----------------------\n")
    NSWs = create_NSW_dict(text, verbose=verbose)

    announce("{} NSWs found\n".format(len(NSWs)),
             "TAGGING NSWs",
             "------------\n")
    tagged = tagify(NSWs, verbose=verbose)

    ALPHA_dict = {}
    NUMB_dict = {}
    MISC_dict = {}
    SPLT_dict = {}

    # Route each tagged entry to its bucket; other tags are ignored.
    first_pass = {
        'ALPHA': ALPHA_dict,
        'NUMB': NUMB_dict,
        'MISC': MISC_dict,
        'SPLT': SPLT_dict,
    }
    for ind, entry in tagged.items():
        bucket = first_pass.get(entry[1])
        if bucket is not None:
            bucket[ind] = entry

    announce("SPLITTING NSWs", "--------------\n")
    splitted = split(SPLT_dict, verbose=verbose)

    announce("RETAGGING SPLIT NSWs", "--------------------\n")
    retagged = retagify(splitted, verbose=verbose)

    # Retagged split parts join the same buckets as their unsplit peers.
    second_pass = {
        'SPLT-ALPHA': ALPHA_dict,
        'SPLT-NUMB': NUMB_dict,
        'SPLT-MISC': MISC_dict,
    }
    for ind, entry in retagged.items():
        bucket = second_pass.get(entry[1])
        if bucket is not None:
            bucket[ind] = entry

    announce("CLASSIFYING ALPHABETIC NSWs", "---------------------------\n")
    tagged_ALPHA = run_clfALPHA(ALPHA_dict, text, verbose=verbose,
                                user_abbrevs=user_abbrevs)

    announce("CLASSIFYING NUMERIC NSWs", "------------------------\n")
    tagged_NUMB = run_clfNUMB(NUMB_dict, text, verbose=verbose)

    announce("CLASSIFYING MISCELLANEOUS NSWs", "------------------------------\n")
    tagged_MISC = tag_MISC(MISC_dict, verbose=verbose)

    announce("EXPANDING ALPHABETIC NSWs", "-------------------------\n")
    expanded_ALPHA = expand_all(tagged_ALPHA, text, verbose=verbose,
                                variety=variety, user_abbrevs=user_abbrevs)

    announce("EXPANDING NUMERIC NSWs", "----------------------\n")
    expanded_NUMB = expand_all(tagged_NUMB, text, verbose=verbose,
                               variety=variety, user_abbrevs=user_abbrevs)

    announce("EXPANDING MISCELLANEOUS NSWs", "----------------------------\n")
    expanded_MISC = expand_all(tagged_MISC, text, verbose=verbose,
                               variety=variety, user_abbrevs=user_abbrevs)

    return expanded_ALPHA, expanded_NUMB, expanded_MISC