Пример #1
0
def abbrev_tag_univ(i, text):
    """Return POS tag for the abbreviation using universal tagset.

    Args:
        i: Position of the abbreviation. An ``int`` indexes ``text``
            directly; any other value is used as a key into the result of
            ``split`` applied to the token at ``text[int(i)]``.
        text: Sequence of tokens containing the abbreviation.

    Returns:
        The tag paired with the first token yielded by
        ``tag_sent_univ(i, text)`` that equals the abbreviation, or ``None``
        when no tagged token matches.
    """
    # Resolve the abbreviation once; it does not change between iterations,
    # so there is no need to re-run split() for every tagged token.
    if isinstance(i, int):
        abbrev = text[i]
    else:
        abbrev = split({int(i): (text[int(i)], 'SPLT')}, verbose=False)[i][0]
    for cand, tag in tag_sent_univ(i, text):
        if abbrev == cand:
            return tag
    return None
Пример #2
0
def gen_context(i, text):
    """Build a context window of up to nine tokens around the abbreviation.

    The window is confined to the abbreviation's sentence, delimited by the
    tokens '.', '!' and '?'. Ideally it holds four tokens on each side of
    the abbreviation; near a sentence edge it slides so that it still holds
    nine tokens when the sentence is long enough, otherwise the whole
    sentence is returned.

    Args:
        i: Position of the abbreviation. An ``int`` indexes ``text``
            directly; otherwise the token at ``int(i)`` is replaced by the
            pieces returned from ``split`` before the window is computed.
        text: List of tokens. It is copied, never modified.

    Returns:
        A new list with the context tokens.
    """
    terminators = ('.', '!', '?')
    tokens = text[:]

    if isinstance(i, int):
        pos = i
    else:
        pos = int(i)
        whole = tokens.pop(pos)
        pieces = split({pos: (whole, 'SPLT')}, verbose=False)
        # Inserting in descending key order at the same index leaves the
        # pieces in ascending key order.
        for key in sorted(pieces, reverse=True):
            tokens.insert(pos, pieces[key][0])

    # Walk left to the first token of the sentence.
    first = pos
    while first > 0 and tokens[first - 1] not in terminators:
        first -= 1

    # Walk right to the sentence terminator (or the end of the token list).
    last = pos + 1
    while last <= len(tokens) - 1 and tokens[last] not in terminators:
        last += 1

    if pos - first < 4:
        # Too close to the sentence start: anchor the window there.
        window = tokens[first:min(last, first + 9)]
    elif last - pos < 5:
        # Too close to the sentence end: anchor the window there.
        window = tokens[max(first, last - 9):last]
    else:
        window = tokens[pos - 4:pos + 5]
    return list(window)
Пример #3
0
def tag_matches(i, text):
    """Return candidate expansions whose POS tag matches the POS tag of the
       abbreviation.

    Candidates from ``tag_cands`` are tried first, against the tag from
    ``abbrev_tag``. If none match, candidates from ``tag_cands_univ`` are
    tried against the tag from ``abbrev_tag_univ``. If there is still no
    match and the universal lookup produced exactly one candidate with an
    empty tag tuple, that candidate is returned on its own.

    Args:
        i: Position of the abbreviation. An ``int`` indexes ``text``
            directly; any other value is used as a key into the result of
            ``split`` applied to the token at ``text[int(i)]``.
        text: Sequence of tokens containing the abbreviation.

    Returns:
        A list of at most ten matching candidates (possibly empty).
    """
    if isinstance(i, int):
        abbrev = text[i]
    else:
        abbrev = split({int(i): (text[int(i)], 'SPLT')}, verbose=False)[i][0]

    true_tag = abbrev_tag(i, text)
    matches = [cand for cand, tags in tag_cands(abbrev) if true_tag in tags]

    if not matches:
        true_tag_univ = abbrev_tag_univ(i, text)
        # Look the universal candidates up once and reuse the result for
        # both the tag comparison and the single-candidate fallback.
        univ_cands = tag_cands_univ(abbrev)
        matches = [cand for cand, tags in univ_cands if true_tag_univ in tags]
        if not matches and len(univ_cands) == 1:
            only_cand, only_tags = univ_cands[0]
            # A lone candidate with no tags cannot be ruled out; accept it.
            if only_tags == tuple():
                return [only_cand]

    # Slicing caps the result at ten and is a no-op for shorter lists.
    return matches[:10]
Пример #4
0
if __name__ == "__main__":
    # Gather every NSW tagged NUMB into NUMB_dict, then add the pieces of
    # SPLT tokens that are retagged SPLT-NUMB after splitting.
    tagged = tagify(NSWs, verbose=False)

    NUMB_dict = dict(
        (key, (token, label))
        for key, (token, label) in tagged.items()
        if label == 'NUMB'
    )

    SPLT_dict = dict(
        (key, (token, label))
        for key, (token, label) in tagged.items()
        if label == 'SPLT'
    )

    splitted = split(SPLT_dict, verbose=False)
    retagged = retagify(splitted, verbose=False)
    retagged_NUMB_dict = dict(
        (key, (token, label))
        for key, (token, label) in retagged.items()
        if label == 'SPLT-NUMB'
    )
    NUMB_dict.update(retagged_NUMB_dict)

# Currency symbols ('Y' presumably stands in for the yen sign - TODO confirm).
curr_list = ['£', '$', '€', 'Y']
# Spellings of the a.m./p.m. time-of-day markers, in lower and upper case,
# with and without periods.
ampm = ['am', 'pm', 'AM', 'PM', 'a.m.', 'p.m.', 'A.M.', 'P.M.', 'pm.', 'am.']
months = [
    "January", "Jan", "Jan.", "February", "Feb", "Feb.", "March", "Mar",
    "Mar.", "April", "Apr", "Apr.", "May", "June", "Jun", "Jun.", "July",
    "Jul", "Jul.", "August", "Aug", "Aug.", "September", "Sept", "Sept.",
    "October", "Oct", "Oct.", "November", "Nov", "Nov.", "December", "Dec",
    "Dec."
Пример #5
0
def list_NSWs(text, verbose=True, variety='BrE', user_abbrevs=None):
    """Run the full NSW pipeline on *text* and return the expansions by class.

    The stages are: build the NSW dictionary, tag each NSW, split the SPLT
    items and retag the pieces, classify the ALPHA / NUMB / MISC groups, and
    finally expand each group.

    Args:
        text: Input passed to ``create_NSW_dict`` and to the classification
            and expansion stages.
        verbose: When true, print a banner before each stage; also forwarded
            to every stage.
        variety: Forwarded to ``expand_all`` (default ``'BrE'``).
        user_abbrevs: Optional dict forwarded to ``run_clfALPHA`` and
            ``expand_all``. Defaults to a new empty dict for each call.

    Returns:
        A tuple ``(expanded_ALPHA, expanded_NUMB, expanded_MISC)`` holding the
        ``expand_all`` result for each group.
    """
    # Use a fresh dict per call: a shared ``{}`` default would carry any
    # mutation made by a downstream stage over into later calls.
    if user_abbrevs is None:
        user_abbrevs = {}

    if verbose:
        print("\nCREATING NSW DICTIONARY")
        print("-----------------------\n")

    NSWs = create_NSW_dict(text, verbose=verbose)
    if verbose:
        print("{} NSWs found\n".format(len(NSWs)))
        print("TAGGING NSWs")
        print("------------\n")
    tagged = tagify(NSWs, verbose=verbose)

    # Bucket the tagged NSWs by their tag (second element of each value).
    ALPHA_dict = {}
    NUMB_dict = {}
    MISC_dict = {}
    SPLT_dict = {}
    for ind, entry in tagged.items():
        tag = entry[1]
        if tag == 'ALPHA':
            ALPHA_dict[ind] = entry
        elif tag == 'NUMB':
            NUMB_dict[ind] = entry
        elif tag == 'MISC':
            MISC_dict[ind] = entry
        elif tag == 'SPLT':
            SPLT_dict[ind] = entry

    if verbose:
        print("SPLITTING NSWs")
        print("--------------\n")
    splitted = split(SPLT_dict, verbose=verbose)
    if verbose:
        print("RETAGGING SPLIT NSWs")
        print("--------------------\n")
    retagged = retagify(splitted, verbose=verbose)

    # Fold the retagged pieces of split tokens back into the main buckets.
    for ind, entry in retagged.items():
        tag = entry[1]
        if tag == 'SPLT-ALPHA':
            ALPHA_dict[ind] = entry
        elif tag == 'SPLT-NUMB':
            NUMB_dict[ind] = entry
        elif tag == 'SPLT-MISC':
            MISC_dict[ind] = entry

    if verbose:
        print("CLASSIFYING ALPHABETIC NSWs")
        print("---------------------------\n")
    tagged_ALPHA = run_clfALPHA(ALPHA_dict,
                                text,
                                verbose=verbose,
                                user_abbrevs=user_abbrevs)
    if verbose:
        print("CLASSIFYING NUMERIC NSWs")
        print("------------------------\n")
    tagged_NUMB = run_clfNUMB(NUMB_dict, text, verbose=verbose)
    if verbose:
        print("CLASSIFYING MISCELLANEOUS NSWs")
        print("------------------------------\n")
    tagged_MISC = tag_MISC(MISC_dict, verbose=verbose)

    if verbose:
        print("EXPANDING ALPHABETIC NSWs")
        print("-------------------------\n")
    expanded_ALPHA = expand_all(tagged_ALPHA,
                                text,
                                verbose=verbose,
                                variety=variety,
                                user_abbrevs=user_abbrevs)
    if verbose:
        print("EXPANDING NUMERIC NSWs")
        print("----------------------\n")
    expanded_NUMB = expand_all(tagged_NUMB,
                               text,
                               verbose=verbose,
                               variety=variety,
                               user_abbrevs=user_abbrevs)
    if verbose:
        print("EXPANDING MISCELLANEOUS NSWs")
        print("----------------------------\n")
    expanded_MISC = expand_all(tagged_MISC,
                               text,
                               verbose=verbose,
                               variety=variety,
                               user_abbrevs=user_abbrevs)
    return expanded_ALPHA, expanded_NUMB, expanded_MISC