Exemplo n.º 1
0
def classify(token_list):
    """Sort the tokens of one input line into a dict of field values.

    Each token is tested against every field type in ``FIELD_DMV``.
    Tokens that pass a field's ``check`` are converted with that field's
    ``value_for_token`` and merged into the result; whatever tokens are
    left over are treated as the pieces of a name.

    ``token_list`` itself is never modified -- all bookkeeping happens on
    a copy.

    Returns:
        dict: the merged results of every ``value_for_token`` call.

    Raises:
        RDAmbiguousTerms: a single token passed the checks of more than
            one field (distinguished by ``json_name``).
        RDAmbiguousNames: no tokens were left over to use as a name.
        Anything raised by a field's ``unfound`` hook, which is called
        when a ``FIELD_DMV`` field is missing from the result, or when
        the single leftover token cannot be split into two names.
    """
    out = dict()
    tokens = copy(token_list)

    # First, sanity-check the tokens: if any one of them can pass for
    # more than one kind of field (that is to say, the input is
    # ambiguous), we bail.
    for token in tokens:
        # One histogram bucket per field whose check passes; more than
        # one nonzero bucket means the token is ambiguous.
        h = Histogram()
        for FieldType in FIELD_DMV:
            if FieldType.check(token):
                h.inc(FieldType.json_name)
        if len(h) > 1:
            raise RDAmbiguousTerms("Token '%s' parsed ambiguously\n"
                                   "Passed multiple field checks: %s" % (
                                       token, SEP_WS.join(h.iterkeys())
                                   ))

    # Update `out` with the classified tokens. Iterate over a snapshot,
    # because classified tokens are removed from `tokens` as we go.
    for token in copy(tokens):
        for FieldType in FIELD_DMV:
            if FieldType.check(token):
                field = FieldType()
                out.update(field.value_for_token(token))
                tokens.remove(token)
                # A token is consumed by the first field that accepts
                # it; checking further fields could remove it twice.
                break

    # Raise appropriate errors when we don't find what we need.
    for FieldType in FIELD_DMV:
        field = FieldType()
        if field.name not in out:
            field.unfound(token_list)  # this may raise

    # What is left "should" be the pieces of the name,
    # e.g. ['Washington', 'Booker T.'], ['James Murphy'], &c
    first_field = FirstNameField()
    last_field = LastNameField()
    if len(tokens) > 2:
        # NOTE(review): more than two leftover tokens are silently
        # ignored and no name fields are set. Kept as-is because callers
        # may depend on this best-effort behavior -- confirm whether
        # this should raise RDAmbiguousNames instead.
        pass
    elif len(tokens) == 2:
        # Two tokens are read as "last, first".
        out.update(first_field.value_for_token(tokens[-1]))
        out.update(last_field.value_for_token(tokens[0]))
    elif len(tokens) == 1:
        # One token is read as "first [middle ...] last".
        names = tokens[0].split()
        if len(names) > 1:
            out.update(first_field.value_for_token(names[0]))
            out.update(last_field.value_for_token(names[-1]))
        else:
            NameField().unfound(token_list)
    else:
        # No leftover tokens at all, so there is nothing to use as a name.
        raise RDAmbiguousNames("No names present!\n"
                               "Reconstructed original line:\n"
                               "\t%s" % reconstruct(token_list))

    return out