def test_file_read(self):
    """Tokenize and classify every line of the ``data/data.in`` fixture.

    Lines that raise ``rolodexer.RolodexerError`` are recorded by their
    zero-based line index; every other line contributes its classified
    terms and bumps the histogram bucket for its color (``'CLEAR'`` when
    the entry has no color). The collected result is printed as JSON
    together with the histogram, then the histogram is checked.
    """
    from os.path import join, dirname
    from rolodexer.histogram import Histogram

    entries = []
    errors = []
    colors = Histogram()
    inpth = join(dirname(dirname(__file__)), 'data', 'data.in')

    with open(inpth, 'rb') as fh:
        # enumerate() supplies the line index that lands in `errors`
        for idx, linen in enumerate(fh):
            tokens = rolodexer.tokenize(linen.strip())
            try:
                terms = rolodexer.classify(tokens)
            except rolodexer.RolodexerError:
                errors.append(idx)
            else:
                entries.append(terms)
                colors.inc(terms.get('color', 'CLEAR'))

    output_dict = {u"entries": entries, u"errors": errors}
    output_json = json.dumps(output_dict, indent=2, sort_keys=True)
    print(output_json)
    print(colors)

    # all classified lines have colors, so the 'CLEAR' bucket stays empty.
    # assertEqual replaces the deprecated assertEquals alias.
    self.assertEqual(colors.min(), 3)
    self.assertEqual(colors.max(), 10)
    self.assertEqual(colors.val('CLEAR'), 0)
def cli(argv=None):
    """Command-line entry point: classify INFILE and emit the result as JSON.

    Arguments are parsed by docopt from the module ``__doc__``. The values
    read are ``INFILE`` (input path), ``--output`` (the literal ``stdout``
    or a destination file path) and ``--verbose``.

    Args:
        argv: full argument vector, program name first. Falls back to
            ``sys.argv`` when empty or None.

    Every input line is tokenized and classified. Lines raising
    ``rolodexer.RolodexerError`` are recorded by zero-based line index
    under ``errors``; the rest are collected under ``entries``.

    An existing output file is never overwritten. A skipped write is
    reported on stderr rather than silently dropped.
    """
    if not argv:
        argv = sys.argv

    arguments = docopt(__doc__, argv=argv[1:],
                       help=True,
                       version='0.1.3')

    entries = []
    errors = []
    colors = Histogram()
    ipth = arguments.get('INFILE')
    opth = arguments.get('--output')
    verbose = bool(arguments.get('--verbose'))

    with open(ipth, 'rb') as fh:
        # enumerate() supplies the line index that lands in `errors`
        for idx, linen in enumerate(fh):
            tokens = rolodexer.tokenize(linen.strip())
            try:
                terms = rolodexer.classify(tokens)
            except rolodexer.RolodexerError:
                errors.append(idx)
            else:
                entries.append(terms)
                if 'color' in terms:
                    colors.inc(terms.get('color'))

    output_dict = {u"entries": entries, u"errors": errors}

    if verbose:
        print("Entries parsed: %s" % len(entries), file=sys.stderr)
        print("Errors encountered: %s" % len(errors), file=sys.stderr)
        print_colors(colors)

    if opth == 'stdout':
        output_json = json.dumps(output_dict, **JSON_ARGS)
        print(output_json, file=sys.stdout)
        return

    # dirname() of a bare filename is '', which isdir() rejects; treat it
    # as the current directory so `--output out.json` actually writes.
    outdir = dirname(opth) or '.'

    if exists(opth):
        print("rolodexer: output file %s already exists, not overwriting"
              % opth, file=sys.stderr)
    elif not isdir(outdir):
        print("rolodexer: output directory %s does not exist"
              % outdir, file=sys.stderr)
    else:
        if verbose:
            print("rolodexer: saving output to %s" % opth, file=sys.stderr)
        # NOTE(review): binary mode suits Python 2's json.dump; Python 3's
        # json.dump writes str and would need a text-mode file here.
        with open(opth, 'wb') as fp:
            json.dump(output_dict, fp, **JSON_ARGS)
def classify(orig_terms):
    """Sort the terms of one input line into named fields.

    Each term is tested with ``is_phone``, ``is_color`` and ``is_zip``.
    Matching terms are stored under ``phonenumber``, ``color`` and
    ``zipcode``; whatever is left over is interpreted as the name.

    Args:
        orig_terms: sequence of string terms for one line. It is copied,
            never modified.

    Returns:
        dict with unicode keys ``phonenumber`` and ``zipcode``, plus
        ``color`` when a color term is present, and ``firstname`` and
        ``lastname`` when one or two terms remain for the name.

    Raises:
        RDAmbiguousTerms: a single term passes more than one test.
        RDPhoneNumberError: no term passes ``is_phone``.
        RDZipCodeError: no term passes ``is_zip``.
        RDAmbiguousNames: no name term is left, or the only one left
            holds fewer than two whitespace-separated words.
    """
    out = dict()
    terms = list(orig_terms)

    # first, sanity-check the terms -- if any one of them can pass for
    # more than one of zip / phone / color, the input is ambiguous: bail
    for term in terms:
        h = Histogram()
        if is_zip(term):
            h.inc('zip')
        if is_phone(term):
            h.inc('phone')
        if is_color(term):
            h.inc('color')
        if len(h) > 1:
            # NOTE(review): iterkeys() is kept as-is because Histogram is
            # defined elsewhere -- confirm it exists on the target Python
            raise RDAmbiguousTerms("Term '%s' parsed ambiguously\n"
                                   "Passed multiple tests: %s" % (
                                       term, SEP_WS.join(h.iterkeys())))

    # next, pull out phone number, color and zip code; iterate over a
    # snapshot because matched terms are removed from `terms` as we go
    for term in list(terms):
        if is_phone(term):
            out[u'phonenumber'] = u"%s" % phone_format(term)
        elif is_color(term):
            out[u'color'] = u"%s" % term
        elif is_zip(term):
            out[u'zipcode'] = u"%s" % term
        else:
            continue
        terms.remove(term)

    # `in` instead of dict.has_key(), which Python 3 no longer provides
    if u'phonenumber' not in out:
        # ERROR: NO PHONE / BAD PHONE!
        raise RDPhoneNumberError("No valid phone number in %d-term list\n"
                                 "Reconstructed original line:\n"
                                 "\t%s" % (len(terms),
                                           reconstruct(orig_terms)))

    if u'zipcode' not in out:
        # ERROR: NO ZIPCODE / BAD ZIPCODE!
        raise RDZipCodeError("No valid zip code in %d-term list\n"
                             "Reconstructed original line:\n"
                             "\t%s" % (len(terms),
                                       reconstruct(orig_terms)))

    # a missing color is tolerated: the entry simply has no `color` key

    # what is left "should" be the pieces of the name, e.g.
    # ['Washington', 'Booker T.'], ['James Murphy'], &c
    if len(terms) > 2:
        # NOTE(review): more than two leftover terms are ignored, so the
        # entry is returned without firstname/lastname -- confirm intent
        pass
    elif len(terms) == 2:
        # two terms are read as [last, first]
        out.update({
            u'firstname': u"%s" % terms[-1],
            u'lastname': u"%s" % terms[0]
        })
    elif len(terms) == 1:
        names = terms[0].split()
        if len(names) > 1:
            # one term is read as "first ... last"; middle words are dropped
            out.update({
                u'firstname': u"%s" % names[0],
                u'lastname': u"%s" % names[-1]
            })
        else:
            # a whitespace-only term splits to [], so fall back to the raw
            # term rather than failing with IndexError on an empty list
            lone_name = names[0] if names else terms[0]
            raise RDAmbiguousNames("Only one name present: '%s'\n"
                                   "Reconstructed original line:\n"
                                   "\t%s" % (lone_name,
                                             reconstruct(orig_terms)))
    else:
        # no names at all
        raise RDAmbiguousNames("No names present!\n"
                               "Reconstructed original line:\n"
                               "\t%s" % reconstruct(orig_terms))

    return out