def test_paren(self):
    assert tokenize("222 W Merchandise Mart Plaza (1871) Chicago IL 60654") == [
        "222", "W", "Merchandise", "Mart", "Plaza", "(1871)",
        "Chicago", "IL", "60654",
    ]
    assert tokenize("222 W Merchandise Mart Plaza (1871), Chicago IL 60654") == [
        "222", "W", "Merchandise", "Mart", "Plaza", "(1871),",
        "Chicago", "IL", "60654",
    ]
    assert tokenize("222 W Merchandise Mart Plaza(1871) Chicago IL 60654") == [
        "222", "W", "Merchandise", "Mart", "Plaza", "(1871)",
        "Chicago", "IL", "60654",
    ]
def test_split_on_punc(self):
    self.assertEqual(tokenize('1 abc st,suite 1'), ['1', 'abc', 'st,', 'suite', '1'])
    self.assertEqual(tokenize('1 abc st;suite 1'), ['1', 'abc', 'st;', 'suite', '1'])
    self.assertEqual(
        tokenize('1-5 abc road'),
        ['1-5', 'abc', 'road'],
    )
def test_capture_punc(self):
    assert tokenize('222 W. Merchandise Mart Plaza') == [
        '222', 'W.', 'Merchandise', 'Mart', 'Plaza'
    ]
    assert tokenize('222 W Merchandise Mart Plaza, Chicago, IL') == [
        '222', 'W', 'Merchandise', 'Mart', 'Plaza,', 'Chicago,', 'IL'
    ]
    assert tokenize('123 Monroe- St') == ['123', 'Monroe-', 'St']
def test_split_on_punc(self):
    assert tokenize('1 abc st,suite 1') == [
        '1', 'abc', 'st,', 'suite', '1'
    ]
    assert tokenize('1 abc st;suite 1') == [
        '1', 'abc', 'st;', 'suite', '1'
    ]
    assert tokenize('1-5 abc road') == ['1-5', 'abc', 'road']
def test_spaces(self):
    self.assertEqual(tokenize('1 abc st'), ['1', 'abc', 'st'])
    self.assertEqual(
        tokenize('1  abc st'),
        ['1', 'abc', 'st'],
    )
    self.assertEqual(tokenize('1 abc st '), ['1', 'abc', 'st'])
    self.assertEqual(
        tokenize(' 1 abc st'),
        ['1', 'abc', 'st'],
    )
def test_capture_punc(self):
    assert tokenize("222 W. Merchandise Mart Plaza") == [
        "222", "W.", "Merchandise", "Mart", "Plaza"
    ]
    assert tokenize("222 W Merchandise Mart Plaza, Chicago, IL") == [
        "222", "W", "Merchandise", "Mart", "Plaza,", "Chicago,", "IL",
    ]
    assert tokenize("123 Monroe- St") == ["123", "Monroe-", "St"]
def test_capture_punc(self):
    self.assertEqual(
        tokenize('222 W. Merchandise Mart Plaza'),
        ['222', 'W.', 'Merchandise', 'Mart', 'Plaza'],
    )
    self.assertEqual(
        tokenize('222 W Merchandise Mart Plaza, Chicago, IL'),
        ['222', 'W', 'Merchandise', 'Mart', 'Plaza,', 'Chicago,', 'IL'],
    )
    self.assertEqual(
        tokenize('123 Monroe- St'),
        ['123', 'Monroe-', 'St']
    )
def test_split_on_punc(self):
    self.assertEqual(
        tokenize('1 abc st,suite 1'),
        ['1', 'abc', 'st,', 'suite', '1']
    )
    self.assertEqual(
        tokenize('1 abc st;suite 1'),
        ['1', 'abc', 'st;', 'suite', '1']
    )
    self.assertEqual(
        tokenize('1-5 abc road'),
        ['1-5', 'abc', 'road'],
    )
def test_paren(self):
    assert tokenize(
        '222 W Merchandise Mart Plaza (1871) Chicago IL 60654') == [
        '222', 'W', 'Merchandise', 'Mart', 'Plaza', '(1871)',
        'Chicago', 'IL', '60654'
    ]
    assert tokenize(
        '222 W Merchandise Mart Plaza (1871), Chicago IL 60654') == [
        '222', 'W', 'Merchandise', 'Mart', 'Plaza', '(1871),',
        'Chicago', 'IL', '60654'
    ]
    assert tokenize(
        '222 W Merchandise Mart Plaza(1871) Chicago IL 60654') == [
        '222', 'W', 'Merchandise', 'Mart', 'Plaza', '(1871)',
        'Chicago', 'IL', '60654'
    ]
def test_paren(self):
    self.assertEqual(
        tokenize('222 W Merchandise Mart Plaza (1871) Chicago IL 60654'),
        ['222', 'W', 'Merchandise', 'Mart', 'Plaza', '(1871)',
         'Chicago', 'IL', '60654'],
    )
    self.assertEqual(
        tokenize('222 W Merchandise Mart Plaza (1871), Chicago IL 60654'),
        ['222', 'W', 'Merchandise', 'Mart', 'Plaza', '(1871),',
         'Chicago', 'IL', '60654']
    )
    self.assertEqual(
        tokenize('222 W Merchandise Mart Plaza(1871) Chicago IL 60654'),
        ['222', 'W', 'Merchandise', 'Mart', 'Plaza', '(1871)',
         'Chicago', 'IL', '60654']
    )
def test_spaces(self):
    self.assertEqual(
        tokenize('1 abc st'),
        ['1', 'abc', 'st']
    )
    self.assertEqual(
        tokenize('1  abc st'),
        ['1', 'abc', 'st'],
    )
    self.assertEqual(
        tokenize('1 abc st '),
        ['1', 'abc', 'st']
    )
    self.assertEqual(
        tokenize(' 1 abc st'),
        ['1', 'abc', 'st'],
    )
def fit(self, X, y, **params):
    # sklearn requires parameters to be declared as fields of the
    # estimator, and a field name can't contain a full stop, so the dots
    # in the crfsuite parameter names were replaced with underscores;
    # convert them back before handing the parameters to pycrfsuite.
    params = {k.replace('_', '.'): v for k, v in self.__dict__.items()}
    trainer = pycrfsuite.Trainer(verbose=False, params=params)
    for address, labels in zip(X, y):
        tokens = usaddress.tokenize(address)
        trainer.append(usaddress.addr2features(tokens), labels)
    trainer.train(self.model_path)
    reload(usaddress)
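A quick, self-contained illustration of the field-name mapping in the comprehension above; the parameter names here are examples, not fields from the source ('feature_minfreq' is chosen because crfsuite spells that parameter with a dot):

fields = {'c1': 0.1, 'feature_minfreq': 3}
params = {k.replace('_', '.'): v for k, v in fields.items()}
assert params == {'c1': 0.1, 'feature.minfreq': 3}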
def test_paren(self):
    self.assertEqual(
        tokenize('222 W Merchandise Mart Plaza (1871) Chicago IL 60654'),
        [
            '222', 'W', 'Merchandise', 'Mart', 'Plaza', '(1871)',
            'Chicago', 'IL', '60654'
        ],
    )
    self.assertEqual(
        tokenize('222 W Merchandise Mart Plaza (1871), Chicago IL 60654'),
        [
            '222', 'W', 'Merchandise', 'Mart', 'Plaza', '(1871),',
            'Chicago', 'IL', '60654'
        ])
    self.assertEqual(
        tokenize('222 W Merchandise Mart Plaza(1871) Chicago IL 60654'),
        [
            '222', 'W', 'Merchandise', 'Mart', 'Plaza', '(1871)',
            'Chicago', 'IL', '60654'
        ])
def test_nums(self):
    assert tokenize("222 W Merchandise Mart Plaza Chicago IL 60654") == [
        "222", "W", "Merchandise", "Mart", "Plaza",
        "Chicago", "IL", "60654",
    ]
def test_hash(self):
    self.assertEqual(
        tokenize('# 1 abc st'),
        ['#', '1', 'abc', 'st']
    )
    self.assertEqual(
        tokenize('#1 abc st'),
        ['#', '1', 'abc', 'st']
    )
    self.assertEqual(
        tokenize('box # 1 abc st'),
        ['box', '#', '1', 'abc', 'st']
    )
    self.assertEqual(
        tokenize('box #1 abc st'),
        ['box', '#', '1', 'abc', 'st']
    )
    self.assertEqual(
        tokenize('box# 1 abc st'),
        ['box', '#', '1', 'abc', 'st'],
    )
    self.assertEqual(
        tokenize('box#1 abc st'),
        ['box', '#', '1', 'abc', 'st']
    )
def test_hash(self):
    assert tokenize('# 1 abc st') == ['#', '1', 'abc', 'st']
    assert tokenize('#1 abc st') == ['#', '1', 'abc', 'st']
    assert tokenize('box # 1 abc st') == ['box', '#', '1', 'abc', 'st']
    assert tokenize('box #1 abc st') == ['box', '#', '1', 'abc', 'st']
    assert tokenize('box# 1 abc st') == ['box', '#', '1', 'abc', 'st']
    assert tokenize('box#1 abc st') == ['box', '#', '1', 'abc', 'st']
def test_hash(self):
    assert tokenize("# 1 abc st") == ["#", "1", "abc", "st"]
    assert tokenize("#1 abc st") == ["#", "1", "abc", "st"]
    assert tokenize("box # 1 abc st") == ["box", "#", "1", "abc", "st"]
    assert tokenize("box #1 abc st") == ["box", "#", "1", "abc", "st"]
    assert tokenize("box# 1 abc st") == ["box", "#", "1", "abc", "st"]
    assert tokenize("box#1 abc st") == ["box", "#", "1", "abc", "st"]
def fit(self, X, y, **params):
    # sklearn requires parameters to be declared as fields of the
    # estimator, and we can't have a full stop there; the dots in the
    # crfsuite parameter names were replaced with underscores, so
    # convert them back here.
    params = {k.replace('_', '.'): v for k, v in self.__dict__.items()}
    trainer = pycrfsuite.Trainer(verbose=False, params=params)
    for address, labels in zip(X, y):
        tokens = usaddress.tokenize(address)
        if len(tokens) != len(labels):
            # Sometimes there are more/fewer gold-standard labels than
            # the number of tags the system will output. This is because
            # our tokenizer works differently to how the data is
            # tokenized. Let's pretend this never happened.
            continue
        trainer.append(usaddress.addr2features(tokens), labels)
    trainer.train(self.model_path)
    reload(usaddress)
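The estimator class around this fit method isn't shown; assuming it follows sklearn conventions and stores model_path plus the crfsuite training parameters as fields, a call might look like the sketch below. AddressTagger is a hypothetical stand-in for the class, and the labels follow usaddress's tag names.

# Hypothetical usage; AddressTagger is not a name from the source.
tagger = AddressTagger(model_path='address.crfsuite')
X = ['222 W Merchandise Mart Plaza Chicago IL 60654']
y = [['AddressNumber', 'StreetNamePreDirectional', 'StreetName',
      'StreetName', 'StreetNamePostType', 'PlaceName', 'StateName',
      'ZipCode']]  # one label per token from usaddress.tokenize
tagger.fit(X, y)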
def list2xml(addr_list, outfile):
    xml_addr_list = etree.Element('AddressCollection')
    for addr in addr_list:
        xml_addr = etree.Element('AddressString')
        # handle commas?
        for component in addr:
            if component[1]:
                for token in tokenize(component[1]):
                    token_xml = etree.Element(component[0])
                    token_xml.text = token
                    token_xml.tail = ' '
                    xml_addr.append(token_xml)
        xml_addr[-1].tail = None
        xml_addr_list.append(xml_addr)
    output = etree.tostring(xml_addr_list, pretty_print=True)
    # lxml's tostring returns bytes, so open the file in binary mode
    with open(outfile, 'wb') as f:
        f.write(output)
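The loop above implies addr_list is a list of addresses, each a sequence of (label, text) pairs. A minimal usage sketch with made-up data, assuming tokenize and lxml's etree are imported as in the original module:

addr_list = [
    [('AddressNumber', '222'),
     ('StreetNamePreDirectional', 'W'),
     ('StreetName', 'Merchandise Mart'),
     ('StreetNamePostType', 'Plaza')],
]
list2xml(addr_list, 'labeled_addresses.xml')
# Each token becomes its own element, so 'Merchandise Mart' yields two
# <StreetName> elements inside one <AddressString>.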
def test_hash(self):
    self.assertEqual(tokenize('# 1 abc st'), ['#', '1', 'abc', 'st'])
    self.assertEqual(tokenize('#1 abc st'), ['#', '1', 'abc', 'st'])
    self.assertEqual(tokenize('box # 1 abc st'), ['box', '#', '1', 'abc', 'st'])
    self.assertEqual(tokenize('box #1 abc st'), ['box', '#', '1', 'abc', 'st'])
    self.assertEqual(
        tokenize('box# 1 abc st'),
        ['box', '#', '1', 'abc', 'st'],
    )
    self.assertEqual(tokenize('box#1 abc st'), ['box', '#', '1', 'abc', 'st'])
def test_spaces(self):
    assert tokenize("1 abc st") == ["1", "abc", "st"]
    assert tokenize("1  abc st") == ["1", "abc", "st"]
    assert tokenize("1 abc st ") == ["1", "abc", "st"]
    assert tokenize(" 1 abc st") == ["1", "abc", "st"]
def test_ampersand(self):
    assert tokenize("123 & 456") == ["123", "&", "456"]
    assert tokenize("123&456") == ["123", "&", "456"]
    assert tokenize("123& 456") == ["123", "&", "456"]
    assert tokenize("123 &456") == ["123", "&", "456"]
    assert tokenize("123 &#38; 456") == ["123", "&", "456"]
    assert tokenize("123&#38;456") == ["123", "&", "456"]
    assert tokenize("123&#38; 456") == ["123", "&", "456"]
    assert tokenize("123 &#38;456") == ["123", "&", "456"]
    assert tokenize("123 &amp; 456") == ["123", "&", "456"]
    assert tokenize("123&amp;456") == ["123", "&", "456"]
    assert tokenize("123&amp; 456") == ["123", "&", "456"]
    assert tokenize("123 &amp;456") == ["123", "&", "456"]
def test_split_on_punc(self):
    assert tokenize('1 abc st,suite 1') == ['1', 'abc', 'st,', 'suite', '1']
    assert tokenize('1 abc st;suite 1') == ['1', 'abc', 'st;', 'suite', '1']
    assert tokenize('1-5 abc road') == ['1-5', 'abc', 'road']
def test_spaces(self):
    assert tokenize('1 abc st') == ['1', 'abc', 'st']
    assert tokenize('1  abc st') == ['1', 'abc', 'st']
    assert tokenize('1 abc st ') == ['1', 'abc', 'st']
    assert tokenize(' 1 abc st') == ['1', 'abc', 'st']
def test_nums(self):
    self.assertEqual(
        tokenize('222 W Merchandise Mart Plaza Chicago IL 60654'),
        ['222', 'W', 'Merchandise', 'Mart', 'Plaza', 'Chicago', 'IL', '60654']
    )
def test_nums(self):
    assert tokenize('222 W Merchandise Mart Plaza Chicago IL 60654') == [
        '222', 'W', 'Merchandise', 'Mart', 'Plaza', 'Chicago', 'IL', '60654'
    ]
def test_spaces(self):
    assert tokenize('1 abc st') == ['1', 'abc', 'st']
    assert tokenize('1  abc st') == ['1', 'abc', 'st']
    assert tokenize('1 abc st ') == ['1', 'abc', 'st']
    assert tokenize(' 1 abc st') == ['1', 'abc', 'st']
def test_ampersand(self):
    assert tokenize('123 & 456') == ['123', '&', '456']
def test_paren(self):
    assert tokenize('222 W Merchandise Mart Plaza (1871) Chicago IL 60654') == [
        '222', 'W', 'Merchandise', 'Mart', 'Plaza', '(1871)',
        'Chicago', 'IL', '60654'
    ]
    assert tokenize('222 W Merchandise Mart Plaza (1871), Chicago IL 60654') == [
        '222', 'W', 'Merchandise', 'Mart', 'Plaza', '(1871),',
        'Chicago', 'IL', '60654'
    ]
    assert tokenize('222 W Merchandise Mart Plaza(1871) Chicago IL 60654') == [
        '222', 'W', 'Merchandise', 'Mart', 'Plaza', '(1871)',
        'Chicago', 'IL', '60654'
    ]
def test_ampersand(self):
    self.assertEqual(tokenize('123 & 456'), ['123', '&', '456'])
    self.assertEqual(tokenize('123&456'), ['123', '&', '456'])
    self.assertEqual(tokenize('123& 456'), ['123', '&', '456'])
    self.assertEqual(tokenize('123 &456'), ['123', '&', '456'])
    self.assertEqual(tokenize('123 &#38; 456'), ['123', '&', '456'])
    self.assertEqual(tokenize('123&#38;456'), ['123', '&', '456'])
    self.assertEqual(tokenize('123&#38; 456'), ['123', '&', '456'])
    self.assertEqual(tokenize('123 &#38;456'), ['123', '&', '456'])
    self.assertEqual(tokenize('123 &amp; 456'), ['123', '&', '456'])
    self.assertEqual(tokenize('123&amp;456'), ['123', '&', '456'])
    self.assertEqual(tokenize('123&amp; 456'), ['123', '&', '456'])
    self.assertEqual(tokenize('123 &amp;456'), ['123', '&', '456'])
def test_split_on_punc(self):
    assert tokenize("1 abc st,suite 1") == ["1", "abc", "st,", "suite", "1"]
    assert tokenize("1 abc st;suite 1") == ["1", "abc", "st;", "suite", "1"]
    assert tokenize("1-5 abc road") == ["1-5", "abc", "road"]
def test_nums(self):
    assert tokenize('222 W Merchandise Mart Plaza Chicago IL 60654') == [
        '222', 'W', 'Merchandise', 'Mart', 'Plaza', 'Chicago', 'IL', '60654'
    ]
def test_capture_punc(self):
    assert tokenize('222 W. Merchandise Mart Plaza') == [
        '222', 'W.', 'Merchandise', 'Mart', 'Plaza'
    ]
    assert tokenize('222 W Merchandise Mart Plaza, Chicago, IL') == [
        '222', 'W', 'Merchandise', 'Mart', 'Plaza,', 'Chicago,', 'IL'
    ]
def test_ampersand(self):
    assert tokenize('123 & 456') == ['123', '&', '456']
def test_ampersand(self):
    self.assertEqual(
        tokenize('123 & 456'),
        ['123', '&', '456']
    )
    self.assertEqual(
        tokenize('123&456'),
        ['123', '&', '456']
    )
    self.assertEqual(
        tokenize('123& 456'),
        ['123', '&', '456']
    )
    self.assertEqual(
        tokenize('123 &456'),
        ['123', '&', '456']
    )
    self.assertEqual(
        tokenize('123 &#38; 456'),
        ['123', '&', '456']
    )
    self.assertEqual(
        tokenize('123&#38;456'),
        ['123', '&', '456']
    )
    self.assertEqual(
        tokenize('123&#38; 456'),
        ['123', '&', '456']
    )
    self.assertEqual(
        tokenize('123 &#38;456'),
        ['123', '&', '456']
    )
    self.assertEqual(
        tokenize('123 &amp; 456'),
        ['123', '&', '456']
    )
    self.assertEqual(
        tokenize('123&amp;456'),
        ['123', '&', '456']
    )
    self.assertEqual(
        tokenize('123&amp; 456'),
        ['123', '&', '456']
    )
    self.assertEqual(
        tokenize('123 &amp;456'),
        ['123', '&', '456']
    )
def test_nums(self):
    self.assertEqual(
        tokenize('222 W Merchandise Mart Plaza Chicago IL 60654'),
        [
            '222', 'W', 'Merchandise', 'Mart', 'Plaza',
            'Chicago', 'IL', '60654'
        ])
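Taken together, these tests pin down the tokenizer's contract: whitespace separates tokens and is discarded, trailing punctuation stays attached to the preceding token, parentheses stay glued to their contents, and '#' and '&' always split off as standalone tokens, with the HTML-entity spellings of the ampersand normalized to '&'. A minimal regex sketch consistent with the tests above (an illustration, not necessarily usaddress's actual implementation):

import re

def tokenize(address_string):
    # The test_ampersand cases expect entity spellings to come back as '&'.
    address_string = re.sub(r'&#38;|&amp;', '&', address_string)
    re_tokens = re.compile(r"""
        \(*\b[^\s,;#&()]+[.,;)\n]*   # a word, keeping a leading '(' and trailing punctuation
        |
        [#&]                         # '#' and '&' are always tokens of their own
        """, re.VERBOSE | re.UNICODE)
    return re_tokens.findall(address_string)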