Example #1
 def test_paren(self):
     assert tokenize("222 W Merchandise Mart Plaza (1871) Chicago IL 60654") == [
         "222",
         "W",
         "Merchandise",
         "Mart",
         "Plaza",
         "(1871)",
         "Chicago",
         "IL",
         "60654",
     ]
     assert tokenize("222 W Merchandise Mart Plaza (1871), Chicago IL 60654") == [
         "222",
         "W",
         "Merchandise",
         "Mart",
         "Plaza",
         "(1871),",
         "Chicago",
         "IL",
         "60654",
     ]
     assert tokenize("222 W Merchandise Mart Plaza(1871) Chicago IL 60654") == [
         "222",
         "W",
         "Merchandise",
         "Mart",
         "Plaza",
         "(1871)",
         "Chicago",
         "IL",
         "60654",
     ]
Example #2
 def test_split_on_punc(self):
     self.assertEqual(tokenize('1 abc st,suite 1'),
                      ['1', 'abc', 'st,', 'suite', '1'])
     self.assertEqual(tokenize('1 abc st;suite 1'),
                      ['1', 'abc', 'st;', 'suite', '1'])
     self.assertEqual(
         tokenize('1-5 abc road'),
         ['1-5', 'abc', 'road'],
     )
Example #3
    def test_capture_punc(self):

        assert tokenize('222 W. Merchandise Mart Plaza') == [
            '222', 'W.', 'Merchandise', 'Mart', 'Plaza'
        ]
        assert tokenize('222 W Merchandise Mart Plaza, Chicago, IL') == [
            '222', 'W', 'Merchandise', 'Mart', 'Plaza,', 'Chicago,', 'IL'
        ]
        assert tokenize('123 Monroe- St') == ['123', 'Monroe-', 'St']
Example #4
    def test_split_on_punc(self):

        assert tokenize('1 abc st,suite 1') == [
            '1', 'abc', 'st,', 'suite', '1'
        ]
        assert tokenize('1 abc st;suite 1') == [
            '1', 'abc', 'st;', 'suite', '1'
        ]
        assert tokenize('1-5 abc road') == ['1-5', 'abc', 'road']
Example #5
 def test_spaces(self):
     self.assertEqual(tokenize('1 abc st'), ['1', 'abc', 'st'])
     self.assertEqual(
         tokenize('1  abc st'),
         ['1', 'abc', 'st'],
     )
     self.assertEqual(tokenize('1 abc st '), ['1', 'abc', 'st'])
     self.assertEqual(
         tokenize(' 1 abc st'),
         ['1', 'abc', 'st'],
     )
Example #6
    def test_capture_punc(self):

        assert tokenize("222 W. Merchandise Mart Plaza") == ["222", "W.", "Merchandise", "Mart", "Plaza"]
        assert tokenize("222 W Merchandise Mart Plaza, Chicago, IL") == [
            "222",
            "W",
            "Merchandise",
            "Mart",
            "Plaza,",
            "Chicago,",
            "IL",
        ]
        assert tokenize("123 Monroe- St") == ["123", "Monroe-", "St"]
Example #7
 def test_capture_punc(self):
     self.assertEqual(
         tokenize('222 W. Merchandise Mart Plaza'),
         ['222', 'W.', 'Merchandise', 'Mart', 'Plaza'],
     )
     self.assertEqual(
         tokenize('222 W Merchandise Mart Plaza, Chicago, IL'),
         ['222', 'W', 'Merchandise', 'Mart', 'Plaza,', 'Chicago,', 'IL'],
     )
     self.assertEqual(
         tokenize('123 Monroe- St'),
         ['123', 'Monroe-', 'St']
     )
Example #8
 def test_split_on_punc(self):
     self.assertEqual(
         tokenize('1 abc st,suite 1'),
         ['1', 'abc', 'st,', 'suite', '1']
     )
     self.assertEqual(
         tokenize('1 abc st;suite 1'),
         ['1', 'abc', 'st;', 'suite', '1']
     )
     self.assertEqual(
         tokenize('1-5 abc road'),
         ['1-5', 'abc', 'road'],
     )
Example #9
 def test_paren(self):
     assert tokenize(
         '222 W Merchandise Mart Plaza (1871) Chicago IL 60654') == [
             '222', 'W', 'Merchandise', 'Mart', 'Plaza', '(1871)',
             'Chicago', 'IL', '60654'
         ]
     assert tokenize(
         '222 W Merchandise Mart Plaza (1871), Chicago IL 60654') == [
             '222', 'W', 'Merchandise', 'Mart', 'Plaza', '(1871),',
             'Chicago', 'IL', '60654'
         ]
     assert tokenize(
         '222 W Merchandise Mart Plaza(1871) Chicago IL 60654') == [
             '222', 'W', 'Merchandise', 'Mart', 'Plaza', '(1871)',
             'Chicago', 'IL', '60654'
         ]
Example #10
 def test_paren(self):
     self.assertEqual(
         tokenize('222 W Merchandise Mart Plaza (1871) Chicago IL 60654'),
         ['222', 'W', 'Merchandise', 'Mart', 'Plaza', '(1871)', 'Chicago',
          'IL', '60654'],
     )
     self.assertEqual(
         tokenize('222 W Merchandise Mart Plaza (1871), Chicago IL 60654'),
         ['222', 'W', 'Merchandise', 'Mart', 'Plaza', '(1871),', 'Chicago',
          'IL', '60654']
     )
     self.assertEqual(
         tokenize('222 W Merchandise Mart Plaza(1871) Chicago IL 60654'),
         ['222', 'W', 'Merchandise', 'Mart', 'Plaza', '(1871)', 'Chicago',
          'IL', '60654']
     )
Example #11
 def test_spaces(self):
     self.assertEqual(
         tokenize('1 abc st'),
         ['1', 'abc', 'st']
     )
     self.assertEqual(
         tokenize('1  abc st'),
         ['1', 'abc', 'st'],
     )
     self.assertEqual(
         tokenize('1 abc st '),
         ['1', 'abc', 'st']
     )
     self.assertEqual(
         tokenize(' 1 abc st'),
         ['1', 'abc', 'st'],
     )
Example #12
File: utils.py Project: jhaus/usaddress
 def fit(self, X, y, **params):
     # sklearn requires parameters to be declared as fields of the estimator,
     # and a field name can't contain a full stop, so underscores stand in
     # for the dots; convert them back to the dotted names crfsuite expects
     params = {k.replace('_', '.'): v for k, v in self.__dict__.items()}
     trainer = pycrfsuite.Trainer(verbose=False, params=params)
     for address, labels in zip(X, y):
         tokens = usaddress.tokenize(address)
         trainer.append(usaddress.addr2features(tokens), labels)
     trainer.train(self.model_path)
     reload(usaddress)
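For context: pycrfsuite exposes training options under dotted names such as feature.minfreq, while sklearn requires estimator parameters to be plain identifier fields, hence the underscore-to-dot conversion in fit above. A minimal sketch of that mapping (the field names below are illustrative, not taken from the source):

fields = {'feature_minfreq': 2, 'c2': 0.01}
# underscores in sklearn-style field names become the dots crfsuite expects
params = {k.replace('_', '.'): v for k, v in fields.items()}
print(params)  # {'feature.minfreq': 2, 'c2': 0.01}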
Example #14
 def test_paren(self):
     self.assertEqual(
         tokenize('222 W Merchandise Mart Plaza (1871) Chicago IL 60654'),
         [
             '222', 'W', 'Merchandise', 'Mart', 'Plaza', '(1871)',
             'Chicago', 'IL', '60654'
         ],
     )
     self.assertEqual(
         tokenize('222 W Merchandise Mart Plaza (1871), Chicago IL 60654'),
         [
             '222', 'W', 'Merchandise', 'Mart', 'Plaza', '(1871),',
             'Chicago', 'IL', '60654'
         ])
     self.assertEqual(
         tokenize('222 W Merchandise Mart Plaza(1871) Chicago IL 60654'), [
             '222', 'W', 'Merchandise', 'Mart', 'Plaza', '(1871)',
             'Chicago', 'IL', '60654'
         ])
Example #15
    def test_nums(self):

        assert tokenize("222 W Merchandise Mart Plaza Chicago IL 60654") == [
            "222",
            "W",
            "Merchandise",
            "Mart",
            "Plaza",
            "Chicago",
            "IL",
            "60654",
        ]
Example #16
 def test_hash(self):
     self.assertEqual(
         tokenize('# 1 abc st'),
         ['#', '1', 'abc', 'st']
     )
     self.assertEqual(
         tokenize('#1 abc st'),
         ['#', '1', 'abc', 'st']
     )
     self.assertEqual(
         tokenize('box # 1 abc st'),
         ['box', '#', '1', 'abc', 'st']
     )
     self.assertEqual(
         tokenize('box #1 abc st'),
         ['box', '#', '1', 'abc', 'st']
     )
     self.assertEqual(
         tokenize('box# 1 abc st'),
         ['box', '#', '1', 'abc', 'st'],
     )
     self.assertEqual(
         tokenize('box#1 abc st'),
         ['box', '#', '1', 'abc', 'st']
     )
Example #17
 def test_hash(self):
     
     assert tokenize('# 1 abc st') == ['#', '1', 'abc', 'st']
     assert tokenize('#1 abc st') == ['#', '1', 'abc', 'st']
     assert tokenize('box # 1 abc st') == ['box', '#', '1', 'abc', 'st']
     assert tokenize('box #1 abc st') == ['box', '#', '1', 'abc', 'st']
     assert tokenize('box# 1 abc st') == ['box', '#', '1', 'abc', 'st']
     assert tokenize('box#1 abc st') == ['box', '#', '1', 'abc', 'st']
Example #18
    def test_hash(self):

        assert tokenize('# 1 abc st') == ['#', '1', 'abc', 'st']
        assert tokenize('#1 abc st') == ['#', '1', 'abc', 'st']
        assert tokenize('box # 1 abc st') == ['box', '#', '1', 'abc', 'st']
        assert tokenize('box #1 abc st') == ['box', '#', '1', 'abc', 'st']
        assert tokenize('box# 1 abc st') == ['box', '#', '1', 'abc', 'st']
        assert tokenize('box#1 abc st') == ['box', '#', '1', 'abc', 'st']
Example #19
    def test_hash(self):

        assert tokenize("# 1 abc st") == ["#", "1", "abc", "st"]
        assert tokenize("#1 abc st") == ["#", "1", "abc", "st"]
        assert tokenize("box # 1 abc st") == ["box", "#", "1", "abc", "st"]
        assert tokenize("box #1 abc st") == ["box", "#", "1", "abc", "st"]
        assert tokenize("box# 1 abc st") == ["box", "#", "1", "abc", "st"]
        assert tokenize("box#1 abc st") == ["box", "#", "1", "abc", "st"]
Example #20
 def fit(self, X, y, **params):
     # sklearn requires parameters to be declared as fields of the estimator,
     # and a field name can't contain a full stop, so underscores stand in
     # for the dots; convert them back to the dotted names crfsuite expects
     params = {k.replace('_', '.'): v for k, v in self.__dict__.items()}
     trainer = pycrfsuite.Trainer(verbose=False, params=params)
     for address, labels in zip(X, y):
         tokens = usaddress.tokenize(address)
         if len(tokens) != len(labels):
          # sometimes there are more or fewer gold-standard labels than
          # the number of tags the system will output; this is because
          # our tokenizer works differently from how the data is tokenized.
          # Let's pretend this never happened
             continue
         trainer.append(usaddress.addr2features(tokens), labels)
     trainer.train(self.model_path)
     reload(usaddress)
Example #21
def list2xml(addr_list, outfile):
    xml_addr_list = etree.Element('AddressCollection')
    for addr in addr_list:
        xml_addr = etree.Element('AddressString')
        # handle commas?
        for component in addr:
            if component[1]:
                for token in tokenize(component[1]):
                    token_xml = etree.Element(component[0])
                    token_xml.text = token
                    token_xml.tail = ' '
                    xml_addr.append(token_xml)
        xml_addr[-1].tail = None
        xml_addr_list.append(xml_addr)

    output = etree.tostring(xml_addr_list, pretty_print=True)
    # lxml's tostring() returns bytes, so write in binary mode
    with open(outfile, 'wb') as f:
        f.write(output)
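A minimal usage sketch for list2xml, assuming etree is lxml.etree and each address in addr_list is a list of (tag, text) pairs; the tag names below are illustrative:

from lxml import etree

addr_list = [
    [('AddressNumber', '222'), ('StreetNamePreDirectional', 'W'),
     ('StreetName', 'Merchandise Mart Plaza')],
]
list2xml(addr_list, 'addresses.xml')
# addresses.xml holds one <AddressString> per address, each token wrapped
# in an element named after its component tag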
Example #23
 def test_hash(self):
     self.assertEqual(tokenize('# 1 abc st'), ['#', '1', 'abc', 'st'])
     self.assertEqual(tokenize('#1 abc st'), ['#', '1', 'abc', 'st'])
     self.assertEqual(tokenize('box # 1 abc st'),
                      ['box', '#', '1', 'abc', 'st'])
     self.assertEqual(tokenize('box #1 abc st'),
                      ['box', '#', '1', 'abc', 'st'])
     self.assertEqual(
         tokenize('box# 1 abc st'),
         ['box', '#', '1', 'abc', 'st'],
     )
     self.assertEqual(tokenize('box#1 abc st'),
                      ['box', '#', '1', 'abc', 'st'])
Example #24
    def test_spaces(self):

        assert tokenize("1 abc st") == ["1", "abc", "st"]
        assert tokenize("1  abc st") == ["1", "abc", "st"]
        assert tokenize("1 abc st ") == ["1", "abc", "st"]
        assert tokenize(" 1 abc st") == ["1", "abc", "st"]
Example #25
 def test_ampersand(self):
     assert tokenize("123 & 456") == ["123", "&", "456"]
     assert tokenize("123&456") == ["123", "&", "456"]
     assert tokenize("123& 456") == ["123", "&", "456"]
     assert tokenize("123 &456") == ["123", "&", "456"]
     assert tokenize("123 & 456") == ["123", "&", "456"]
     assert tokenize("123&456") == ["123", "&", "456"]
     assert tokenize("123& 456") == ["123", "&", "456"]
     assert tokenize("123 &456") == ["123", "&", "456"]
     assert tokenize("123 & 456") == ["123", "&", "456"]
     assert tokenize("123&456") == ["123", "&", "456"]
     assert tokenize("123& 456") == ["123", "&", "456"]
     assert tokenize("123 &456") == ["123", "&", "456"]
Example #26
    def test_split_on_punc(self) :

        assert tokenize('1 abc st,suite 1') == ['1', 'abc', 'st,', 'suite', '1']
        assert tokenize('1 abc st;suite 1') == ['1', 'abc', 'st;', 'suite', '1']
        assert tokenize('1-5 abc road') == ['1-5', 'abc', 'road']
Example #27
    def test_spaces(self) :

        assert tokenize('1 abc st') == ['1', 'abc', 'st']
        assert tokenize('1  abc st') == ['1', 'abc', 'st']
        assert tokenize('1 abc st ') == ['1', 'abc', 'st']
        assert tokenize(' 1 abc st') == ['1', 'abc', 'st']
Example #28
 def test_nums(self):
     self.assertEqual(
         tokenize('222 W Merchandise Mart Plaza Chicago IL 60654'),
         ['222', 'W', 'Merchandise', 'Mart', 'Plaza', 'Chicago', 'IL',
          '60654']
     )
Example #29
    def test_nums(self) :

        assert tokenize('222 W Merchandise Mart Plaza Chicago IL 60654') == ['222', 'W', 'Merchandise', 'Mart', 'Plaza', 'Chicago', 'IL', '60654' ]
Example #30
    def test_spaces(self):

        assert tokenize('1 abc st') == ['1', 'abc', 'st']
        assert tokenize('1  abc st') == ['1', 'abc', 'st']
        assert tokenize('1 abc st ') == ['1', 'abc', 'st']
        assert tokenize(' 1 abc st') == ['1', 'abc', 'st']
Example #31
 def test_ampersand(self) :
     assert tokenize('123 & 456') == ['123', '&', '456']
Example #32
 def test_paren(self) :
     assert tokenize('222 W Merchandise Mart Plaza (1871) Chicago IL 60654') == ['222', 'W', 'Merchandise', 'Mart', 'Plaza', '(1871)', 'Chicago', 'IL', '60654' ]
     assert tokenize('222 W Merchandise Mart Plaza (1871), Chicago IL 60654') == ['222', 'W', 'Merchandise', 'Mart', 'Plaza', '(1871),', 'Chicago', 'IL', '60654' ]
     assert tokenize('222 W Merchandise Mart Plaza(1871) Chicago IL 60654') == ['222', 'W', 'Merchandise', 'Mart', 'Plaza', '(1871)', 'Chicago', 'IL', '60654' ]
Example #33
 def test_ampersand(self):
     self.assertEqual(tokenize('123 & 456'), ['123', '&', '456'])
     self.assertEqual(tokenize('123&456'), ['123', '&', '456'])
     self.assertEqual(tokenize('123& 456'), ['123', '&', '456'])
     self.assertEqual(tokenize('123 &456'), ['123', '&', '456'])
     self.assertEqual(tokenize('123 &#38; 456'), ['123', '&#38;', '456'])
     self.assertEqual(tokenize('123&#38;456'), ['123', '&#38;', '456'])
     self.assertEqual(tokenize('123&#38; 456'), ['123', '&#38;', '456'])
     self.assertEqual(tokenize('123 &#38;456'), ['123', '&#38;', '456'])
     self.assertEqual(tokenize('123 &amp; 456'), ['123', '&amp;', '456'])
     self.assertEqual(tokenize('123&amp;456'), ['123', '&amp;', '456'])
     self.assertEqual(tokenize('123&amp; 456'), ['123', '&amp;', '456'])
     self.assertEqual(tokenize('123 &amp;456'), ['123', '&amp;', '456'])
Example #34
    def test_split_on_punc(self):

        assert tokenize("1 abc st,suite 1") == ["1", "abc", "st,", "suite", "1"]
        assert tokenize("1 abc st;suite 1") == ["1", "abc", "st;", "suite", "1"]
        assert tokenize("1-5 abc road") == ["1-5", "abc", "road"]
Example #35
    def test_nums(self):

        assert tokenize('222 W Merchandise Mart Plaza Chicago IL 60654') == [
            '222', 'W', 'Merchandise', 'Mart', 'Plaza', 'Chicago', 'IL',
            '60654'
        ]
Example #36
    def test_capture_punc(self) :

        assert tokenize('222 W. Merchandise Mart Plaza') == ['222', 'W.', 'Merchandise', 'Mart', 'Plaza']
        assert tokenize('222 W Merchandise Mart Plaza, Chicago, IL') == ['222', 'W', 'Merchandise', 'Mart', 'Plaza,', 'Chicago,', 'IL' ]
Example #37
 def test_ampersand(self):
     assert tokenize('123 & 456') == ['123', '&', '456']
Example #38
 def test_ampersand(self):
     self.assertEqual(
         tokenize('123 & 456'),
         ['123', '&', '456']
     )
     self.assertEqual(
         tokenize('123&456'),
         ['123', '&', '456']
     )
     self.assertEqual(
         tokenize('123& 456'),
         ['123', '&', '456']
     )
     self.assertEqual(
         tokenize('123 &456'),
         ['123', '&', '456']
     )
     self.assertEqual(
         tokenize('123 &#38; 456'),
         ['123', '&#38;', '456']
     )
     self.assertEqual(
         tokenize('123&#38;456'),
         ['123', '&#38;', '456']
     )
     self.assertEqual(
         tokenize('123&#38; 456'),
         ['123', '&#38;', '456']
     )
     self.assertEqual(
         tokenize('123 &#38;456'),
         ['123', '&#38;', '456']
     )
     self.assertEqual(
         tokenize('123 &amp; 456'),
         ['123', '&amp;', '456']
     )
     self.assertEqual(
         tokenize('123&amp;456'),
         ['123', '&amp;', '456']
     )
     self.assertEqual(
         tokenize('123&amp; 456'),
         ['123', '&amp;', '456']
     )
     self.assertEqual(
         tokenize('123 &amp;456'),
         ['123', '&amp;', '456']
     )
Example #39
 def test_nums(self):
     self.assertEqual(
         tokenize('222 W Merchandise Mart Plaza Chicago IL 60654'), [
             '222', 'W', 'Merchandise', 'Mart', 'Plaza', 'Chicago', 'IL',
             '60654'
         ])
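Taken together, these tests pin down the tokenizer's contract: split on whitespace, keep trailing punctuation (',', ';', '.', ')', '-') attached to the preceding token, break before an opening parenthesis, treat '#' and '&' as standalone tokens, and leave the HTML-escaped ampersands '&#38;' and '&amp;' intact. A regex sketch that passes the cases above, for illustration only (not usaddress's actual implementation):

import re

_TOKEN = re.compile(r"""
    &\#38;|&amp;                # HTML-escaped ampersands stay whole
    |
    [#&]                        # '#' and '&' always stand alone
    |
    \(?[^\s,;#&()]+[.,;)-]*     # a word, keeping trailing punctuation
""", re.VERBOSE)

def tokenize(address):
    return _TOKEN.findall(address)

print(tokenize('222 W Merchandise Mart Plaza(1871), Chicago IL 60654'))
# ['222', 'W', 'Merchandise', 'Mart', 'Plaza', '(1871),', 'Chicago', 'IL', '60654']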