示例#1
0
class GeonamesCacheTestSuite(unittest.TestCase):
    """GeonamesCache test cases.

    Every test works on a fresh ``GeonamesCache`` instance created in
    ``setUp`` and checks a handful of well-known records in the data sets
    it exposes.
    """

    # Codes that must not be present in any of the code-keyed data sets.
    _BOGUS_CODES = ('XX', 'OO')

    def setUp(self):
        self.geonamescache = GeonamesCache()

    def _assert_code_names(self, records, testdata):
        """Check expected ``code -> name`` pairs and absence of bogus codes.

        ``records`` is a mapping keyed by code whose values carry a
        ``'name'`` entry; ``testdata`` is an iterable of ``(code, name)``.
        """
        for code, name in testdata:
            self.assertIn(code, records)
            self.assertEqual(name, records[code]['name'])

        for code in self._BOGUS_CODES:
            self.assertNotIn(code, records)

    def test_continents(self):
        continents = self.geonamescache.get_continents()
        testdata = (('AF', 'Africa'), ('AN', 'Antarctica'), ('AS', 'Asia'),
                    ('EU', 'Europe'), ('NA', 'North America'),
                    ('OC', 'Oceania'), ('SA', 'South America'))
        self._assert_code_names(continents, testdata)

    def test_get_countries(self):
        countries = self.geonamescache.get_countries()
        testdata = (('ES', 'Spain'), ('FR', 'France'), ('US', 'United States'))
        self._assert_code_names(countries, testdata)

    def test_us_states(self):
        us_states = self.geonamescache.get_us_states()
        testdata = (('NM', 'New Mexico'), ('CA', 'California'),
                    ('NV', 'Nevada'))
        self._assert_code_names(us_states, testdata)

    def test_get_countries_by_names(self):
        # Length of get_countries_by_names dict and get_countries dict must be
        # the same, unless country names wouldn't be unique.
        # assertEqual is required here: assertTrue(a, b) would treat the
        # second length as the failure message and pass for any non-zero a.
        self.assertEqual(len(self.geonamescache.get_countries_by_names()),
                         len(self.geonamescache.get_countries()))

    def test_get_cities_by_name(self):
        # Cities are looked up by their geoname id in the get_cities() dict.
        cities = self.geonamescache.get_cities()
        for gid, name in (('3191316', 'Samobor'),
                          ('3107112', 'Rivas-Vaciamadrid')):
            self.assertEqual(name, cities[gid]['name'])

    def test_get_cities_by_name_madrid(self):
        # The name lookup is expected to yield two entries for 'Madrid'.
        self.assertEqual(2,
                         len(self.geonamescache.get_cities_by_name('Madrid')))

    def test_us_counties_len(self):
        # Make sure there are 3235 counties, which includes Puerto Rico etc.
        us_counties = self.geonamescache.get_us_counties()
        self.assertEqual(3235, len(us_counties))
示例#2
0
 def geo_match2(location_names):
     """
     Match extracted US location names to ``[latitude, longitude]`` pairs.

     Basically the same routine as the one in coordinates_converting.py (see
     that file for further description). City coordinates are read from
     ``city_loca.json`` (one JSON object per line with ``CityNameAccented``,
     ``Latitude`` and ``Longitude`` keys) and merged with a few hand-entered
     corner cases. For every record the first candidate name that is a known
     city wins; when none is, a candidate naming a US state (a key of
     ``GeonamesCache().get_us_states()`` or of ``capital_dic``) is resolved
     to the coordinates of that state's capital instead.

     :param location_names: mapping of record key -> sequence of candidate
         location names for that record
     :return: dict mapping every resolvable record key to its
         ``[latitude, longitude]`` list; unresolvable keys are omitted
     """
     gc = GeonamesCache()
     state_dic_abbr = gc.get_us_states()

     # Hand-entered corner cases; they take precedence over the data file.
     new_data = {
         'DC': [38.895, -77.0366667],
         'St. Paul': [44.9537, -93.0900]
     }
     with open("city_loca.json", 'r') as f2:
         for line in f2:
             datum = json.loads(line)
             # Keep the first coordinates seen for any given city name.
             if datum['CityNameAccented'] not in new_data:
                 new_data[datum['CityNameAccented']] = [
                     datum['Latitude'], datum['Longitude']
                 ]

     output = {}
     for key in location_names:
         names = location_names[key]

         # First choice: a candidate that is itself a known city.
         for name in names:
             if name in new_data:
                 output[key] = new_data[name]
                 break

         # Only fall back when no city matched. The guard must look at
         # ``output`` (not ``new_data``), otherwise a successful city match
         # would be overwritten by the state capital.
         if key in output:
             continue

         full_state_name = ''
         for name in names:
             if name in state_dic_abbr:
                 full_state_name = state_dic_abbr[name]['name']
             elif name in capital_dic:
                 full_state_name = name
             if not full_state_name:
                 continue
             try:
                 # Either lookup may miss: the state may have no entry in
                 # capital_dic, or its capital may be absent from new_data.
                 output[key] = new_data[capital_dic[full_state_name]]
             except KeyError:
                 continue
             break
     return output
# without cities 14954 location strings remain unresolved
# with unique city names 6345 (number is not up to date) location strings remain unresolved
# with largest city name 4623 location strings remain unresolved
# with manually resolved locations 3333 location strings remain unresolved

import csv, json, re
from geonamescache import GeonamesCache
from loclists import check_unresolved

# Module-level accumulators. They start empty and are not filled anywhere in
# this excerpt -- NOTE(review): presumably populated by code further down.
unresolved_locations = []
commits_by_countries = {}
countries_by_locstr = {}
# One shared GeonamesCache instance and the lookup tables read from it.
# ``countries_by_names`` is the table test_locs() checks candidate names
# against.
gc = GeonamesCache()
countries = gc.get_countries()
countries_by_names = gc.get_countries_by_names()
us_states = gc.get_us_states()
us_states_by_names = gc.get_us_states_by_names()

# Matches a single literal dot, parenthesis, digit or hyphen.
re_ignore = re.compile(r'[\.\(\)\d-]')
# Matches a run of two or more whitespace characters.
re_ws = re.compile(r'\s{2,}')


def test_locs(locs):
    """Return the first candidate in ``locs`` that is a known country name.

    Each candidate is stripped and lower-cased, then looked up in the
    module-level ``countries_by_names`` table, first in that lower-case form
    and then in title case; whichever form matched is returned.

    NOTE(review): ``locupper`` is computed but never used in the lines shown
    here, so this definition looks truncated -- confirm against the full
    source before relying on the fall-through result.
    """
    for loc in locs:
        # Normalise once, then derive the alternative spellings to try.
        loc = loc.strip().lower()
        loctitle = loc.title()
        locupper = loc.upper()
        if loc in countries_by_names:
            return loc
        elif loctitle in countries_by_names:
            return loctitle
示例#4
0
class GeonamesCacheTestSuite(unittest.TestCase):
    """GeonamesCache test cases.

    Every test works on a fresh ``GeonamesCache`` instance created in
    ``setUp`` and checks a handful of well-known records in the data sets
    it exposes.
    """

    # Codes that must not be present in any of the code-keyed data sets.
    _BOGUS_CODES = ('XX', 'OO')

    def setUp(self):
        self.geonamescache = GeonamesCache()

    def _assert_code_names(self, records, testdata):
        """Check expected ``code -> name`` pairs and absence of bogus codes.

        ``records`` is a mapping keyed by code whose values carry a
        ``'name'`` entry; ``testdata`` is an iterable of ``(code, name)``.
        """
        for code, name in testdata:
            self.assertIn(code, records)
            self.assertEqual(name, records[code]['name'])

        for code in self._BOGUS_CODES:
            self.assertNotIn(code, records)

    def test_continents(self):
        continents = self.geonamescache.get_continents()
        testdata = (
            ('AF', 'Africa'),
            ('AN', 'Antarctica'),
            ('AS', 'Asia'),
            ('EU', 'Europe'),
            ('NA', 'North America'),
            ('OC', 'Oceania'),
            ('SA', 'South America'),
        )
        self._assert_code_names(continents, testdata)

    def test_get_countries(self):
        countries = self.geonamescache.get_countries()
        testdata = (('ES', 'Spain'), ('FR', 'France'), ('US', 'United States'))
        self._assert_code_names(countries, testdata)

    def test_us_states(self):
        us_states = self.geonamescache.get_us_states()
        testdata = (
            ('NM', 'New Mexico'), ('CA', 'California'), ('NV', 'Nevada'))
        self._assert_code_names(us_states, testdata)

    def test_get_countries_by_names(self):
        # Length of get_countries_by_names dict and get_countries dict must be
        # the same, unless country names wouldn't be unique.
        # assertEqual is required here: assertTrue(a, b) would treat the
        # second length as the failure message and pass for any non-zero a.
        self.assertEqual(len(self.geonamescache.get_countries_by_names()),
                         len(self.geonamescache.get_countries()))

    def test_get_cities_by_name(self):
        # Cities are looked up by their geoname id in the get_cities() dict.
        cities = self.geonamescache.get_cities()
        for gid, name in (('3191316', 'Samobor'),
                          ('3107112', 'Rivas-Vaciamadrid')):
            self.assertEqual(name, cities[gid]['name'])

    def test_get_cities_by_name_madrid(self):
        # The name lookup is expected to yield two entries for 'Madrid'.
        self.assertEqual(
            2, len(self.geonamescache.get_cities_by_name('Madrid')))

    def test_us_counties_len(self):
        # Make sure there are 3234 counties, which includes Puerto Rico etc.
        us_counties = self.geonamescache.get_us_counties()
        self.assertEqual(3234, len(us_counties))
def geo_match2(location_names):
    """
    Match extracted US location names to coordinates and dump them to JSON.

    City coordinates are read from ``city_loca.json`` (one JSON object per
    line with ``CityNameAccented``, ``Latitude`` and ``Longitude`` keys) and
    merged with a few hand-entered corner cases. Each record's candidate
    names are capitalised word by word and tried, in order, as

    1. a city name,
    2. a city name followed by a token found in ``state_list``
       (e.g. "New York NY"),
    3. a city name preceded by a token found in ``directions``.

    If none of that matches, a candidate naming a US state (a key of
    ``GeonamesCache().get_us_states()`` or of ``capital_dic``) is resolved
    to the coordinates of that state's capital. Records that still cannot
    be resolved are printed; the resolved ones are written to
    ``coordinates_IstheServicedown_Verizon.json``.

    :param location_names: dict whose keys are timestamp strings starting
        with ``YYYY-MM-DDTHH:MM:SS`` and whose values hold, at index 0, a
        sequence of items whose first element is a location name
    :return: the string ``'done'``
    """
    def data_preprocess(data):
        """
        Normalise the raw input into ``{timestamp string: [names]}``.

        The first 19 characters of every key are parsed as a timestamp and
        re-rendered with ``str()``; the location names of a record are
        gathered into a single list.
        :param data: json dict
        :return: json dict
        """
        data_formal = {}
        for i in data:
            # the pattern of timestamp could vary for different data source
            time = datetime.datetime.strptime(i[:19], "%Y-%m-%dT%H:%M:%S")
            data_formal[str(time)] = [j[0] for j in data[i][0]]
        return data_formal

    def capitalize_words(raw):
        """Upper-case the first character of every space-separated word.

        Falsy input (``None`` or an empty string) yields ``''``; the rest of
        each word and the spacing are left untouched.
        """
        if not raw:
            return ''
        return ' '.join(word[:1].upper() + word[1:] for word in raw.split(' '))

    location_names = data_preprocess(location_names)
    output = {}
    bad_items = []
    gc = GeonamesCache()
    state_dic_abbr = gc.get_us_states()

    # load the coordinates items into a dictionary called new_data;
    # manually entered corner cases take precedence over the data source
    new_data = {
        'DC': [38.895, -77.0366667],
        'St. Paul': [44.9537, -93.0900],
        'Temcula': [33.4936, -117.1484],
    }
    with open("city_loca.json", 'r') as f2:  # load coordinates from data source
        for line in f2:
            datum = json.loads(line)
            if datum['CityNameAccented'] not in new_data:
                new_data[datum['CityNameAccented']] = [
                    datum['Latitude'], datum['Longitude']]

    # traverse through the extracted location names
    for key in location_names:
        candidates = location_names[key]

        for raw in candidates:
            name = capitalize_words(raw)

            if name in new_data:  # common case: the name is a known city
                output[key] = new_data[name]
                break

            words = name.split(' ')

            if words[-1] in state_list:  # situation like "New York NY"
                name_city = ' '.join(words[:-1])
                if name_city in new_data:
                    output[key] = new_data[name_city]
                    break

            if words[0] in directions:  # situation like "South west NY"
                name_city = ' '.join(words[1:])
                if name_city in new_data:
                    output[key] = new_data[name_city]
                    break

        if key not in output:  # the city-based matching failed
            full_state_name = ''
            for raw in candidates:
                name = capitalize_words(raw)
                if name in state_dic_abbr:
                    # use the full state name instead of the abbreviation
                    full_state_name = state_dic_abbr[name]['name']
                elif name in capital_dic:
                    full_state_name = name
                if not full_state_name:
                    continue
                try:
                    # use the capital city to match coordinates; either
                    # lookup may miss (state absent from capital_dic, or
                    # capital absent from new_data)
                    output[key] = new_data[capital_dic[full_state_name]]
                except KeyError:
                    continue
                break

        if key not in output:
            # record the location names that can't be converted
            bad_items.append((key, candidates))

    print(bad_items)
    with open('coordinates_IstheServicedown_' + 'Verizon' + '.json', 'w') as outfile:
        json.dump(output, outfile)
    return 'done'
# without cities 14954 location strings remain unresolved
# with unique city names 6345 (number is not up to date) location strings remain unresolved
# with largest city name 4623 location strings remain unresolved
# with manually resolved locations 3333 location strings remain unresolved

import csv, json, re
from geonamescache import GeonamesCache
from loclists import check_unresolved

# Module-level accumulators. They start empty and are not filled anywhere in
# this excerpt -- NOTE(review): presumably populated by code further down.
unresolved_locations = []
commits_by_countries = {}
countries_by_locstr = {}
# One shared GeonamesCache instance and the lookup tables read from it.
# ``countries_by_names`` is the table test_locs() checks candidate names
# against.
gc = GeonamesCache()
countries = gc.get_countries()
countries_by_names = gc.get_countries_by_names()
us_states = gc.get_us_states()
us_states_by_names = gc.get_us_states_by_names()

# Matches a single literal dot, parenthesis, digit or hyphen.
re_ignore = re.compile(r'[\.\(\)\d-]')
# Matches a run of two or more whitespace characters.
re_ws = re.compile(r'\s{2,}')


def test_locs(locs):
    for loc in locs:
        loc = loc.strip().lower()
        loctitle = loc.title()
        locupper = loc.upper()
        if loc in countries_by_names:
            return loc
        elif loctitle in countries_by_names:
            return loctitle