class GeonamesCacheTestSuite(unittest.TestCase):
    """GeonamesCache test cases.

    Every test gets a fresh ``GeonamesCache`` from ``setUp`` and spot-checks
    the data returned by one of its accessors.
    """

    def setUp(self):
        self.geonamescache = GeonamesCache()

    def test_continents(self):
        continents = self.geonamescache.get_continents()
        testdata = (
            ('AF', 'Africa'),
            ('AN', 'Antarctica'),
            ('AS', 'Asia'),
            ('EU', 'Europe'),
            ('NA', 'North America'),
            ('OC', 'Oceania'),
            ('SA', 'South America'),
        )
        for code, name in testdata:
            self.assertIn(code, continents)
            self.assertEqual(name, continents[code]['name'])

        # Codes that are not continents must be absent.
        for code in ['XX', 'OO']:
            self.assertNotIn(code, continents)

    def test_get_countries(self):
        countries = self.geonamescache.get_countries()
        testdata = (('ES', 'Spain'), ('FR', 'France'), ('US', 'United States'))
        for code, name in testdata:
            self.assertIn(code, countries)
            self.assertEqual(name, countries[code]['name'])

        for code in ['XX', 'OO']:
            self.assertNotIn(code, countries)

    def test_us_states(self):
        us_states = self.geonamescache.get_us_states()
        testdata = (('NM', 'New Mexico'), ('CA', 'California'), ('NV', 'Nevada'))
        for code, name in testdata:
            self.assertIn(code, us_states)
            self.assertEqual(name, us_states[code]['name'])

        for code in ['XX', 'OO']:
            self.assertNotIn(code, us_states)

    def test_get_countries_by_names(self):
        # Length of get_countries_by_names dict and get_countries dict must be
        # the same, unless country names wouldn't be unique.
        # assertEqual is required here: assertTrue(a, b) only checks that
        # ``a`` is truthy and uses ``b`` as the failure message.
        self.assertEqual(
            len(self.geonamescache.get_countries_by_names()),
            len(self.geonamescache.get_countries()),
        )

    def test_get_cities_by_name(self):
        # Looks records up by id in the get_cities() result and checks names.
        cities = self.geonamescache.get_cities()
        for gid, name in (('3191316', 'Samobor'), ('3107112', 'Rivas-Vaciamadrid')):
            self.assertEqual(name, cities[gid]['name'])

    def test_get_cities_by_name_madrid(self):
        self.assertEqual(2, len(self.geonamescache.get_cities_by_name('Madrid')))

    def test_us_counties_len(self):
        # Make sure there are 3235 counties, which includes Puerto Rico etc.
        us_counties = self.geonamescache.get_us_counties()
        self.assertEqual(3235, len(us_counties))
def geo_match2(location_names):
    """
    Match US location names with coordinates.

    Basically the same as coordinates_converting.py; check it for further
    comments and description.

    For each key, the candidate names are first looked up directly in the
    city table (two built-in corner cases plus the records read line by line
    from ``city_loca.json``). Keys that are still unresolved fall back to a
    state-level match: a candidate that is a key of the US-states table or of
    ``capital_dic`` is translated through ``capital_dic`` and the result is
    looked up in the city table.

    NOTE(review): ``capital_dic`` is defined outside this block; presumably it
    maps a full state name to its capital city name -- confirm.

    :param location_names: mapping of an item key to an iterable of
        candidate location names
    :return: dict mapping every resolved key to ``[latitude, longitude]``;
        keys that could not be resolved are omitted
    """
    output = {}
    gc = GeonamesCache()
    state_dic_abbr = gc.get_us_states()

    # Manually supplied corner cases; records from the file never override them.
    new_data = {
        'DC': [38.895, -77.0366667],
        'St. Paul': [44.9537, -93.0900],
    }
    with open("city_loca.json", 'r') as f2:
        for line in f2:
            datum = json.loads(line)
            if datum['CityNameAccented'] not in new_data:
                new_data[datum['CityNameAccented']] = [
                    datum['Latitude'], datum['Longitude']
                ]

    for i in location_names:
        names = location_names[i]

        # First pass: direct city-name match, first hit wins.
        for name in names:
            if name in new_data:
                output[i] = new_data[name]
                break

        # Only fall back when the first pass found nothing. The guard must
        # look at ``output``: ``i`` is an item key, not a city name, so testing
        # it against ``new_data`` let the fallback overwrite city matches.
        if i in output:
            continue

        full_state_name = ''
        for name in names:
            if name in state_dic_abbr:
                full_state_name = state_dic_abbr[name]['name']
            elif name in capital_dic:
                full_state_name = name
            if not full_state_name:
                continue
            try:
                # Either lookup may miss (state without a capital entry, or a
                # capital without coordinates); try the next candidate then.
                output[i] = new_data[capital_dic[full_state_name]]
            except KeyError:
                continue
            break
    return output
# Progress notes on how many location strings remain unresolved:
#   without cities:                    14954
#   with unique city names:             6345 (number is not up to date)
#   with largest city name:             4623
#   with manually resolved locations:   3333

import csv
import json
import re

from geonamescache import GeonamesCache
from loclists import check_unresolved

# Module-level state shared by the script.
unresolved_locations = []
commits_by_countries = {}
countries_by_locstr = {}

# Lookup tables loaded once from geonamescache.
gc = GeonamesCache()
countries = gc.get_countries()
countries_by_names = gc.get_countries_by_names()
us_states = gc.get_us_states()
us_states_by_names = gc.get_us_states_by_names()

# '.', '(', ')', digits and '-'.
re_ignore = re.compile(r'[\.\(\)\d-]')
# Runs of two or more whitespace characters.
re_ws = re.compile(r'\s{2,}')


def test_locs(locs):
    """Return the first entry of *locs* found in ``countries_by_names``.

    Each entry is stripped and lower-cased; the lower-cased spelling is tried
    first, then its title-cased form. The matching spelling is returned.
    Falls through (implicitly returning ``None``) when nothing matches.
    """
    for raw in locs:
        lowered = raw.strip().lower()
        for candidate in (lowered, lowered.title()):
            if candidate in countries_by_names:
                return candidate
class GeonamesCacheTestSuite(unittest.TestCase):
    """GeonamesCache test cases.

    Every test gets a fresh ``GeonamesCache`` from ``setUp`` and spot-checks
    the data returned by one of its accessors.
    """

    def setUp(self):
        self.geonamescache = GeonamesCache()

    def test_continents(self):
        continents = self.geonamescache.get_continents()
        testdata = (
            ('AF', 'Africa'),
            ('AN', 'Antarctica'),
            ('AS', 'Asia'),
            ('EU', 'Europe'),
            ('NA', 'North America'),
            ('OC', 'Oceania'),
            ('SA', 'South America'),
        )
        for code, name in testdata:
            self.assertIn(code, continents)
            self.assertEqual(name, continents[code]['name'])

        # Codes that are not continents must be absent.
        for code in ['XX', 'OO']:
            self.assertNotIn(code, continents)

    def test_get_countries(self):
        countries = self.geonamescache.get_countries()
        testdata = (('ES', 'Spain'), ('FR', 'France'), ('US', 'United States'))
        for code, name in testdata:
            self.assertIn(code, countries)
            self.assertEqual(name, countries[code]['name'])

        for code in ['XX', 'OO']:
            self.assertNotIn(code, countries)

    def test_us_states(self):
        us_states = self.geonamescache.get_us_states()
        testdata = (
            ('NM', 'New Mexico'),
            ('CA', 'California'),
            ('NV', 'Nevada'),
        )
        for code, name in testdata:
            self.assertIn(code, us_states)
            self.assertEqual(name, us_states[code]['name'])

        for code in ['XX', 'OO']:
            self.assertNotIn(code, us_states)

    def test_get_countries_by_names(self):
        # Length of get_countries_by_names dict and get_countries dict must be
        # the same, unless country names wouldn't be unique.
        # assertEqual is required here: assertTrue(a, b) only checks that
        # ``a`` is truthy and uses ``b`` as the failure message.
        self.assertEqual(
            len(self.geonamescache.get_countries_by_names()),
            len(self.geonamescache.get_countries()),
        )

    def test_get_cities_by_name(self):
        # Looks records up by id in the get_cities() result and checks names.
        cities = self.geonamescache.get_cities()
        for gid, name in (('3191316', 'Samobor'), ('3107112', 'Rivas-Vaciamadrid')):
            self.assertEqual(name, cities[gid]['name'])

    def test_get_cities_by_name_madrid(self):
        self.assertEqual(
            2, len(self.geonamescache.get_cities_by_name('Madrid')))

    def test_us_counties_len(self):
        # Make sure the expected number of counties is present, which
        # includes Puerto Rico etc.
        # NOTE(review): the previous comment said 3235 while the assertion
        # checks 3234 -- the asserted value is kept; confirm which figure
        # matches the bundled dataset.
        us_counties = self.geonamescache.get_us_counties()
        self.assertEqual(3234, len(us_counties))
def geo_match2(location_names):
    """
    Match US city names with corresponding coordinates and dump them to JSON.

    The input is normalised with ``data_preprocess`` and every entry is then
    resolved in two stages:

    1. each candidate name is capitalised word by word and looked up in the
       city table -- as is, with a trailing word found in ``state_list``
       removed (e.g. "New York NY"), or with a leading word found in
       ``directions`` removed;
    2. if that fails, a candidate that is a key of the US-states table or of
       ``capital_dic`` is translated through ``capital_dic`` and the result
       is looked up in the city table.

    Entries that stay unresolved are printed; the resolved coordinates are
    written to ``coordinates_IstheServicedown_Verizon.json``.

    NOTE(review): ``state_list``, ``directions`` and ``capital_dic`` are
    defined outside this block; ``capital_dic`` presumably maps a full state
    name to its capital city name -- confirm.

    :param location_names: dict keyed by timestamp strings (the first 19
        characters are parsed as ``%Y-%m-%dT%H:%M:%S``); ``value[0]`` is a
        sequence of items whose first element is a location name
    :return: the string ``'done'``
    """
    def data_preprocess(data):
        """
        Preprocess the input data: parse each timestamp key and collect the
        location names of the entry into a single list.

        :param data: json dict
        :return: dict mapping ``str(datetime)`` to a list of location names
        """
        data_formal = {}
        for stamp in data:
            # the pattern of timestamp could vary for different data source
            parsed = datetime.datetime.strptime(stamp[:19], "%Y-%m-%dT%H:%M:%S")
            data_formal[str(parsed)] = [entry[0] for entry in data[stamp][0]]
        return data_formal

    def capitalize_words(name):
        """
        Upper-case the first character of every space-separated word of
        *name*, leaving the rest untouched; a falsy name yields ``''``.
        """
        if not name:
            return ''
        return ' '.join(word[:1].upper() + word[1:] for word in name.split(' '))

    location_names = data_preprocess(location_names)
    output = {}
    bad_items = []
    gc = GeonamesCache()
    state_dic_abbr = gc.get_us_states()

    # load the coordinates items into a dictionary called new_data
    # You can manually input some corner cases
    new_data = {
        'DC': [38.895, -77.0366667],
        'St. Paul': [44.9537, -93.0900],
        'Temcula': [33.4936, -117.1484],
    }
    with open("city_loca.json", 'r') as f2:  # load coordinates from data source
        for line in f2:
            datum = json.loads(line)
            if datum['CityNameAccented'] not in new_data:
                new_data[datum['CityNameAccented']] = [datum['Latitude'], datum['Longitude']]

    # traverse through the extracted location names
    for i in location_names:
        names = location_names[i]

        for raw_name in names:
            name = capitalize_words(raw_name)
            if name in new_data:  # deal common cases
                output[i] = new_data[name]
                break

            words = name.split(' ')
            if words[-1] in state_list:  # deal with situation like "New York NY"
                name_city = ' '.join(words[:-1])
                if name_city in new_data:
                    output[i] = new_data[name_city]
                    break
            if words[0] in directions:  # deal with situation like "South west NY"
                name_city = ' '.join(words[1:])
                if name_city in new_data:
                    output[i] = new_data[name_city]
                    break

        if i not in output:  # if the above method failed to match coordinates
            full_state_name = ''
            for raw_name in names:
                name = capitalize_words(raw_name)
                if name in state_dic_abbr:
                    full_state_name = state_dic_abbr[name]['name']  # use the state name instead
                elif name in capital_dic:
                    full_state_name = name
                if not full_state_name:
                    continue
                try:
                    # use capital city to match coordinates; either lookup may
                    # miss, in which case the next candidate is tried
                    output[i] = new_data[capital_dic[full_state_name]]
                except KeyError:
                    continue
                break

        if i not in output:
            # record the location names that can't be converted
            bad_items.append((i, names))

    print(bad_items)
    with open('coordinates_IstheServicedown_' + 'Verizon' + '.json', 'w') as outfile:
        json.dump(output, outfile)
    return 'done'
# withoud cities 14954 location string remain unresovled # with unique city names 6345 (number is not up to date) location string remain unresovled # with largest city name 4623 location string remain unresovled # with manually resolved locations 3333 location string remain unresovled import csv, json, re from geonamescache import GeonamesCache from loclists import check_unresolved unresolved_locations = [] commits_by_countries = {} countries_by_locstr = {} gc = GeonamesCache() countries = gc.get_countries() countries_by_names = gc.get_countries_by_names() us_states = gc.get_us_states() us_states_by_names = gc.get_us_states_by_names() re_ignore = re.compile(r'[\.\(\)\d-]') re_ws = re.compile(r'\s{2,}') def test_locs(locs): for loc in locs: loc = loc.strip().lower() loctitle = loc.title() locupper = loc.upper() if loc in countries_by_names: return loc elif loctitle in countries_by_names: return loctitle