def __init__(self):
    """Load every location resource and build the location hierarchy.

    Feature switches and resource file names are read from
    ``CarmenProperties``; resource paths are resolved against ``root``.
    After the known locations are loaded, each one is linked to its chain
    of ancestors in ``locationToParent`` / ``locationToChildren``.
    """
    # Name tables, filled by loadNameAndAbbreviation().
    self.stateFullNames = []
    self.countryFullNames = []
    self.stateAbbreviationToFullName = {}
    self.countryAbbreviationToFullName = {}
    self.placeNameToNormalizedPlaceName = {}

    # Location registry and hierarchy.
    self.idToLocation = {}
    self.locationToId = {}
    self.locationNameToLocation = {}
    self.locationToParent = {}
    self.locationToChildren = {}
    self.newLocationIndex = Constants.NEW_LOCATION_STARTING_INDEX

    self.geocodeLocationResolver = None
    # Captures the last word after the final comma ("City, ST" -> "ST").
    self.statePattern = ".+,\\s*(\\w+)"

    self.usePlace = CarmenProperties["use_place"]
    self.useGeocodes = CarmenProperties["use_geocodes"]
    self.useUserString = CarmenProperties["use_user_string"]
    self.useUnknownPlaces = CarmenProperties["use_unknown_places"]

    logger.info("Geocoding using these resources:")
    if self.usePlace:
        logger.info('place')
    if self.useGeocodes:
        logger.info('geocodes')
    if self.useUserString:
        logger.info("user profile")

    logger.info("Loading location resources.")
    self.loadLocationFile(os.path.join(root, CarmenProperties["locations"]))

    # Snapshot first: createParentOfLocation() may register new locations
    # in idToLocation while the loop below is running.
    knownLocations = list(self.idToLocation.values())
    self.idToLocation[-1] = Location.getNoneLocation()

    for location in knownLocations:
        parent = self.createParentOfLocation(location)
        if parent and not parent.isNone():
            self.locationToParent[location] = parent
        self.locationToChildren.setdefault(parent, []).append(location)

        # Walk up the chain of ancestors, linking each level to the next.
        child = parent
        parent = self.createParentOfLocation(child) if child else None
        while parent and not parent.isNone():
            self.locationToParent.setdefault(child, parent)
            self.locationToChildren.setdefault(parent, []).append(child)
            child = parent
            parent = self.createParentOfLocation(child)

    if self.usePlace:
        self.loadNameAndAbbreviation(
            os.path.join(root, CarmenProperties["place_name_mapping"]),
            None, self.placeNameToNormalizedPlaceName, False)
    # The state and country tables are read by both place-based and
    # user-string resolution, so load them when either one is enabled.
    if self.usePlace or self.useUserString:
        self.loadNameAndAbbreviation(
            os.path.join(root, CarmenProperties["state_names_file"]),
            self.stateFullNames, self.stateAbbreviationToFullName, True)
        self.loadNameAndAbbreviation(
            os.path.join(root, CarmenProperties["country_names_file"]),
            self.countryFullNames, self.countryAbbreviationToFullName,
            True)

    if self.useGeocodes:
        self.geocodeLocationResolver = GeocodeLocationResolver()
        for location in self.idToLocation.values():
            if location.latitude and location.longitude:
                self.geocodeLocationResolver.addLocation(location)
def __init__(self):
    """Initialise the resolver from ``CarmenProperties``.

    Loads the known locations, links every known location to its
    ancestors, loads the name/abbreviation tables needed by the enabled
    resolution methods and, when geocodes are enabled, fills the geocode
    resolver with every location that has coordinates.
    """
    self.stateFullNames = []
    self.countryFullNames = []
    self.stateAbbreviationToFullName = {}
    self.countryAbbreviationToFullName = {}
    self.geocodeLocationResolver = None
    self.newLocationIndex = Constants.NEW_LOCATION_STARTING_INDEX
    # Captures the last word after the final comma ("City, ST" -> "ST").
    self.statePattern = ".+,\\s*(\\w+)"
    self.placeNameToNormalizedPlaceName = {}
    self.idToLocation = {}
    self.locationNameToLocation = {}
    self.locationToParent = {}
    self.locationToChildren = {}
    self.locationToId = {}

    self.usePlace = CarmenProperties["use_place"]
    self.useGeocodes = CarmenProperties["use_geocodes"]
    self.useUserString = CarmenProperties["use_user_string"]
    self.useUnknownPlaces = CarmenProperties["use_unknown_places"]

    logger.info("Geocoding using these resources:")
    if self.usePlace:
        logger.info('place')
    if self.useGeocodes:
        logger.info('geocodes')
    if self.useUserString:
        logger.info("user profile")

    logger.info("Loading location resources.")
    self.loadLocationFile(
        os.path.join(root, CarmenProperties["locations"]))

    # Copy the values before iterating: createParentOfLocation() can add
    # entries to idToLocation through registerNewLocation().
    knownLocations = list(self.idToLocation.values())
    self.idToLocation[-1] = Location.getNoneLocation()

    for location in knownLocations:
        ancestor = self.createParentOfLocation(location)
        if ancestor and not ancestor.isNone():
            self.locationToParent[location] = ancestor
        self.locationToChildren.setdefault(ancestor, []).append(location)

        # Climb the hierarchy until there is no real parent left.
        current = ancestor
        ancestor = (self.createParentOfLocation(current)
                    if current else None)
        while ancestor and not ancestor.isNone():
            # Keep a parent link that an earlier chain already set.
            self.locationToParent.setdefault(current, ancestor)
            self.locationToChildren.setdefault(ancestor, []).append(
                current)
            current = ancestor
            ancestor = self.createParentOfLocation(current)

    if self.usePlace:
        self.loadNameAndAbbreviation(
            os.path.join(root, CarmenProperties["place_name_mapping"]),
            None, self.placeNameToNormalizedPlaceName, False)
    # Both place-based and user-string resolution read the state and
    # country tables, so load them when either method is switched on.
    if self.usePlace or self.useUserString:
        self.loadNameAndAbbreviation(
            os.path.join(root, CarmenProperties["state_names_file"]),
            self.stateFullNames, self.stateAbbreviationToFullName, True)
        self.loadNameAndAbbreviation(
            os.path.join(root, CarmenProperties["country_names_file"]),
            self.countryFullNames, self.countryAbbreviationToFullName,
            True)

    if self.useGeocodes:
        self.geocodeLocationResolver = GeocodeLocationResolver()
        for location in self.idToLocation.values():
            if location.latitude and location.longitude:
                self.geocodeLocationResolver.addLocation(location)
class LocationResolver(object):
    """Resolve tweets to ``Location`` objects.

    A tweet is resolved from, in order of preference, its ``place``
    object, its coordinates and the free-text location of the user
    profile.  Each source is switched on or off through
    ``CarmenProperties``.  The resolver also keeps the parent/child
    hierarchy of every location it has loaded or registered.

    Use :meth:`getLocationResolver` to obtain the shared instance.
    """

    # Lazily created shared instance, see getLocationResolver().
    resolver = None

    # ASCII punctuation, replaced by spaces when normalizing names.
    _PUNCTUATION_RE = re.compile(
        r"""[!"#$%&'()*+,\-./:;<=>?@\[\\\]^_`{|}~]""")
    # The same set without the comma, which separates "city, state" parts.
    _PUNCTUATION_EXCEPT_COMMA_RE = re.compile(
        r"""[!"#$%&'()*+\-./:;<=>?@\[\\\]^_`{|}~]""")
    _WHITESPACE_RE = re.compile(r"\s+")

    def __init__(self):
        """Load every location resource and build the location hierarchy.

        Feature switches and resource file names are read from
        ``CarmenProperties``; resource paths are resolved against ``root``.
        """
        # Name tables, filled by loadNameAndAbbreviation().
        self.stateFullNames = []
        self.countryFullNames = []
        self.stateAbbreviationToFullName = {}
        self.countryAbbreviationToFullName = {}
        self.placeNameToNormalizedPlaceName = {}

        # Location registry and hierarchy.
        self.idToLocation = {}
        self.locationToId = {}
        self.locationNameToLocation = {}
        self.locationToParent = {}
        self.locationToChildren = {}
        # Ids handed out by registerNewLocation() start here.
        self.newLocationIndex = Constants.NEW_LOCATION_STARTING_INDEX

        self.geocodeLocationResolver = None
        # Captures the last word after the final comma ("City, ST" -> "ST").
        self.statePattern = ".+,\\s*(\\w+)"

        self.usePlace = CarmenProperties["use_place"]
        self.useGeocodes = CarmenProperties["use_geocodes"]
        self.useUserString = CarmenProperties["use_user_string"]
        self.useUnknownPlaces = CarmenProperties["use_unknown_places"]

        logger.info("Geocoding using these resources:")
        if self.usePlace:
            logger.info('place')
        if self.useGeocodes:
            logger.info('geocodes')
        if self.useUserString:
            logger.info("user profile")

        logger.info("Loading location resources.")
        self.loadLocationFile(
            os.path.join(root, CarmenProperties["locations"]))

        # Snapshot first: createParentOfLocation() may register new
        # locations in idToLocation while the loop below is running.
        knownLocations = list(self.idToLocation.values())
        self.idToLocation[-1] = Location.getNoneLocation()

        for location in knownLocations:
            parent = self.createParentOfLocation(location)
            if parent and not parent.isNone():
                self.locationToParent[location] = parent
            self.locationToChildren.setdefault(parent, []).append(location)

            # Walk up the chain of ancestors, linking each level.
            child = parent
            parent = self.createParentOfLocation(child) if child else None
            while parent and not parent.isNone():
                self.locationToParent.setdefault(child, parent)
                self.locationToChildren.setdefault(parent, []).append(child)
                child = parent
                parent = self.createParentOfLocation(child)

        if self.usePlace:
            self.loadNameAndAbbreviation(
                os.path.join(root, CarmenProperties["place_name_mapping"]),
                None, self.placeNameToNormalizedPlaceName, False)
        # The state and country tables are read by both place-based and
        # user-string resolution, so load them when either is enabled.
        if self.usePlace or self.useUserString:
            self.loadNameAndAbbreviation(
                os.path.join(root, CarmenProperties["state_names_file"]),
                self.stateFullNames, self.stateAbbreviationToFullName, True)
            self.loadNameAndAbbreviation(
                os.path.join(root, CarmenProperties["country_names_file"]),
                self.countryFullNames, self.countryAbbreviationToFullName,
                True)

        if self.useGeocodes:
            self.geocodeLocationResolver = GeocodeLocationResolver()
            for location in self.idToLocation.values():
                if location.latitude and location.longitude:
                    self.geocodeLocationResolver.addLocation(location)

    @staticmethod
    def getLocationResolver():
        """Return the shared resolver, creating it on first use."""
        if LocationResolver.resolver is None:
            LocationResolver.resolver = LocationResolver()
        return LocationResolver.resolver

    @staticmethod
    def _normalizeName(text, punctuationPattern):
        """Replace punctuation with spaces and collapse whitespace runs."""
        spaced = punctuationPattern.sub(" ", text)
        return LocationResolver._WHITESPACE_RE.sub(" ", spaced)

    def createParentOfLocation(self, location):
        """Return the location one level above *location*.

        City -> county level, county -> state level, state -> country
        level, country -> the "none" location.  The parent is registered
        if it is not known yet.  Returns ``None`` when *location* has no
        parent.
        """
        parentLocation = None
        if location.getCity():
            parentLocation = Location(location.getCountry(),
                                      location.getState(),
                                      location.getCounty(), None, None, None,
                                      -1, False)
        elif location.getCounty():
            # Was getCountry(): that made a state-level location its own
            # parent and left the country branch below unreachable.
            parentLocation = Location(location.getCountry(),
                                      location.getState(), None, None, None,
                                      None, -1, False)
        elif location.getState():
            parentLocation = Location(location.getCountry(), None, None, None,
                                      None, None, -1, False)
        elif (location.getCountry() and location.getCountry().lower() !=
              Constants.DS_LOCATION_NONE.lower()):
            parentLocation = Location.getNoneLocation()

        if not parentLocation:
            return None
        if parentLocation in self.locationToId:
            # Hand back the canonical, already registered instance.
            return self.idToLocation[self.locationToId[parentLocation]]
        self.registerNewLocation(parentLocation)
        return parentLocation

    def getPlaceFromTweet(self, tweet):
        """Return the tweet's ``place`` entry, or ``None`` if absent."""
        return tweet.get('place')

    @staticmethod
    def getUserFromTweet(tweet):
        """Return the tweet's ``user`` entry, or ``None`` if absent."""
        return tweet.get('user')

    @staticmethod
    def getLocationFromTweet(tweet):
        """Return the non-empty profile location of the tweet's user."""
        user = LocationResolver.getUserFromTweet(tweet)
        if user:
            location = lookup(user, 'location')
            if location:
                return location
        return None

    @staticmethod
    def getLatLngFromTweet(tweet):
        """Return whatever ``Utils.geo_check_tweet`` finds for *tweet*."""
        return Utils.geo_check_tweet(tweet)

    def loadNameAndAbbreviation(self, filename, fullName, abbreviations,
                                secondColumnKey):
        """Load a two-column, tab-separated name table.

        Every line is lower-cased and both columns are stripped.

        :param filename: path of the file to read.
        :param fullName: list that receives the first column of every
            line, or ``None``.
        :param abbreviations: dict that receives the column mapping, or
            ``None``.
        :param secondColumnKey: when true the mapping is second column ->
            first column, otherwise first column -> second column.
        """
        with open(filename) as handle:
            for line in handle:
                if not line.strip():
                    continue  # tolerate blank lines
                columns = line.lower().split('\t')
                if len(columns) < 2:
                    logger.warn("Skipping malformed line in {}: {}".format(
                        filename, line.strip()))
                    continue
                first = columns[0].strip()
                second = columns[1].strip()
                if fullName is not None:
                    fullName.append(first)
                if abbreviations is not None:
                    if secondColumnKey:
                        abbreviations[second] = first
                    else:
                        abbreviations[first] = second

    def setUseUnknownPlaces(self, useUnknownPlaces):
        """Set whether places missing from the location file are kept."""
        self.useUnknownPlaces = useUnknownPlaces

    def resolveLocationFromTweet(self, tweet):
        """Resolve *tweet* with the enabled methods, in priority order.

        Order: place, coordinates, user profile string.  The resolution
        method is recorded on the returned location.  Returns ``None``
        when no enabled method yields a location.
        """
        location = None
        if self.usePlace:
            location = self.resolveLocationUsingPlace(tweet)
            if location:
                location.setResolutionMethod(ResolutionMethod.PLACE)
                if not location.isKnownLocation():
                    if self.useUnknownPlaces:
                        self.registerNewLocation(location)
                    else:
                        location = None

        if not location and self.useGeocodes:
            location = self.resolveLocationUsingGeocodes(tweet)
            if location:
                location.setResolutionMethod(ResolutionMethod.COORDINATES)

        if not location and self.useUserString:
            location = self.resolveLocationUsingUserLocation(tweet)
            if location:
                location.setResolutionMethod(ResolutionMethod.USER_LOCATION)

        return location

    def isUseUnknownPlaces(self):
        """Return whether places missing from the location file are kept."""
        return self.useUnknownPlaces

    def resolveLocationUsingPlace(self, tweet):
        """Build a location from the tweet's ``place`` object.

        Returns ``None`` when the tweet has no place, when the place lacks
        a country, a place type or a required ``full_name``, or when the
        place type is not recognised.
        """
        place = self.getPlaceFromTweet(tweet)
        if place is None:
            return None

        url = lookup(place, 'url')
        placeId = lookup(place, 'id')
        country = lookup(place, 'country')
        if country is None:
            logger.warn("Found place with no country: {}".format(place))
            return None
        # The mapping is keyed by lower-cased names because
        # loadNameAndAbbreviation() lower-cases every line it reads.
        country = self.placeNameToNormalizedPlaceName.get(
            country.lower(), country)

        placeType = lookup(place, 'place_type')
        if not placeType:
            logger.warn("Unknown place type: {}".format(placeType))
            return None
        placeType = placeType.lower()

        if placeType == 'city':
            city = lookup(place, 'name')
            if country.lower() != 'united states':
                return self.getLocationForPlace(country, None, None, city,
                                                url, placeId)
            fullName = lookup(place, 'full_name')
            if not fullName:
                logger.warn(
                    "Found place with no full_name: {}".format(place))
                return None
            state = None
            match = re.search(self.statePattern, fullName)
            if match:
                # Group 1 is the token after the last comma.
                state = self.extract_state(match.group(1).lower())
            return self.getLocationForPlace(country, state, None, city, url,
                                            placeId)

        if placeType == 'admin':
            state = lookup(place, 'name')
            return self.getLocationForPlace(country, state, None, None, url,
                                            placeId)

        if placeType == 'country':
            return self.getLocationForPlace(country, None, None, None, url,
                                            placeId)

        if placeType in ('neighborhood', 'poi'):
            fullName = lookup(place, 'full_name')
            if not fullName:
                logger.warn(
                    "Found place with no full_name: {}".format(place))
                return None
            parts = fullName.split(',')
            # "Neighborhood, City": the city is the second part.
            city = parts[1].strip() if len(parts) > 1 else None
            return self.getLocationForPlace(country, None, None, city, url,
                                            placeId)

        logger.warn("Unknown place type: {}".format(placeType))
        return None

    def resolveLocationUsingGeocodes(self, tweet):
        """Delegate to the geocode resolver created in ``__init__``."""
        return self.geocodeLocationResolver.resolveLocation(tweet)

    def extract_state(self, matchedString):
        """Return the full state name for an abbreviation or full name.

        Returns ``None`` when *matchedString* is neither a key nor a value
        of ``stateAbbreviationToFullName``.
        """
        if matchedString in self.stateAbbreviationToFullName:
            return self.stateAbbreviationToFullName[matchedString]
        if matchedString in self.stateAbbreviationToFullName.values():
            return matchedString
        return None

    def extract_country(self, matchedString):
        """Return the full country name for an abbreviation or full name.

        Returns ``None`` when *matchedString* is neither a key nor a value
        of ``countryAbbreviationToFullName``.
        """
        if matchedString in self.countryAbbreviationToFullName:
            return self.countryAbbreviationToFullName[matchedString]
        if matchedString in self.countryAbbreviationToFullName.values():
            return matchedString
        return None

    def resolveLocationUsingUserLocation(self, tweet):
        """Resolve the free-text location of the tweet's user profile.

        The text is first looked up as a known alias with all punctuation
        removed.  Failing that, it is split on commas and the parts are
        matched against the state and country tables.  Returns ``None``
        when nothing matches.
        """
        tweetLocation = Utils.getLocationFromTweet(tweet)
        if not tweetLocation:
            return None

        location = self._normalizeName(
            tweetLocation, self._PUNCTUATION_RE).lower().strip()
        if location in self.locationNameToLocation:
            return self.locationNameToLocation[location]

        # Second attempt keeps the commas so the parts can be told apart.
        locationWithComma = self._normalizeName(
            tweetLocation, self._PUNCTUATION_EXCEPT_COMMA_RE).lower().strip()
        match = re.search(".+,\\s*(\\w+)", locationWithComma)
        if not match:
            return None

        matchedString = match.group().lower()
        if matchedString in self.locationNameToLocation:
            return self.locationNameToLocation[matchedString]

        items = [item.strip() for item in matchedString.split(',')]
        if len(items) == 2:
            state = self.extract_state(items[1])
            if state:
                # NOTE(review): the first part is passed in the county
                # slot (see the argument order in getLocationForPlace) -
                # confirm that city was not intended.
                return Location('united states', state, items[0], None, None,
                                None, -1, False)
            co = self.extract_country(items[1])
            if co:
                return Location(co, None, None, None, None, None, -1, False)
            # Neither a state nor a country: do not return an empty location.
            return None

        # Three or more parts: the third is tried as country, then as state.
        co = self.extract_country(items[2])
        if co:
            state = self.extract_state(items[1])
            if state:
                return Location(co, state, items[0], None, None, None, -1,
                                False)
            return Location(co, None, None, None, None, None, -1, False)
        state = self.extract_state(items[2])
        if state:
            return Location('united states', state, None, None, None, None,
                            -1, False)
        return None

    def getLocationForId(self, id):
        """Return the location registered under *id*, or ``None``."""
        return self.idToLocation.get(id)

    def loadLocationToIdFile(self, filename):
        """Read a tab-separated "id, name, name, ..." file.

        Returns a dict mapping every lower-cased, stripped column
        (including the id column itself) to the integer id of its line.
        A warning is logged when an entry is seen with two different ids;
        the later id wins.
        """
        entryToId = {}
        with open(filename) as handle:
            for line in handle:
                if not line.strip():
                    continue  # tolerate blank lines
                columns = line.lower().split('\t')
                locationId = int(columns[0].strip())
                for column in columns:
                    entry = column.strip()
                    if entryToId.get(entry, locationId) != locationId:
                        logger.warn(
                            'Duplicate location found: {}'.format(entry))
                    entryToId[entry] = locationId
        return entryToId

    def loadLocationFile(self, filename):
        """Load known locations from a file with one JSON object per line.

        Each location is registered by id, and every alias, plus its
        punctuation-free form, is mapped to the location.  An alias that
        already belongs to another location is kept and a warning logged.
        """
        with open(filename) as handle:
            for line in handle:
                if not line.strip():
                    continue  # tolerate blank lines
                locationObj = json.loads(line)
                location = Location.parseLocationFromJsonObj(locationObj)
                self.idToLocation[location.getId()] = location
                self.locationToId[location] = location.getId()

                # Names already handled for this location.
                seen = set()
                for alias in locationObj['aliases']:
                    # Register the alias, then its punctuation-free form,
                    # stripped the same way lookups strip their key.
                    plain = self._normalizeName(
                        alias, self._PUNCTUATION_RE).strip()
                    for name in (alias, plain):
                        if not name or name in seen:
                            continue
                        seen.add(name)
                        if name in self.locationNameToLocation:
                            logger.warn(
                                "Duplicate location name: {}".format(name))
                        else:
                            self.locationNameToLocation[name] = location

    def getLocationForPlace(self, country, state, county, city, url, id):
        """Return the known location matching the fields, or a new one.

        A new location is created with latitude and longitude 0 and id -1,
        and gets *url* and the Twitter *id* attached; it is not registered.
        """
        location = Location(country, state, county, city, 0, 0, -1, False)
        if location in self.locationToId:
            return self.idToLocation[self.locationToId[location]]
        location.setUrl(url)
        location.setTwitterId(id)
        return location

    def registerNewLocation(self, location):
        """Assign the next free id to *location* and link it to its parent."""
        index = self.newLocationIndex
        self.newLocationIndex += 1
        location.setId(index)
        self.locationToId[location] = index
        self.idToLocation[index] = location

        parent = self.createParentOfLocation(location)
        if parent:
            self.locationToParent[location] = parent
        self.locationToChildren.setdefault(parent, []).append(location)

    def getParent(self, location):
        """Return the recorded parent of *location*, or ``None``."""
        return self.locationToParent.get(location)

    def getChildren(self, location):
        """Return the recorded children of *location*, or ``None``."""
        return self.locationToChildren.get(location)

    def lookupLocation(self, location):
        """Return the registered equal of *location*, registering it if new."""
        if location in self.locationToId:
            return self.idToLocation[self.locationToId[location]]
        self.registerNewLocation(location)
        return location
class LocationResolver(object):
    """Map tweets to ``Location`` objects.

    Three sources are tried in turn - the tweet's ``place`` object, its
    coordinates and the free-text location of the user profile - each one
    enabled or disabled through ``CarmenProperties``.  The resolver also
    tracks the parent and children of every location it has loaded or
    registered.

    Obtain the shared instance through :meth:`getLocationResolver`.
    """

    # Shared instance, created on first call of getLocationResolver().
    resolver = None

    # ASCII punctuation, replaced by spaces when normalizing names.
    _PUNCTUATION_RE = re.compile(
        r"""[!"#$%&'()*+,\-./:;<=>?@\[\\\]^_`{|}~]""")
    # The same set minus the comma, which separates "city, state" parts.
    _PUNCTUATION_EXCEPT_COMMA_RE = re.compile(
        r"""[!"#$%&'()*+\-./:;<=>?@\[\\\]^_`{|}~]""")
    _WHITESPACE_RE = re.compile(r"\s+")

    def __init__(self):
        """Initialise the resolver from ``CarmenProperties``.

        Loads the known locations, links each of them to its ancestors,
        loads the name tables needed by the enabled resolution methods
        and, when geocodes are enabled, fills the geocode resolver.
        """
        self.stateFullNames = []
        self.countryFullNames = []
        self.stateAbbreviationToFullName = {}
        self.countryAbbreviationToFullName = {}
        self.geocodeLocationResolver = None
        # First id that registerNewLocation() hands out.
        self.newLocationIndex = Constants.NEW_LOCATION_STARTING_INDEX
        # Captures the last word after the final comma ("City, ST" -> "ST").
        self.statePattern = ".+,\\s*(\\w+)"
        self.placeNameToNormalizedPlaceName = {}
        self.idToLocation = {}
        self.locationNameToLocation = {}
        self.locationToParent = {}
        self.locationToChildren = {}
        self.locationToId = {}

        self.usePlace = CarmenProperties["use_place"]
        self.useGeocodes = CarmenProperties["use_geocodes"]
        self.useUserString = CarmenProperties["use_user_string"]
        self.useUnknownPlaces = CarmenProperties["use_unknown_places"]

        logger.info("Geocoding using these resources:")
        if self.usePlace:
            logger.info('place')
        if self.useGeocodes:
            logger.info('geocodes')
        if self.useUserString:
            logger.info("user profile")

        logger.info("Loading location resources.")
        self.loadLocationFile(
            os.path.join(root, CarmenProperties["locations"]))

        # Copy the values before iterating: createParentOfLocation() can
        # add entries to idToLocation through registerNewLocation().
        knownLocations = list(self.idToLocation.values())
        self.idToLocation[-1] = Location.getNoneLocation()

        for location in knownLocations:
            ancestor = self.createParentOfLocation(location)
            if ancestor and not ancestor.isNone():
                self.locationToParent[location] = ancestor
            self.locationToChildren.setdefault(ancestor, []).append(location)

            # Climb the hierarchy until there is no real parent left.
            current = ancestor
            ancestor = (self.createParentOfLocation(current)
                        if current else None)
            while ancestor and not ancestor.isNone():
                # Keep a parent link that an earlier chain already set.
                self.locationToParent.setdefault(current, ancestor)
                self.locationToChildren.setdefault(ancestor, []).append(
                    current)
                current = ancestor
                ancestor = self.createParentOfLocation(current)

        if self.usePlace:
            self.loadNameAndAbbreviation(
                os.path.join(root, CarmenProperties["place_name_mapping"]),
                None, self.placeNameToNormalizedPlaceName, False)
        # Both place-based and user-string resolution read the state and
        # country tables, so load them when either method is switched on.
        if self.usePlace or self.useUserString:
            self.loadNameAndAbbreviation(
                os.path.join(root, CarmenProperties["state_names_file"]),
                self.stateFullNames, self.stateAbbreviationToFullName, True)
            self.loadNameAndAbbreviation(
                os.path.join(root, CarmenProperties["country_names_file"]),
                self.countryFullNames, self.countryAbbreviationToFullName,
                True)

        if self.useGeocodes:
            self.geocodeLocationResolver = GeocodeLocationResolver()
            for location in self.idToLocation.values():
                if location.latitude and location.longitude:
                    self.geocodeLocationResolver.addLocation(location)

    @staticmethod
    def getLocationResolver():
        """Return the shared resolver instance, building it if needed."""
        if LocationResolver.resolver is None:
            LocationResolver.resolver = LocationResolver()
        return LocationResolver.resolver

    @staticmethod
    def _normalizeName(text, punctuationPattern):
        """Turn punctuation into spaces, then squeeze whitespace runs."""
        return LocationResolver._WHITESPACE_RE.sub(
            " ", punctuationPattern.sub(" ", text))

    def createParentOfLocation(self, location):
        """Return the next location up the hierarchy from *location*.

        The levels are city, county, state, country and finally the "none"
        location.  An unknown parent is registered before it is returned.
        Returns ``None`` when *location* has no parent.
        """
        country = location.getCountry()
        parentLocation = None
        if location.getCity():
            parentLocation = Location(country, location.getState(),
                                      location.getCounty(), None, None, None,
                                      -1, False)
        elif location.getCounty():
            # Was getCountry(): a state-level location then became its own
            # parent and the country branch below could never run.
            parentLocation = Location(country, location.getState(), None,
                                      None, None, None, -1, False)
        elif location.getState():
            parentLocation = Location(country, None, None, None, None, None,
                                      -1, False)
        elif (country and
              country.lower() != Constants.DS_LOCATION_NONE.lower()):
            parentLocation = Location.getNoneLocation()

        if not parentLocation:
            return None
        if parentLocation in self.locationToId:
            # Prefer the instance that is already registered.
            return self.idToLocation[self.locationToId[parentLocation]]
        self.registerNewLocation(parentLocation)
        return parentLocation

    def getPlaceFromTweet(self, tweet):
        """Return ``tweet['place']``, or ``None`` when the key is missing."""
        return tweet.get('place')

    def getUserFromTweet(self, tweet):
        """Return ``tweet['user']``, or ``None`` when the key is missing."""
        return tweet.get('user')

    @staticmethod
    def getLocationFromTweet(tweet):
        """Return the non-empty profile location of the tweet's user."""
        user = tweet.get('user')
        if user:
            location = lookup(user, 'location')
            if location:
                return location
        return None

    @staticmethod
    def getLatLngFromTweet(tweet):
        """Return the result of ``Utils.geo_check_tweet`` for *tweet*."""
        return Utils.geo_check_tweet(tweet)

    def loadNameAndAbbreviation(self, filename, fullName, abbreviations,
                                secondColumnKey):
        """Read a tab-separated table of names and abbreviations.

        Lines are lower-cased and the first two columns stripped.

        :param filename: path of the table.
        :param fullName: list extended with each first column, or ``None``.
        :param abbreviations: dict filled with the mapping, or ``None``.
        :param secondColumnKey: true to key the mapping by the second
            column, false to key it by the first.
        """
        with open(filename) as handle:
            for line in handle:
                if not line.strip():
                    continue  # tolerate blank lines
                columns = line.lower().split('\t')
                if len(columns) < 2:
                    logger.warn("Skipping malformed line in {}: {}".format(
                        filename, line.strip()))
                    continue
                name, other = columns[0].strip(), columns[1].strip()
                if fullName is not None:
                    fullName.append(name)
                if abbreviations is None:
                    continue
                if secondColumnKey:
                    abbreviations[other] = name
                else:
                    abbreviations[name] = other

    def setUseUnknownPlaces(self, useUnknownPlaces):
        """Choose whether places absent from the location file are kept."""
        self.useUnknownPlaces = useUnknownPlaces

    def resolveLocationFromTweet(self, tweet):
        """Resolve *tweet*, trying place, coordinates, then user string.

        Only enabled methods are tried.  The method that succeeded is
        recorded on the location.  Returns ``None`` if all of them fail.
        """
        location = None
        if self.usePlace:
            location = self.resolveLocationUsingPlace(tweet)
            if location:
                location.setResolutionMethod(ResolutionMethod.PLACE)
                if not location.isKnownLocation():
                    if self.useUnknownPlaces:
                        self.registerNewLocation(location)
                    else:
                        location = None

        if not location and self.useGeocodes:
            location = self.resolveLocationUsingGeocodes(tweet)
            if location:
                location.setResolutionMethod(ResolutionMethod.COORDINATES)

        if not location and self.useUserString:
            location = self.resolveLocationUsingUserLocation(tweet)
            if location:
                location.setResolutionMethod(ResolutionMethod.USER_LOCATION)

        return location

    def isUseUnknownPlaces(self):
        """Tell whether places absent from the location file are kept."""
        return self.useUnknownPlaces

    def resolveLocationUsingPlace(self, tweet):
        """Derive a location from the ``place`` object of *tweet*.

        Returns ``None`` for a tweet without a place, for a place without
        country, place type or a required ``full_name``, and for an
        unrecognised place type.
        """
        place = self.getPlaceFromTweet(tweet)
        if place is None:
            return None

        url = lookup(place, 'url')
        placeId = lookup(place, 'id')
        country = lookup(place, 'country')
        if country is None:
            logger.warn("Found place with no country: {}".format(place))
            return None
        # Keys of the mapping are lower case: loadNameAndAbbreviation()
        # lower-cases every line it reads.
        country = self.placeNameToNormalizedPlaceName.get(
            country.lower(), country)

        placeType = lookup(place, 'place_type')
        if not placeType:
            logger.warn("Unknown place type: {}".format(placeType))
            return None
        placeType = placeType.lower()

        if placeType == 'city':
            city = lookup(place, 'name')
            if country.lower() != 'united states':
                return self.getLocationForPlace(country, None, None, city,
                                                url, placeId)
            fullName = lookup(place, 'full_name')
            if not fullName:
                logger.warn(
                    "Found place with no full_name: {}".format(place))
                return None
            state = None
            match = re.search(self.statePattern, fullName)
            if match:
                # Group 1 holds the token after the last comma.
                state = self.extract_state(match.group(1).lower())
            return self.getLocationForPlace(country, state, None, city, url,
                                            placeId)

        if placeType == 'admin':
            return self.getLocationForPlace(country, lookup(place, 'name'),
                                            None, None, url, placeId)

        if placeType == 'country':
            return self.getLocationForPlace(country, None, None, None, url,
                                            placeId)

        if placeType in ('neighborhood', 'poi'):
            fullName = lookup(place, 'full_name')
            if not fullName:
                logger.warn(
                    "Found place with no full_name: {}".format(place))
                return None
            parts = fullName.split(',')
            # "Neighborhood, City": the second part is the city.
            city = parts[1].strip() if len(parts) > 1 else None
            return self.getLocationForPlace(country, None, None, city, url,
                                            placeId)

        logger.warn("Unknown place type: {}".format(placeType))
        return None

    def resolveLocationUsingGeocodes(self, tweet):
        """Ask the geocode resolver built in ``__init__`` for a location."""
        return self.geocodeLocationResolver.resolveLocation(tweet)

    def extract_state(self, matchedString):
        """Map a state abbreviation or full name to the full name.

        Returns ``None`` if *matchedString* is neither a key nor a value
        of ``stateAbbreviationToFullName``.
        """
        table = self.stateAbbreviationToFullName
        if matchedString in table:
            return table[matchedString]
        if matchedString in table.values():
            return matchedString
        return None

    def extract_country(self, matchedString):
        """Map a country abbreviation or full name to the full name.

        Returns ``None`` if *matchedString* is neither a key nor a value
        of ``countryAbbreviationToFullName``.
        """
        table = self.countryAbbreviationToFullName
        if matchedString in table:
            return table[matchedString]
        if matchedString in table.values():
            return matchedString
        return None

    def resolveLocationUsingUserLocation(self, tweet):
        """Resolve the free-text profile location of the tweet's user.

        First the punctuation-free text is looked up as a known alias.
        Otherwise the text is split on commas and its parts are matched
        against the state and country tables.  Returns ``None`` when
        nothing matches.
        """
        tweetLocation = Utils.getLocationFromTweet(tweet)
        if not tweetLocation:
            return None

        location = self._normalizeName(
            tweetLocation, self._PUNCTUATION_RE).lower().strip()
        if location in self.locationNameToLocation:
            return self.locationNameToLocation[location]

        # Keep the commas this time so the parts can be separated.
        locationWithComma = self._normalizeName(
            tweetLocation, self._PUNCTUATION_EXCEPT_COMMA_RE).lower().strip()
        match = re.search(".+,\\s*(\\w+)", locationWithComma)
        if not match:
            return None

        matchedString = match.group().lower()
        if matchedString in self.locationNameToLocation:
            return self.locationNameToLocation[matchedString]

        items = [item.strip() for item in matchedString.split(',')]
        if len(items) == 2:
            state = self.extract_state(items[1])
            if state:
                # NOTE(review): the first part lands in the county slot
                # (compare the argument order in getLocationForPlace) -
                # confirm that city was not intended.
                return Location('united states', state, items[0], None, None,
                                None, -1, False)
            co = self.extract_country(items[1])
            if co:
                return Location(co, None, None, None, None, None, -1, False)
            # No state and no country: avoid returning an empty location.
            return None

        # Three or more parts: try the third as country, then as state.
        co = self.extract_country(items[2])
        if co:
            state = self.extract_state(items[1])
            if state:
                return Location(co, state, items[0], None, None, None, -1,
                                False)
            return Location(co, None, None, None, None, None, -1, False)
        state = self.extract_state(items[2])
        if state:
            return Location('united states', state, None, None, None, None,
                            -1, False)
        return None

    def getLocationForId(self, id):
        """Return the location stored under *id*, or ``None``."""
        return self.idToLocation.get(id)

    def loadLocationToIdFile(self, filename):
        """Parse a tab-separated file of "id, name, name, ..." lines.

        Returns a dict from each lower-cased, stripped column (the id
        column included) to the integer id of its line.  An entry seen
        with two different ids is logged and the later id is kept.
        """
        entryToId = {}
        with open(filename) as handle:
            for line in handle:
                if not line.strip():
                    continue  # tolerate blank lines
                columns = [c.strip() for c in line.lower().split('\t')]
                locationId = int(columns[0])
                for entry in columns:
                    if entryToId.get(entry, locationId) != locationId:
                        logger.warn(
                            'Duplicate location found: {}'.format(entry))
                    entryToId[entry] = locationId
        return entryToId

    def loadLocationFile(self, filename):
        """Load the known locations, one JSON object per line.

        Every location is stored by id; each alias and its
        punctuation-free form are mapped to the location.  A name already
        owned by another location keeps its owner and a warning is logged.
        """
        with open(filename) as handle:
            for line in handle:
                if not line.strip():
                    continue  # tolerate blank lines
                locationObj = json.loads(line)
                location = Location.parseLocationFromJsonObj(locationObj)
                self.idToLocation[location.getId()] = location
                self.locationToId[location] = location.getId()

                # Names already processed for this location.
                handled = set()
                for alias in locationObj['aliases']:
                    # The punctuation-free form is stripped the same way
                    # lookups strip their key.
                    plain = self._normalizeName(
                        alias, self._PUNCTUATION_RE).strip()
                    for name in (alias, plain):
                        if not name or name in handled:
                            continue
                        handled.add(name)
                        if name in self.locationNameToLocation:
                            logger.warn(
                                "Duplicate location name: {}".format(name))
                        else:
                            self.locationNameToLocation[name] = location

    def getLocationForPlace(self, country, state, county, city, url, id):
        """Return the known location with these fields, else a fresh one.

        The fresh location has latitude and longitude 0 and id -1, carries
        *url* and the Twitter *id*, and is not registered.
        """
        location = Location(country, state, county, city, 0, 0, -1, False)
        if location in self.locationToId:
            return self.idToLocation[self.locationToId[location]]
        location.setUrl(url)
        location.setTwitterId(id)
        return location

    def registerNewLocation(self, location):
        """Give *location* the next free id and link it to its parent."""
        index = self.newLocationIndex
        self.newLocationIndex += 1
        location.setId(index)
        self.locationToId[location] = index
        self.idToLocation[index] = location

        parent = self.createParentOfLocation(location)
        if parent:
            self.locationToParent[location] = parent
        self.locationToChildren.setdefault(parent, []).append(location)

    def getParent(self, location):
        """Return the parent recorded for *location*, or ``None``."""
        return self.locationToParent.get(location)

    def getChildren(self, location):
        """Return the children recorded for *location*, or ``None``."""
        return self.locationToChildren.get(location)

    def lookupLocation(self, location):
        """Return the registered equal of *location*; register it if new."""
        if location in self.locationToId:
            return self.idToLocation[self.locationToId[location]]
        self.registerNewLocation(location)
        return location