def fct_geoparse(input_url):
    """Download a text resource, extract the places it mentions and save them.

    The body behind ``input_url`` is read and decoded as UTF-8, then scanned
    with ``GeoText``. The detected cities and country mentions are written as
    JSON to ``cities.<name>`` and ``country_mentions.<name>`` in the current
    working directory, where ``<name>`` is the last ``/``-separated segment of
    ``input_url``.

    Args:
        input_url: URL of the text to analyse (a ``str``).

    Returns:
        A ``(cities, country_mentions)`` tuple taken from the ``GeoText``
        object.
    """
    # Use the response as a context manager so the connection is always
    # closed, even when reading or decoding fails.
    # NOTE(review): assumes ``request`` is ``urllib.request`` - confirm.
    with request.urlopen(input_url) as response:
        merge_answers = response.read().decode('utf8')

    # GeoText(merge_answers) -> geotext.geotext.GeoText object
    # (also see spacy, geograpy3). Scan the text once and reuse the result.
    geo = GeoText(merge_answers)
    cities = geo.cities  # case-sensitive [list]
    country_mentions = geo.country_mentions  # OrderedDict([('...', ...), ('...', ...)])

    geo_file = input_url.split('/')[-1]
    outputs = (('cities', cities), ('country_mentions', country_mentions))
    for prefix, payload in outputs:
        out_path = f'{os.getcwd()}/{prefix}.{geo_file}'
        os.makedirs(os.path.dirname(out_path), exist_ok=True)
        with open(out_path, 'w') as open_file:
            json.dump(payload, open_file, indent=2, separators=(',', ': '))

    print('*** fct_geoparse ' + str(datetime.now()) + ' ***')
    return cities, country_mentions
def retrieve_country(location_description):
    """
    retrieve_country

    Match a plain-text location description to a country using the geotext
    library. The first key of geotext's ``country_mentions`` mapping is taken
    as the best match.

    INPUTS:
        location_description - free-text description of a location
    OUTPUTS:
        country_code - first country code reported by geotext (documented
            upstream as a 2-character ISO code), or None when the description
            is empty/None or no country is recognised
    """
    if location_description in ["", None]:
        return None

    mentions = GeoText(location_description).country_mentions
    # Iterating the mapping yields its keys in order; the first one wins.
    for country_code in mentions:
        return country_code
    return None
def get_geolocation(self, tweet_object: Dict):
    """Collect candidate geolocations for one tweet from several sources.

    Args:
        tweet_object: mapping that provides the keys ``"text"``,
            ``"coordinates"``, ``"place"`` and ``"user_location"``.
            ``"coordinates"`` and ``"place"`` hold either the literal string
            ``'None'`` or a single-quoted JSON-like string.

    Returns:
        dict with the list-valued keys ``"tweet"``, ``"place"``,
        ``"content_cities"``, ``"content_countries"``, ``"user_cities"`` and
        ``"user_countries"``. Sources that yield nothing stay empty lists.

    Errors raised while parsing the place, the text or the user location are
    printed and otherwise ignored (best effort).
    """
    # NOTE(review): the original indentation was lost. The three fallback
    # sections below are assumed to all live in the ``else`` branch, i.e. they
    # only run when the tweet carries no explicit coordinates - confirm.
    text = tweet_object["text"]
    locations = {"tweet": [], "place": [], "content_cities": [],
                 "content_countries": [], "user_cities": [],
                 "user_countries": []}
    # Missing values are encoded as the string 'None', not the None object.
    if tweet_object["coordinates"] != 'None':
        # Swap single for double quotes so the string parses as JSON.
        locations ["tweet"] += [json.loads(tweet_object["coordinates"].replace("'", '"'))["coordinates"]]
    else:
        # get geolocation from place
        if tweet_object["place"] != 'None':
            try:
                place = json.loads(tweet_object["place"].replace("'", '"'))
                shp = shape(place["bounding_box"])
                # A point inside the bounding box stands in for the place.
                x, y = self._random_point_in_shp(shp)
                locations["place"] += [[x,y]]
            except Exception as e:
                print(f"Error while parsing geolocation from place: {e}")
                pass
        # get geolocation from tweet
        try:
            # Drop hashtag markers and apply the camel_to_ws pattern
            # (presumably splits CamelCase words - confirm) before scanning.
            text = text.replace("#", "")
            text = self.camel_to_ws.sub(' ', text)
            places = GeoText(text)
            if places.cities:
                for place in places.cities:
                    coordinates = self._get_location_coordinates(place)
                    if coordinates:
                        # ``+=`` extends the list with the items of ``coordinates``.
                        locations["content_cities"] += coordinates
            if places.countries:
                for place in places.countries:
                    coordinates = self._get_country_coordinates(place)
                    if coordinates:
                        locations["content_countries"] += coordinates
        except Exception as e:
            print(f"Error while parsing geolocation from text: {e}")
            pass
        # get geolocation from user location
        if tweet_object["user_location"]:
            try:
                places = GeoText(tweet_object["user_location"])
                if places.cities:
                    for place in places.cities:
                        coordinates = self._get_location_coordinates(place)
                        if coordinates:
                            locations["user_cities"] += coordinates
                if places.countries:
                    # Redundant reset: the list is still empty at this point.
                    locations["user_countries"] = []
                    for place in places.countries:
                        coordinates = self._get_country_coordinates(place)
                        if coordinates:
                            locations["user_countries"] += coordinates
            except Exception as e:
                print(
                    f"Error while parsing geolocation from user location: {e}")
                pass
    return locations
def get_fixed_locations(self):
    """Map every short location label to a cleaned city/country pair.

    The labels are the ``"labelShort"`` values of ``self.original``. Each
    label is scanned with GeoText once, and the first detected city and
    country become the initial guess. A fixed list of manual corrections is
    then applied in order.

    Returns:
        dict mapping each label to ``{"city": ..., "country": ...}``; either
        value may be None when nothing was detected or corrected.
    """
    labels = set(self.original[key]["labelShort"] for key in self.original)
    # Parse each label once; the results are reused for every lookup below.
    parsed = {label: GeoText(label) for label in labels}

    countries = {geo.countries[0] for geo in parsed.values() if geo.countries}
    cities = {geo.cities[0] for geo in parsed.values() if geo.cities}
    # Names that force the country to "USA" (the list also contains states).
    us_cities = [
        "California",
        "Chicago",
        "Clifton",
        "Dallas",
        "Denver",
        "Dulles",
        "Lincoln",
        "Los Angeles",
        "New York",
        "Oregon",
        "Orlando",
        "Phoenix",
        "San Francisco",
        "Virginia",
    ]
    # Text before the first comma that GeoText knows neither as a city nor as
    # a country is treated as a city name GeoText missed.
    heads = {label.split(",")[0] for label in labels}
    missing_cities = {head for head in heads if head not in cities} - countries

    fix = {}
    for label, geo in parsed.items():
        fix[label] = {
            "city": geo.cities[0] if geo.cities else None,
            "country": geo.countries[0] if geo.countries else None,
        }

    # Cities whose country is assigned by hand.
    country_by_city = {
        "Amsterdam": "Netherlands",
        "Brussels": "Belgium",
        "Dubai": "UAE",
        "Seoul": "South Korea",
    }
    for label, entry in fix.items():
        for candidate in missing_cities:
            if candidate in label:
                entry["city"] = candidate
        if "UK" in label:
            entry["country"] = "UK"
        if entry["city"] in country_by_city:
            entry["country"] = country_by_city[entry["city"]]
        if entry["city"] == "Virginia USA":
            entry["city"] = "Virginia"
        # City states: reuse the detected country name as the city.
        if entry["country"] == "Singapore":
            entry["city"] = "Singapore"
        if entry["country"] == "Hong Kong":
            entry["city"] = "Hong Kong"
            entry["country"] = "China"
        if entry["city"] in us_cities:
            entry["country"] = "USA"
    return fix
def extract_places(tags, destination_map):
    """Return destinations for the first tag that contains a known city.

    Each tag's first element is upper-cased and scanned with GeoText. As soon
    as one tag yields cities, every city is looked up in ``destination_map``
    and the resulting list is returned; later tags are ignored.

    Args:
        tags: iterable of indexable items whose element 0 is the tag text.
        destination_map: mapping from city name to destination.

    Returns:
        List of mapped destinations, or an empty list when no tag matched.

    Raises:
        KeyError: a detected city is missing from ``destination_map``.
    """
    for tag in tags:
        found = GeoText(tag[0].upper()).cities
        if len(found) > 0:
            return [destination_map[name] for name in found]
    return []
def get_cities(headlines):
    """Count how often each city is mentioned across a set of headlines.

    Args:
        headlines: mapping with an ``'articles'`` list. Each article is a
            mapping whose ``'title'`` and ``'description'`` texts are scanned
            with GeoText. A missing, None or empty field contributes nothing.

    Returns:
        dict mapping city name to its number of mentions.

    The cities of every article and the final ranking (most mentioned first)
    are printed as a side effect.
    """
    d = {}
    for h in headlines['articles']:
        cities = []
        for field in ('title', 'description'):
            text = h.get(field)
            # GeoText raises on None, so scan only real, non-empty text.
            if text:
                cities += GeoText(text).cities
        print(cities)
        for city in cities:
            d[city] = d.get(city, 0) + 1
    print(sorted(d.items(), key=lambda x: x[1], reverse=True))
    return d
def find_loc(description):
    """Extract the first city and the first country mention from a text.

    Cities are searched in ``description`` as given. Country mentions are
    searched in a copy where every space-separated word is capitalised.

    Returns:
        ``(city, country_key)`` when a country was found; ``city`` is None if
        no city was detected.
        ``(city_or_None, None, capitalised_text)`` when no country was found.
        Note the differing tuple lengths, which callers must handle.
    """
    capitalised = ' '.join(word.capitalize() for word in description.split(' '))
    cities = GeoText(description).cities
    mentions = GeoText(capitalised).country_mentions

    first_city = cities[0] if cities else None
    if mentions:
        # First key of the country_mentions mapping.
        return first_city, next(iter(mentions))
    return first_city, None, capitalised
def _location_extract(self):
    """
    Guess the film's location from the places mentioned in its plot.

    The first element returned by ``self._plot_extract()`` is scanned with
    GeoText, and the first key of its ``country_mentions`` mapping is taken as
    the location. geotext documents that mapping as country codes ordered by
    mention count, so this is the most-mentioned country.

    :return: the first ``country_mentions`` key, or an empty string when
        there is no plot or no country is mentioned
    """
    # Fetch the plot once instead of calling _plot_extract() twice.
    plot = self._plot_extract()
    if not plot:
        return ''
    places = GeoText(plot[0]).country_mentions
    if not places:
        return ''
    return next(iter(places))
def __parse_city(self):
    """Parses common HTML elements to identify the referenced city name.

    The ``<title>`` element is tried first. If it yields no city, the
    ``content`` attribute of ``<meta name="description">`` is tried. The
    first city GeoText finds is stored in ``self.city``, or None when the
    inspected element mentions none.
    """
    title = self.content.find("title")
    if title:
        cities = GeoText(title.text).cities
        self.city = cities[0] if cities else None

    # self.city may not have been assigned at all when the page has no
    # <title>, so read it defensively instead of raising AttributeError.
    if not getattr(self, "city", None):
        description = self.content.find("meta", attrs={"name": "description"})
        if description:
            # A description tag without a content attribute yields None,
            # which GeoText cannot scan.
            cities = GeoText(description.get("content") or "").cities
            self.city = cities[0] if cities else None
def weatherFunction(phrase):
    """Return the place names mentioned in a weather request.

    The phrase is title-cased (and printed), then scanned word by word with
    GeoText. For each word the first detected city and the first detected
    country, if any, are collected in order.

    Args:
        phrase: the user's request text.

    Returns:
        List of detected place names with every ``'Of'`` entry removed.
    """
    placeOfWeather = []
    text = phrase.title()
    print(text)
    for word in text.split():
        # One scan per word provides both the cities and the countries.
        places = GeoText(word)
        if places.cities:
            placeOfWeather.append(places.cities[0])
        if places.countries:
            placeOfWeather.append(places.countries[0])
    # Title-casing turns the filler word "of" into "Of", which is evidently
    # reported as a place. Drop every occurrence, not just the first one.
    return [place for place in placeOfWeather if place != 'Of']
def test_read(limit, skip_nationalities, text, cities, countries,
              nationalities, states, country_mentions):
    """Check that ``GeoText.read`` finds exactly the expected entities.

    ``text`` is read with the given population limit and nationality
    setting. The names of the resulting cities, states, countries and
    nationalities, and the ``(key, count)`` pairs of the country mentions,
    must equal the expected collections (compared as sets).
    """
    geo_text = GeoText()
    geo_text.read(text, min_population=limit,
                  skip_nationalities=skip_nationalities)

    def names(entries):
        return {entry.name for entry in entries}

    assert names(geo_text.results.cities) == set(cities)
    assert names(geo_text.results.states) == set(states)
    assert names(geo_text.results.countries) == set(countries)
    assert names(geo_text.results.nationalities) == set(nationalities)

    found_mentions = set()
    for key, count in geo_text.get_country_mentions().items():
        found_mentions.add((key._key, count))
    assert found_mentions == set(country_mentions)
def _attr_value(line, maxsplit=-1):
    """Return the text after the first space of ``line``, newlines stripped.

    Returns an empty string when the line contains no space at all.
    """
    parts = line.split(" ", maxsplit)
    return parts[1].strip("\n") if len(parts) > 1 else ""


def parse_author_attr(lines):
    """Parse one author record into ``[id, name, affiliation, country, h-index]``.

    Each used line has the form ``<tag> <value>``. Lines 0, 1, 2 and 5 carry
    the id, name, affiliation and h-index. The country is the first country
    GeoText detects in the affiliation line. A field whose value is missing
    or empty keeps its placeholder ("Id", "Name", "Affiliation", "Country",
    "H-index").

    Args:
        lines: sequence of at least six strings for one author.

    Returns:
        List of five strings in the order given above.

    Raises:
        IndexError: ``lines`` has fewer than six entries.
    """
    properties = ["Id", "Name", "Affiliation", "Country", "H-index"]

    # The id line is split on every space, so only its second token is used.
    author_id = _attr_value(lines[0])
    if author_id != "":
        properties[0] = author_id

    name = _attr_value(lines[1], 1)
    if name != "":
        properties[1] = name

    affiliation = _attr_value(lines[2], 1)
    if affiliation != "":
        properties[2] = affiliation
        # NOTE(review): country detection is assumed to run only when an
        # affiliation is present - confirm against the original layout.
        countries = GeoText(lines[2]).countries
        if len(countries) > 0:
            properties[3] = countries[0]
            #properties[3] = find_country_in_string(lines[2])

    h_index = _attr_value(lines[5], 1)
    if h_index != "":
        properties[4] = h_index
    return properties
def cleandictionary(dictio):
    """Normalise the raw form values into a cleaned dictionary.

    Args:
        dictio: mapping with the keys ``'role'``, ``'near'`` and
            ``'located'``.

    Returns:
        dict with:
            ``'role'``    - result of ``checkKeyWords`` on the role keywords,
            ``'near'``    - result of ``getNouns`` on the ``'near'`` text,
            ``'located'`` - first city GeoText finds in the ``'located'``
                            text, or None when it finds none.

    Both the input and the cleaned dictionary are printed.
    """
    print(dictio)
    cleandictio = {}

    # role
    rolekw = getKeywords([dictio['role']])
    cleandictio['role'] = checkKeyWords(list(rolekw), 'role')

    # near
    cleandictio['near'] = getNouns(dictio['near'])

    # located - guard against texts without any recognisable city instead of
    # failing with IndexError.
    cities = GeoText(dictio['located']).cities
    cleandictio['located'] = cities[0] if cities else None

    # size
    #size = getSize(dictio['numWork'])
    #cleandictio['size'] = size

    print(cleandictio)
    return cleandictio

#print(sorted(kdict.items(), key=lambda x: x[1], reverse=True))
#print(kdict)
#k = sorted(k, key=lambda x: x[1])
#print(kdict)
#cleandictionary(tform.getDataDict())
def get_location(affiliation):
    """
    Function takes an affiliation string and attempts to extract a location.
    If multiple cities are extracted, all will be included in the final
    location (each once, in order of first appearance). If no countries are
    found, the country part is left blank.

    Args:
        affiliation - Str: free-text affiliation

    Returns:
        Str: geotext's attempt to extract `City, Country` from affiliation.
        Either part may be empty, so ", " is returned when nothing is found.
    """
    places = GeoText(affiliation)
    # Kept from the original. NOTE(review): a GeoText object is presumably
    # always truthy, which would make this branch unreachable - confirm.
    if not places:
        return ''

    # 'University' is excluded because it shows up as a false-positive city.
    # dict.fromkeys removes duplicates while keeping first-seen order, so
    # the result is deterministic (a set gave an arbitrary order).
    unique_cities = dict.fromkeys(
        city for city in places.cities if city != 'University')
    city_str = ' '.join(unique_cities).strip()
    country_str = places.countries[0] if places.countries else ''
    return f"{city_str}, {country_str}"
def locations(w):
    """Find place names in ``w`` and return them keyed by text position.

    Sources of places:
      * every regex match of an entry of the module-level ``Indian_states``
        in ``w`` (keyed by the match start offset);
      * cities, or failing that countries, that GeoText detects in individual
        whitespace-separated tokens of ``w``.

    Tokens that are values of the module-level ``short_month`` mapping, or
    the word 'Date', are skipped.

    Returns:
        OrderedDict mapping position -> place name, sorted by position.
    """
    # NOTE(review): the original indentation was lost; the nesting below is
    # the most plausible reading - confirm.
    place_string = ""
    allPlace = {}
    for i in Indian_states:
        # Each state name is used as a regex pattern against the full text.
        for m in re.finditer(i, w):
            allPlace[m.start()] = i
    # ``p`` accumulates the tokens seen so far (space separated) so that
    # rfind() can locate the current token's last occurrence.
    p = ""
    for i in w.split():
        p += i + " "
        if i in short_month.values() or i == 'Date':
            continue
        else:
            if i[0].isupper():
                # Normalise e.g. "DELHI" to "Delhi" for GeoText.
                i = i.lower()
                i = i[0].upper() + i[1:]
            place = GeoText(i)
            #print(i)
            #print(place.cities)
            #print(place.countries)
            # NOTE(review): ``i`` may have been re-capitalised above, in which
            # case p.rfind(i) can return -1 because ``p`` still holds the
            # original spelling - verify that this key is intended.
            if not (len(place.cities) == 0):
                for c in place.cities:
                    allPlace[p.rfind(i)] = c
            elif not (len(place.countries) == 0):
                for c in place.countries:
                    allPlace[p.rfind(i)] = c
    allPlace = OrderedDict(sorted(allPlace.items()))
    # place_string is built here but never returned or used afterwards.
    for j in allPlace.values():
        place_string += j
    return allPlace
def check_is_location(loc):
    """Return True when GeoText recognises at least one city in ``loc``.

    The text is converted to title case (first letter of every word
    upper-cased) before it is scanned.
    """
    title_cased = loc.title()
    return len(GeoText(title_cased).cities) > 0
def update_profile():
    """Extract the cities mentioned on a web page and store them.

    Only POST requests are handled. The JSON body must contain ``url``. The
    page is fetched through the readability parser API, its ``content`` is
    scanned with GeoText, and every distinct city is inserted (lower-cased)
    into the ``places`` table.

    Returns:
        ``str`` of the set of detected cities on success,
        ``"Error Occured"`` when the database operation fails,
        ``"This API only works with POST"`` for any other HTTP method.
    """
    if request.method != 'POST':
        return "This API only works with POST"

    url = request.json['url']
    # Let requests build the query string so that '&', '?' or '#' inside the
    # submitted URL cannot truncate it or override the API's own parameters.
    r = requests.get(
        "http://www.readability.com/api/content/v1/parser",
        params={"url": url, "token": READABILITY_TOKEN})
    r_json = json.loads(r.text)
    places_set = set(GeoText(r_json["content"]).cities)
    insert_list = [(str(place).lower(), ) for place in places_set]

    # Pre-bind both handles: if connect() itself fails, the cleanup below
    # must not raise a NameError that hides the real database error.
    db = None
    cursor = None
    try:
        db = MySQLdb.connect(MYSQL_HOST, MYSQL_USER, MYSQL_PASSWORD,
                             MYSQL_DB)
        cursor = db.cursor()
        cursor.executemany("INSERT into places VALUES (%s)", insert_list)
        db.commit()
    except MySQLdb.Error:
        return "Error Occured"
    finally:
        if cursor is not None:
            cursor.close()
        if db is not None:
            db.close()
    return str(places_set)
def text_to_location(tweet_text):
    """Generates a location dictionary (city name and coordinates) given a
    sentence string.

    Uses the geotext package to search the string for recognised city names.
    The first city found is geocoded with the object returned by
    ``create_geocoder_obj()`` (the opencage package, per the original
    documentation) to retrieve its latitude, longitude and UTC offset.

    Args:
        tweet_text (str): Sentence string

    Returns:
        tuple ``(location, offset)``:
            location (dict or None): ``{city_name: (lat, lng)}`` when a
                recognised city is in tweet_text and could be geocoded,
                else None.
            offset (int or None): number of seconds from UTC for that city,
                else None.
    """
    city_list = GeoText(tweet_text).cities
    if not city_list:
        return None, None

    city_str = city_list[0]
    # The geocoder is only needed once a city has been found.
    geo = create_geocoder_obj()
    result = geo.geocode(city_str)
    # The geocoder may find nothing for the name; do not index an empty or
    # missing result.
    if not result:
        return None, None

    best = result[0]
    offset = best["annotations"]["timezone"]["offset_sec"]
    return {
        city_str: (best["geometry"]["lat"], best["geometry"]["lng"])
    }, offset
def intent(a):
    """Answer a user utterance according to the first intent it matches.

    Checked in order:
      1. "call me <name>" - stores the name via ``jdump`` and confirms it;
      2. "my name"        - replies with the name stored in ``khal``;
      3. the word "time"  - replies with the current hour and minute;
      4. the word "weather" - returns ``get_weather`` for the first city
         found in the utterance, or for ``khal['city_id']`` if none is found.

    If nothing matches, the utterance itself is returned unchanged.
    """
    words = getWords(a)
    lowered = a.lower()

    # --- update khal
    call_me_at = lowered.find('call me')
    if call_me_at != -1:
        # plus 8 because len(call me) + 1
        nickname = a[call_me_at + 8:]
        jdump('name', nickname)
        return 'Okay, will call you ' + nickname

    # --- "what's my name"
    if 'my name' in lowered:
        return 'I call you ' + khal['name']

    # --- find if user is asking for time
    time_text = ['time']
    if any(keyword in words for keyword in time_text):
        now = datetime.datetime.now()
        return 'It is ' + str(now.hour) + ' ' + str(now.minute)

    # --- weather data
    weather_text = ['weather']
    non_cap_weather = [
        'show', 'tell', 'what', 'is', 'the', 'weather', 'city', 'today',
        'right', 'now'
    ]
    if any(keyword in words for keyword in weather_text):
        # Scan only the words that are not part of the request boilerplate.
        remaining = ' '.join(x for x in words if x not in non_cap_weather)
        potential = GeoText(remaining).cities
        if potential != []:
            city = yweather.Client().fetch_woeid(potential[0])
        else:
            city = khal['city_id']
        return get_weather(city, 'simple')

    return a
def list_cities(text):
    """Print the cities GeoText detects in a chat message.

    The message is printed, lower-cased and tokenised with
    ``getWords_special_location``. Every token that is not a common word is
    title-cased so GeoText can recognise it as a proper noun. The rebuilt
    text and the detected cities are printed. Nothing is returned.

    Args:
        text: the raw user message.
    """
    # Common chat words that must stay lower-case so they are never mistaken
    # for place names.
    d1 = ['i', 'live', 'in', 'please', 'hi', 'give', 'find', 'who', 'what', 'my', 'hungry', 'near', 'me', 'thank', 'you',
          'want', 'to', 'eat', 'like', 'liked', 'I', 'can', 'you', 'suggest', 'of', 'is', 'are', 'near', 'there', 'some',
          'little', 'now', 'wanna', 'want', 'at', 'on', 'in', 'near', 'area', 'next', 'and', 'how', 'about', 'or',
          'the', 'a', 'an', 'about', 'for', 'with', 'should', 'could', 'would', 'out', 'time', 'person', 'year', 'way', 'day',
          'thing', 'man', 'world', 'life', 'hand', 'part', 'child', 'eye', 'woman', 'place', 'work', 'week', 'doing',
          'case', 'point', 'government', 'company', 'number', 'group', 'problem', 'fact', 'be', 'have', 'do', 'say',
          'get', 'make', 'go', 'know', 'take', 'see', 'come', 'think', 'look', 'want', 'give', 'use', 'find', 'tell', 'telling',
          'ask', 'work', 'seem', 'feel', 'try', 'leave', 'call', 'good', 'first', 'last', 'long', 'great', 'little', 'own', 'other',
          'old', 'right', 'big', 'high', 'different', 'small', 'large', 'next', 'early', 'young', 'important', 'few',
          'public', 'bad', 'same', 'able', 'to', 'of', 'in', 'for', 'on', 'with', 'at', 'by', 'from', 'up', 'about', 'into',
          'over', 'after', 'beneath', 'under', 'above', 'the', 'and', 'a', 'that', 'I', 'it', 'not', 'he', 'as', 'you',
          'this', 'but', 'his', 'they', 'her', 'she', 'or', 'an', 'will', 'my', 'one', 'all', 'would', 'there', 'their', 'talk',
          'talking', 'love', 'loved', 'hello', 'help', 'helping', 'helped', 'pleasure', 'bye', 'goodbye', 'care', 'later',
          'no', 'nothing', 'thanks', 'welcome', 'something', 'hey', 'am']
    stop_words = frozenset(d1)  # constant-time membership tests in the loop

    # print(...) with one argument behaves the same on Python 2 and 3; the
    # bare print statement is a syntax error on Python 3.
    print(text)
    text = text.lower()
    words = getWords_special_location(text)
    # Every word is followed by one space, trailing space included.
    text = ''.join(
        (word if word in stop_words else word.title()) + ' '
        for word in words)
    print(text)

    from geotext import GeoText
    places = GeoText(text)
    print(places.cities)
def weather(text):
    """Return current weather data for the city mentioned in ``text``.

    Args:
        text: the request text, or the literal ``'block'`` to load the data
            through ``D2S('weather', 1)`` instead of querying the web API.

    Returns:
        The decoded JSON weather document.

    For any other text the first city GeoText detects is queried at
    OpenWeatherMap (imperial units). Without a detected city the request
    falls back to ZIP code 74136 (US). The elapsed time in seconds is
    printed.
    """
    start = time.time()

    if (text == 'block'):
        data = D2S('weather', 1)
        data = json.loads(data[0])
    else:
        # title = text.title()
        location = GeoText(text).cities

        # Base Request URL
        # NOTE(review): the API key is hard-coded in source; consider moving
        # it to configuration.
        request_url = 'http://api.openweathermap.org/data/2.5/weather?units=imperial&APPID=3857a87bd99d4aecc1dc6e71e489772a'

        # IF Location
        if location:
            # Query the first detected city. Formatting the whole list put
            # its repr (e.g. "['Tulsa']") into the URL. requests merges and
            # URL-encodes ``params``, so names with spaces are sent correctly.
            req = requests.get(request_url, params={'q': location[0]})
        else:
            request_url += '&zip=74136,us'
            req = requests.get(request_url)

        # Grab Weather Data
        data = json.loads(req.text)

    end = time.time()
    print(end - start)

    return data
def just_location_plus(self):
    """Rebuild ``self.location`` from tagged entities plus GeoText matches.

    The words of ``self.aliner`` are title-cased unless they appear in
    ``self.d1``, then scanned with GeoText. The entries currently held in
    ``self.location`` are sorted by their tag (element 1): 'GPE' values
    become the new ``self.location``, 'TIME' values are appended to
    ``self.time`` and 'DATE' values to ``self.day``. The GeoText cities and
    then countries are appended to ``self.location`` last.
    """
    tokens = getWords_special_location(self.aliner)
    # Each word is followed by one space, trailing space included.
    candidate_text = ''.join(
        (token if token in self.d1 else token.title()) + ' '
        for token in tokens)

    geo = GeoText(candidate_text)
    detected_cities = geo.cities
    detected_countries = geo.countries

    tagged_entities = self.location
    self.location = []
    for entity in tagged_entities:
        tag = entity[1]
        if tag == 'GPE':
            self.location.append(entity[2])
        if tag == 'TIME':
            self.time.append(entity[2])
        if tag == 'DATE':
            self.day.append(entity[2])

    self.location = self.location + detected_cities
    self.location = self.location + detected_countries
def __call__(self, document):
    """Detect country and nationality mentions and record them on the document.

    Args:
        document: dict with the keys ``"text"``, ``"text_cleaned"`` and
            ``"entities"``, where ``"entities"`` maps ``Entity.COUNTRY`` to
            a set.

    Returns:
        The same ``document``. Detected mentions have been added to
        ``document["entities"][Entity.COUNTRY]``, and ``self.clean_text`` has
        been applied for every recorded mention of at least four characters,
        longest first.

    Raises:
        AssertionError: ``document`` is not a dict.
    """
    assert isinstance(
        document,
        dict), f"wrong input of type {type(document)} to location parser"

    cleaned = document["text_cleaned"]
    geo = GeoText(cleaned)
    for mention in geo.countries + geo.nationalities:  # geo.cities
        if mention.lower() in self.stop_words:
            continue
        # Allow whitespace or hyphens between the letters of the mention.
        # Each character is escaped so punctuation in a name cannot be
        # interpreted as regex syntax, and the separator is a raw string so
        # "\s" is not an invalid string escape.
        pattern = r"[\s-]*".join(re.escape(char) for char in mention)
        for match in re.finditer(pattern, cleaned, re.IGNORECASE):
            # NOTE(review): offsets found in "text_cleaned" are applied to
            # "text"; this assumes both strings are aligned - confirm.
            country = document["text"][match.start():match.end()]
            # non capitalized words result in poor precision
            if country.capitalize() != country:
                continue
            document["entities"][Entity.COUNTRY].add(country)

    # sort to match longest first
    sorted_countries = sorted(document["entities"][Entity.COUNTRY],
                              key=len,
                              reverse=True)
    for country in sorted_countries:
        if len(country) < 4:
            continue
        self.clean_text(document, country, cased=True)

    # low precision -> UK, CH etc.
    # geo = GeoText(document["text"][:document["abstract_start"]])
    # for mention in geo.country_mentions:
    #     if mention.lower() in self.stop_words:
    #         continue
    #     for match in re.finditer(mention, document["text"], re.IGNORECASE):
    #         document["entities"][Entity.COUNTRY].add(document["text"][match.start():match.end()])
    return document
def calcLocScore(refLat, refLong, target, countriesList, citiesList):
    """Return the best location score between a reference point and ``target``.

    Args:
        refLat, refLong: coordinates of the reference point.
        target: free-text location (may be None or blank).
        countriesList: country rows; column 0 is compared with the
            upper-cased words of ``target`` (as is column 4 when the row has
            at least five columns), column 3 with GeoText country names, and
            columns 1/2 are latitude/longitude.
        citiesList: city rows; columns 0 and 7 are compared with GeoText
            city names, and columns 2/3 are latitude/longitude.

    Returns:
        The highest ``locScore`` over all matched countries and cities, or 0
        when ``target`` is empty or nothing matched.
    """
    # no location - check before parsing so None or blank input is safe
    if not (target and target.strip()):
        return 0

    targetPlaces = GeoText(target)
    scoreList = []

    if not targetPlaces.countries and not targetPlaces.cities:
        # location which uses short forms for names
        words = [x.strip() for x in re.split(',|/| ', target.upper())]
        print(words)
        for data in countriesList:
            # some data do not have the 3 letter short form (column 4)
            has_short_form = len(data) >= 5
            if data[0] in words or (has_short_form and data[4] in words):
                scoreList.append(
                    locScore(float(data[1]), float(data[2]), refLat, refLong))
        # if theres no match
        if not scoreList:
            scoreList.append(0)
    else:
        for country in targetPlaces.countries:
            score = 0  # stays 0 when no reference row matches
            for data in countriesList:
                if data[3] == country:
                    score = locScore(float(data[1]), float(data[2]),
                                     refLat, refLong)
                    break
            scoreList.append(score)

        for city in targetPlaces.cities:
            score = 0
            for data in citiesList:
                # capital
                if data[0] == city or data[7] == city:
                    score = locScore(float(data[2]), float(data[3]),
                                     refLat, refLong)
                    break
            scoreList.append(score)

    # choose the highest score out of all the scanned countries and cities
    return max(scoreList)
def handle_internet_weather_intent(self, message):
    """Speak the network response time between two cities in the utterance.

    Exactly two cities must be detected in ``message.data['utterance']``.
    The wondernetwork.com pings page for that city pair is fetched, and the
    first matching table row is read out (its first timing value and its
    timestamp column). With fewer or more than two cities a clarification is
    spoken instead.
    """
    # A missing utterance is scanned as empty text instead of raising.
    places = GeoText(message.data.get('utterance') or '')
    city_count = len(places.cities)

    if city_count == 2:
        first_city, second_city = places.cities
        r = requests.get("https://wondernetwork.com/pings/" + first_city +
                         "/" + second_city)
        # One table row: four timing cells followed by the timestamp cell.
        # Raw strings keep the regex escapes from being read as (invalid)
        # string escapes; the compiled pattern is unchanged.
        bucket = (r'\<td class=\"is-bucket is-bucket-0\"\>\W*'
                  r'<div class=\"td-item\"\>([0-9\.]+ms)\</div>\W*\</td>\W*')
        p = re.compile(
            r'\<tr\>\W*' + bucket * 4 +
            r'\<td class=\"is-bucket\">([0-9:\- ]+)</td>')
        # Match against the decoded text: a str pattern cannot be applied to
        # the bytes in r.content on Python 3.
        matches = p.findall(r.text)
        if not matches:
            # Page layout changed or no data for this pair.
            self.speak("I could not find a response time from " +
                       first_city + " to " + second_city)
            return
        self.speak("The average response time from " + first_city + " to " +
                   second_city + " was " + matches[0][0] + " at " +
                   matches[0][4])
        #self.speak("Debug: The received message was " + message.data.get('utterance'))
    elif city_count < 2:
        self.speak("Did you mention a city?")
        self.speak("I heard the name of " + str(city_count) + " cities.")
    else:
        self.speak("You mentioned " + str(city_count) +
                   " cities - I don't know what to do with that!")
def get_cities(raw):
    """Return, as a new list, the city names GeoText detects in ``raw``."""
    return list(GeoText(raw).cities)
def extract_places(input):
    """Return country names for the named entities found in ``input``.

    ``input`` is processed with the module-level ``nlp`` pipeline. Each
    entity in ``doc.ents`` is scanned with GeoText. If GeoText reports
    country mentions, each mention key is resolved to a country name through
    ``pycountry``. Otherwise the whole input is split on whitespace and each
    word is mapped to a country name (see the notes below).

    Returns:
        List of country names, possibly with duplicates.
    """
    # NOTE(review): the original indentation was lost; the nesting below is
    # the most plausible reading - confirm.
    cities = []      # unused
    countries = []   # unused
    alpha_2 = []
    arr_places = []
    doc = nlp(input)
    lists = doc.ents
    for i in lists:
        places = GeoText(str(i))
        if len(places.country_mentions) != 0:
            # (key, count) pairs; the key is passed to pycountry as alpha_2.
            alpha_2 = list(places.country_mentions.items())
            # NOTE: ``i`` is reused as the index here, shadowing the entity.
            for i in range(0, len(alpha_2)):
                # NOTE(review): .name assumes pycountry knows the code; an
                # unknown code would fail here - verify.
                arr_places.append(
                    pycountry.countries.get(alpha_2=alpha_2[i][0]).name)
        elif len(input) != 0:
            arr = (re.split(r'\s+', input))
            for i in arr:
                # "UK" is spelled out explicitly here.
                if i == "UK":
                    arr_places.append("United Kingdom")
                else:
                    # NOTE(review): this looks up the FIRST entity of the
                    # document (not the current word) as an alpha-2 code -
                    # confirm this is intended.
                    name = pycountry.countries.get(alpha_2=str(lists[0])).name
                    arr_places.append(name)
    return arr_places
def find_locations(tweets):
    """Search text for places.

    Returns the first detected city that is "San Diego" or "Norfolk", or
    None when neither is mentioned.
    """
    for city in GeoText(tweets).cities:
        if city in ["San Diego", "Norfolk"]:
            return city
    return None
def processArticleLocations(news_articles, country):
    """Annotate news articles with the most probable city inside ``country``.

    For every article the page behind ``article["url"]`` is downloaded and
    scanned with GeoText. Among the detected cities that GeoText's index
    assigns to ``country``, the most frequently mentioned one is geocoded
    with the module-level ``gn`` geocoder, and the article gains the keys
    ``"city"``, ``"latitude"``, ``"longitude"`` and ``"country"``.

    An article is tried up to four times (one attempt plus three retries).
    Each failure prints "exception"; after the last failure the article is
    left untouched.

    Args:
        news_articles: list of dicts, each with a ``"url"`` key.
        country: country identifier as stored in ``GeoText.index[1]``
            (presumably a country code - confirm).

    Returns:
        The same ``news_articles`` list, updated in place.
    """
    max_attempts = 4  # initial attempt + 3 retries

    for i, article in enumerate(news_articles):
        for _ in range(max_attempts):
            try:
                article_text = requests.get(article["url"]).text
                places = GeoText(article_text)
                #countries = [places.index[2][country.lower()] for country in places.countries]
                city_to_country = places.index[1]
                # .get(): a city missing from the index is simply skipped; a
                # KeyError here would fail identically on every retry.
                cities = [
                    city for city in places.cities
                    if city_to_country.get(city.lower()) == country
                ]
                if cities:
                    print(i)
                    most_prob_city = max(cities, key=cities.count)
                    location = gn.geocode(most_prob_city)
                    # Resolve everything first so a failed geocode cannot
                    # leave the article half-updated.
                    latitude = location.latitude
                    longitude = location.longitude
                    article["city"] = most_prob_city
                    article["latitude"] = latitude
                    article["longitude"] = longitude
                    article["country"] = country
            except Exception:
                # Best effort: report and retry. KeyboardInterrupt and
                # SystemExit are no longer swallowed.
                print("exception")
            else:
                break
    return news_articles
def get_other_features(text):
    """Return GeoText's ``country_mentions`` mapping for ``text``.

    Sentiment features were once merged into the result; that code is kept
    below, disabled.
    """
    country_mentions = GeoText(text).country_mentions
    # sentif = sid.polarity_scores(text)
    # sentif.pop('neu', None)
    # # sentif.pop('compound', None)
    # sentif.update(geof)
    return country_mentions