def get_context(self):
    """Download and NLP-process each submission's article, geoparse its text,
    and accumulate the submission's calculated_score into self._countries
    (keyed by predicted country). Enriches each submission dict in place with
    article.* fields. Best-effort: failures for one submission are logged and
    the loop continues.
    """
    logging.info("Ongoing...")
    # Hoisted out of the loop: constructing a Geoparser per submission is
    # expensive and loop-invariant.
    geo = Geoparser()
    for value in self._submissions.values():
        try:
            article = Article(value["submission.url"])
            article.download()
            article.parse()
            article.nlp()
            value["article.authors"] = article.authors
            value["article.text"] = article.text
            places = geo.geoparse(article.text)
            countries_in_article = {p["country_predicted"] for p in places}
            for country in countries_in_article:
                key = str(country)
                if key not in self._countries:
                    self._countries[key] = value["calculated_score"]
                else:
                    self._countries[key] += value["calculated_score"]
            value["article.top_image"] = article.top_image
            value["article.summary"] = article.summary
            value["article.keywords"] = article.keywords
            value["article.countries"] = list(countries_in_article)
        except Exception as e:
            # BUG FIX: errors were logged at INFO; use ERROR so failures are
            # visible, while keeping the best-effort (continue) behaviour.
            logging.error("Error: " + str(e))
def __init__(self):
    # Shared Mordecai geoparser instance for all tagging calls.
    self.geo = Geoparser()
    # ``states`` is a newline-separated table of "Full Name-AB" entries.
    pairs = [line.split('-') for line in states.split('\n')]
    # Entry 10 is the state of Georgia; pin it so it is not confused with
    # the country of the same name.
    pairs[9] = ['Georgia United States', 'GA']
    # Lookup table: two-letter abbreviation -> full state name.
    self.state_dict = {pair[1]: pair[0] for pair in pairs}
def main(in_file: ("input CSV file"),
         out_file: ("filename to write output to"),
         city_col: ("column in CSV with city col") = "city",
         adm1_col: ("column in CSV with state/governorate/ADM1") = "adm1",
         country_col: ("column in CSV with country name") = "country"):
    """Geocode a csv with a city, ADM1, and country columns.

    Reads ``in_file``, looks each row up with Mordecai's ``lookup_city``,
    and writes one geocoded row per input row to ``out_file``. Rows that
    cannot be resolved get empty-string fields.
    """
    print("Loading Mordecai...")
    geo = Geoparser()
    df = pd.read_csv(in_file)
    geocoded = []
    print("Geocoding...")
    # Fields copied verbatim from the geonames result (order fixes the
    # output column order of the resulting DataFrame).
    _FIELDS = ("admin1_code", "admin2_code", "asciiname", "name",
               "geonameid", "feature_class", "feature_code", "country_code3")
    # BUG FIX (idiom): unpack iterrows() directly instead of indexing i[1].
    for _, row in tqdm(df.iterrows()):
        # Elasticsearch doesn't like NaN, change to None
        adm1 = None if pd.isnull(row[adm1_col]) else row[adm1_col]
        res = geo.lookup_city(city=row[city_col], adm1=adm1,
                              country=row[country_col])
        try:
            gc = {f: res['geo'][f] for f in _FIELDS}
            # Split the "lat,lon" string once instead of twice.
            lat, lon = res['geo']['coordinates'].split(",")
            gc["lat"] = float(lat)
            gc["lon"] = float(lon)
        except TypeError:
            # res['geo'] is None when the lookup failed; emit blank fields.
            gc = {f: "" for f in _FIELDS}
            gc["lat"] = ""
            gc["lon"] = ""
        gc['search_city'] = row[city_col]
        gc['search_adm1'] = row[adm1_col]
        gc['search_country'] = row[country_col]
        gc["info"] = res['info']
        gc["reason"] = res['reason']
        geocoded.append(gc)
    geo_df = pd.DataFrame(geocoded)
    geo_df.to_csv(out_file)
    print("Wrote file out to ", out_file)
def locations_df(csv_file="", sep='\t', directory=False):
    '''
    Input: path to a CSV/TSV file (or, when directory=True, a directory of them)
    Output: Pandas DataFrame w/ locs column ("Process Complete" in directory mode)
    '''
    def parse_tweet(data, text=False, df_column="Full_Text"):
        '''
        Input: Pandas DataFrame row or str
        Output: dict of USA locations {place_name: (lat, lon)},
                {} if nothing in the USA matched, or NaN if nothing parsed
        '''
        if text is False:
            locations = geo.geoparse(data[df_column])
        else:
            locations = geo.geoparse(data)
        if not locations:
            return np.nan
        loc_list = {}
        for loc in locations:
            # Narrowed from a bare ``except:`` — 'geo' is missing (KeyError)
            # or None (TypeError) when the gazetteer lookup failed.
            try:
                if loc['country_predicted'] == "USA":
                    loc_list[loc['geo']['place_name']] = (
                        loc['geo']['lat'], loc['geo']['lon'])
            except (KeyError, TypeError):
                continue
        return loc_list

    # Spin up geoparser from mordecai
    try:
        geo = Geoparser()
    except ConnectionRefusedError:
        # BUG FIX: ``assert "<string>"`` is always true and silently passed,
        # leaving ``geo`` undefined; re-raise with a hint instead.
        raise ConnectionRefusedError(
            "ConnectionRefusedError: Is the Docker image running?")
    except Exception as e:
        print(e)
        raise

    if directory:
        for fname in os.listdir(csv_file):
            # BUG FIX: read each listed file — the original re-read the
            # directory path itself for every file.
            tweet_df = pd.read_csv(os.path.join(csv_file, fname), sep=sep)
            tqdm.pandas()
            tweet_df['locs'] = tweet_df.progress_apply(parse_tweet, axis=1)
            # BUG FIX: file[-4:] kept only ".csv"; strip the extension instead.
            tweet_df.to_csv(os.path.splitext(fname)[0] + "_mord.csv")
        return "Process Complete"
    else:
        # Map locations to text (first 100 rows only, as before).
        tweet_df = pd.read_csv(csv_file, sep=sep)
        tqdm.pandas()
        tweet_df['locs'] = tweet_df[0:100].progress_apply(parse_tweet, axis=1)
        return tweet_df
def search_geolocation(message):
    """Geoparse a paste message and report any literal 'lat'/'lon' pairs
    embedded in it; emit a publisher warning when found."""
    paste = Paste.Paste(message)
    content = paste.get_p_content()

    # Load Geoparser
    geo = Geoparser()
    geolocation = geo.geoparse(message)

    # regex to find latitude and longitude
    reg_lat = re.compile(r'(\'lat\': \'([-\d.]+)\',)')
    reg_lon = re.compile(r'(\'lon\': \'([-\d.]+)\',)')

    # NOTE(review): searches ``message`` rather than ``content`` — confirm
    # which one is intended (the commented-out lines used ``content``).
    #lat = set(reg_lat.findall(content))
    #lon = set(reg_lat.findall(content))
    # BUG FIX: guard against no match — calling .group() on the None result
    # of a failed search raised AttributeError.
    lat_match = reg_lat.search(message)
    lon_match = reg_lon.search(message)
    if lat_match is None or lon_match is None:
        return
    lat = lat_match.group(2)
    lon = lon_match.group(2)

    print('latitude: {}'.format(lat))
    print('longitude: {}'.format(lon))
    print('{} text geolocation'.format(paste.p_name))
    publisher.warning('{} contains geolocation'.format(paste.p_name))
def map_web_locations(web_df, column_name="Paragraphs"):
    '''
    Geoparse the text in ``column_name`` of a scraped-website DataFrame.

    Input: Pandas DataFrame
    Output: the same DataFrame with a new 'Para_Locs' column of parsed locations
    '''
    parser = Geoparser()
    web_df['Para_Locs'] = web_df.apply(
        parse_web_data,
        column_name=column_name,
        geoparser=parser,
        axis=1,
    )
    return web_df
class VCGeotagger:
    """Tag free text (or "City, ST" strings) with a (city, state) pair using
    the Mordecai geoparser plus a state-abbreviation lookup table."""

    def __init__(self):
        self.geo = Geoparser()
        # ``states`` is a newline-separated "Full Name-AB" table.
        long_short_state = states.split('\n')
        long_short_state = [ent.split('-') for ent in long_short_state]
        # Disambiguate the state of Georgia from the country.
        long_short_state[9] = ['Georgia United States', 'GA']
        # Abbreviation -> full state name.
        self.state_dict = {a[1]: a[0] for a in long_short_state}

    def remove_non_ascii(self, text):
        """Return ``text`` with all non-ASCII characters stripped."""
        return ''.join(i for i in text if ord(i) < 128)

    def geotag(self, text):
        """Geoparse ``text`` and return (city, state) for the first USA hit;
        ("None", state) when city == state; ("None", "None") otherwise."""
        text = self.remove_non_ascii(text)
        result = self.geo.geoparse(text)
        if not result:
            return "None", "None"
        for r in result:
            if r['country_predicted'] == 'USA' and 'geo' in r:
                state = r['geo']['admin1']
                city = r['geo']['place_name']
                if state != city:
                    return city, state
                else:
                    return "None", state
        # BUG FIX: the original fell off the loop and implicitly returned
        # None when no USA result was found; return the same sentinel tuple
        # as the empty-result case so callers always get a 2-tuple.
        return "None", "None"

    def placetag(self, text):
        """Parse a "City, ST" string; returns ("None", "None") otherwise."""
        tmp_list = text.split(', ')
        if len(tmp_list) == 2:
            # Expand the abbreviation if known, else keep the raw token.
            state = self.state_dict.get(tmp_list[1], tmp_list[1])
            city = tmp_list[0]
        else:
            state = 'None'
            city = 'None'
        return city, state
def map_web_locations(web_df, sep="\t", output_dir='', column_name="Paragraphs",
                      port=9200, host='127.0.0.1'):
    '''
    Parse locations out of scraped-website text and persist the result.

    Writes a file named 'scraped_website_data_locs.csv' into ``output_dir``
    and returns the DataFrame with the parsed locations, which can be used
    to generate maps.

    Parameters
    ----------
    web_df: Pandas DataFrame
    sep: str
        Delimiter used for the output CSV.
    output_dir: str
        Directory the generated file is written into.
    column_name: str
        Name of the column to parse for locations.
    port: int
        Port the location extractor runs on.
    host: str
        Hostname the location extractor runs on.

    Returns
    -------
    : Pandas DataFrame
        Input frame with a 'Web_Locs' column containing parsed website
        locations.
    '''
    tqdm.pandas()
    parser = Geoparser(es_port=int(port), es_host=host)
    web_df['Web_Locs'] = web_df.progress_apply(
        parse_web_data,
        column_name=column_name,
        geoparser=parser,
        axis=1,
    )
    out_path = os.path.join(output_dir, 'scraped_website_data_locs.csv')
    web_df.to_csv(out_path, sep=sep, index=False)
    return web_df
from mordecai import Geoparser
import pandas as pd
import logging
import glob
import os
import sys
import traceback

logging.basicConfig(level=logging.INFO)

# geo = Geoparser(es_hosts=['192.168.1.187'])
geo = Geoparser()
logging.info("GEO CONN: " + str(geo.conn))


def parse_geo(sentence, lat, lon, placename, state, country):
    """Build a geo dict for an event, re-geoparsing ``sentence`` only when
    ``placename`` is still a '$'-delimited placeholder (i.e. the event has
    not been geoprocessed yet)."""
    geodict = {}
    geodict['lon'] = lon
    geodict['lat'] = lat
    geodict['placename'] = placename
    geodict['statename'] = state
    geodict['countrycode'] = country
    # only look if this event has not been already geoprocessed
    if '$' in str(placename):
        geodict = find_geo(sentence)
    else:
        logging.info("Sentence already located")
    # BUG FIX: the original never returned, silently discarding the dict it
    # had just built (or the find_geo result).
    return geodict
from mordecai import Geoparser
import re
import os

xmlfile = re.compile(r'.*\.xml')
# BUG FIX: the original pattern '.*[A-Z]*.*' matched every string because
# [A-Z]* can match the empty string, so the capital-letter filter below was
# a no-op. Require at least one capital letter, as the comment intends.
capital = re.compile('.*[A-Z].*')

#geo = Geoparser()
for inputfile in os.listdir("../processed_files"):
    name, extension = os.path.splitext(inputfile)
    outfilename = name + "_output.txt"
    inputfile = "../processed_files/" + inputfile
    print("Outfile name: " + outfilename)
    # Only process XML files that have not already produced an output file.
    if xmlfile.match(inputfile) and outfilename not in os.listdir(
            "../geoparser_output"):
        geo = Geoparser()
        with open(inputfile, "r") as infile:
            print("Processing data from " + inputfile + "...")
            data = infile.readlines()
        # BUG FIX (cleanup): dropped the redundant infile.close() that sat
        # inside the with-block; the context manager already closes the file.
        output = geo.geoparse(str(data))
        outfilename = "../geoparser_output/" + outfilename
        with open(outfilename, "a") as outfile:
            for word in output:
                # Filter out place names that don't contain any capital
                # letter (comment out to remove filter).
                if capital.match(word['word']):
                    outfile.write(str(word))
                    outfile.write("\n")
from mordecai import Geoparser

# Spin up the Mordecai geoparser (requires its Elasticsearch/geonames
# backend to be reachable, e.g. via the project's Docker image).
geo = Geoparser()
# Demo call: geoparse a Catalan sentence and print the raw result list.
print(geo.geoparse("Retencions a la B-23, Barcelona."))
def locations_df(csv_file, sep="\t", directory=False, port=9200,
                 host='127.0.0.1',
                 output_filename='pypack_parsed_locations.csv',
                 output_dir='', df_column="Full_Text"):
    '''
    Pass in a CSV file and receive another CSV file with locations parsed
    from whatever column is selected for ``df_column``.

    Parameters
    ----------
    csv_file: str
        Path to the input CSV file (or, when directory=True, a directory
        containing CSV files).
    sep: str
        Field delimiter of the input CSV. Default: tab.
    directory: boolean
        When True, treat ``csv_file`` as a directory and process every file
        inside it. Examples: True / False
    port: int
        Elasticsearch port for the geoparser. Examples: 8888, Default: 9200
    host: str
        Elasticsearch hostname. Examples: 127.0.0.1
    output_filename: str
        Name of the output CSV (single-file mode).
        Examples: 'pypack_parsed_locations.csv'
    output_dir: str
        Directory output files are written into. Examples: '/User/Desktop'
    df_column: str
        Column of the CSV to parse for locations. Default: 'Full_Text'

    Returns
    -------
    : dataframe
        Pandas DataFrame with the column 'locs' ("Process Complete" in
        directory mode)
    '''

    def parse_tweet(data, geoparser, text=False, USA_Only=False,
                    df_column=df_column):
        """Parse locations out of one DataFrame row (or a raw string).

        Returns a dict {place_name: (lat, lon)} — restricted to the USA when
        USA_Only is set — or NaN when nothing was parsed at all.
        """
        source = data if text else data[df_column]
        locations = geoparser.geoparse(str(source))
        if not locations:
            return np.nan
        loc_list = {}
        for loc in locations:
            # Narrowed from a bare ``except:`` — 'geo' is missing (KeyError)
            # or None (TypeError) when the gazetteer lookup failed.
            try:
                if USA_Only and loc['country_predicted'] != "USA":
                    continue
                loc_list[loc['geo']['place_name']] = (loc['geo']['lat'],
                                                      loc['geo']['lon'])
            except (KeyError, TypeError):
                continue
        return loc_list

    # Spin up geoparser from mordecai
    try:
        geo = Geoparser(es_port=int(port), es_hosts=(host))
    except Exception as e:
        print(e)
        print('Try running locations.start_docker')
        # BUG FIX: the original ``assert "<string>"`` was always true, so the
        # failure was swallowed and ``geo`` stayed undefined; re-raise so the
        # caller sees the real error.
        raise

    # Making a progress bar
    tqdm.pandas()

    if directory:
        for fname in os.listdir(csv_file):
            # BUG FIX: read each listed file — the original re-read the
            # directory path itself on every iteration.
            tweet_df = pd.read_csv(os.path.join(csv_file, fname), sep=sep)
            tweet_df['locs'] = tweet_df.progress_apply(parse_tweet,
                                                       geoparser=geo, axis=1)
            # BUG FIX: the original built the name from file[-4:] (just
            # ".csv") and appended file[-4:] again; use the stem instead.
            out_name = os.path.splitext(fname)[0] + "_locations.csv"
            tweet_df.to_csv(os.path.join(output_dir, out_name), sep='\t',
                            index=False)
        return "Process Complete"
    else:
        # Map locations to text
        tweet_df = pd.read_csv(csv_file, sep=sep)
        tweet_df['locs'] = tweet_df.progress_apply(parse_tweet,
                                                   geoparser=geo, axis=1)
        tweet_df.to_csv(os.path.join(output_dir, output_filename), sep='\t',
                        index=False)
        print("Succesfully Parsed: {}".format(csv_file))
        return tweet_df
import csv
from mordecai import Geoparser
from functools import reduce
import json

# Mordecai geoparser (requires its Elasticsearch backend to be running).
geo = Geoparser()
ALL_Location = []
# Dedupe helper for use with functools.reduce: append y only if not present.
run_function = lambda x, y: x if y in x else x + [y]

with open('data/covid_19.csv', errors="ignore") as f:
    Reader = csv.DictReader(f)
    for row in Reader:
        a = row["abstract"]
        t = row["title"]
        j = row["journal"]
        url = row["url"]
        # Best-effort: skip abstracts the geoparser cannot handle.
        try:
            geoINF = geo.geoparse(a)
        except:
            continue
        for i in range(len(geoINF)):
            # Template record per parsed place; the empty location fields are
            # presumably filled in from geoINF[i] by code beyond this excerpt
            # — NOTE(review): the snippet appears truncated here, confirm.
            location = {
                'word': '',
                'place_name': '',
                'country': '',
                'lat': '',
                'lon': '',
                'title': t,
                'journal': j,
                'url': url
            }