示例#1
0
    def get_context(self):
        """Enrich each stored submission with article NLP data and
        aggregate per-country scores into ``self._countries``.

        For every submission: download and parse the linked article,
        geoparse its text, and add the submission's ``calculated_score``
        to every country mentioned.  A failure for one submission is
        logged and skipped so the remaining submissions still process.
        """
        logging.info("Ongoing...")
        # Create the (expensive) Geoparser lazily and only once, instead of
        # per submission; construction errors are still caught per iteration.
        geo = None
        for value in self._submissions.values():
            try:
                article = Article(value["submission.url"])
                article.download()
                article.parse()
                article.nlp()
                value["article.authors"] = article.authors
                value["article.text"] = article.text

                if geo is None:
                    geo = Geoparser()
                places = geo.geoparse(article.text)

                # Deduplicate so each article counts a country only once.
                countries_in_article = {
                    place["country_predicted"] for place in places
                }

                for country in countries_in_article:
                    key = str(country)
                    if key not in self._countries:
                        self._countries[key] = value["calculated_score"]
                    else:
                        self._countries[key] += value["calculated_score"]

                value["article.top_image"] = article.top_image
                value["article.summary"] = article.summary
                value["article.keywords"] = article.keywords
                value["article.countries"] = list(countries_in_article)
            except Exception as e:
                logging.info("Error: " + str(e))
示例#2
0
    def __init__(self):
        """Build the geoparser and a short -> long US state-name lookup."""
        self.geo = Geoparser()

        # ``states`` holds "Long Name-XX" entries, one per line; split each
        # into a [long, short] pair.
        pairs = [line.split('-') for line in states.split('\n')]

        # NOTE(review): entry 9 is overwritten wholesale — presumably the
        # raw line is malformed; confirm against the ``states`` constant.
        pairs[9] = ['Georgia United States', 'GA']

        self.state_dict = {p[1]: p[0] for p in pairs}
示例#3
0
def main(in_file: ("input CSV file"),
        out_file: ("filename to write output to"),
        city_col: ("column in CSV with city col") = "city",
         adm1_col: ("column in CSV with state/governorate/ADM1") = "adm1",
         country_col: ("column in CSV with country name") = "country"):
    """Geocode a csv with a city, ADM1, and country columns.

    Reads ``in_file``, resolves each row with Mordecai's ``lookup_city``,
    and writes one geocoded row per input row (plus the original search
    terms and the lookup's info/reason fields) to ``out_file``.
    """
    print("Loading Mordecai...")
    geo = Geoparser()
    df = pd.read_csv(in_file)
    geocoded = []
    print("Geocoding...")
    for _, row in tqdm(df.iterrows()):
        # Elasticsearch doesn't like NaN, change to None
        adm1 = None if pd.isnull(row[adm1_col]) else row[adm1_col]
        res = geo.lookup_city(city=row[city_col],
                              adm1=adm1,
                              country=row[country_col])
        try:
            # Split "lat,lon" once instead of twice.
            lat_str, lon_str = res['geo']['coordinates'].split(",")
            gc = {"admin1_code": res['geo']['admin1_code'],
                  "admin2_code": res['geo']['admin2_code'],
                  "asciiname": res['geo']['asciiname'],
                  "name": res['geo']['name'],
                  "geonameid": res['geo']['geonameid'],
                  "feature_class": res['geo']['feature_class'],
                  "feature_code": res['geo']['feature_code'],
                  "country_code3": res['geo']['country_code3'],
                  "lat": float(lat_str),
                  "lon": float(lon_str)}
        except TypeError:
            # res['geo'] was not a dict (lookup failed) — emit empty fields
            # so the output CSV keeps one row per input row.
            gc = {"admin1_code": "",
                  "admin2_code": "",
                  "asciiname": "",
                  "name": "",
                  "geonameid": "",
                  "feature_class": "",
                  "feature_code": "",
                  "country_code3": "",
                  "lat": "",
                  "lon": ""}
        gc['search_city'] = row[city_col]
        gc['search_adm1'] = row[adm1_col]
        gc['search_country'] = row[country_col]
        gc["info"] = res['info']
        gc["reason"] = res['reason']
        geocoded.append(gc)
    geo_df = pd.DataFrame(geocoded)
    geo_df.to_csv(out_file)
    print("Wrote file out to ", out_file)
示例#4
0
def locations_df(csv_file="", sep='\t', directory=False):
    '''
    Parse USA locations out of a CSV (or directory of CSVs) with Mordecai.

    Input: path to a CSV file, or — when ``directory`` is True — to a
    directory containing CSV files

    Output: Pandas DataFrame w/ locs column (in directory mode, each file
    is written out as "<stem>_mord.csv" and "Process Complete" is returned)
    '''
    def parse_tweet(data, text=False, df_column="Full_Text"):
        '''
        Input: Pandas DataFrame row, or str when ``text`` is True

        Output: Dict {place_name: (lat, lon)} of USA locations found in the
        data provided, or NaN when nothing was parsed
        '''
        if text:
            locations = geo.geoparse(data)
        else:
            locations = geo.geoparse(data[df_column])

        if not locations:
            return np.nan

        loc_list = {}
        for loc in locations:
            try:
                if loc['country_predicted'] == "USA":
                    loc_list[loc['geo']['place_name']] = (
                        loc['geo']['lat'], loc['geo']['lon'])
            except Exception:
                # 'geo' may be missing when the place couldn't be resolved.
                continue
        return loc_list

    # Spin up geoparser from mordecai
    try:
        geo = Geoparser()
    except ConnectionRefusedError:
        # Bug fix: ``assert "<string>"`` is a no-op (a non-empty string is
        # truthy), after which ``geo`` was undefined; surface the hint.
        raise ConnectionRefusedError("Is the Docker image running?")
    except Exception as e:
        print(e)

    if directory:
        for file in os.listdir(csv_file):
            # Bug fix: previously read ``csv_file`` (the directory path)
            # instead of each file inside it, and named the output after the
            # extension (``file[-4:]``) rather than the filename stem.
            tweet_df = pd.read_csv(os.path.join(csv_file, file), sep=sep)
            tqdm.pandas()
            tweet_df['locs'] = tweet_df.progress_apply(parse_tweet, axis=1)
            tweet_df.to_csv(file[:-4] + "_mord.csv")
        return "Process Complete"
    else:
        # Map locations to text (only the first 100 rows are parsed)
        tweet_df = pd.read_csv(csv_file, sep=sep)
        tqdm.pandas()
        tweet_df['locs'] = tweet_df[0:100].progress_apply(parse_tweet, axis=1)
        return tweet_df
def search_geolocation(message):
    """Geoparse *message*, print any embedded lat/lon pair found in its
    text, and emit a publisher warning for the paste.

    Parameters
    ----------
    message : str
        Paste identifier/content handed to ``Paste.Paste`` and scanned
        for ``'lat': '...'`` / ``'lon': '...'`` fragments.
    """
    paste = Paste.Paste(message)
    content = paste.get_p_content()

    # Load Geoparser
    geo = Geoparser()
    geolocation = geo.geoparse(message)

    # regex to find latitude and longitude
    reg_lat = re.compile(r'(\'lat\': \'([-\d.]+)\',)')
    reg_lon = re.compile(r'(\'lon\': \'([-\d.]+)\',)')

    lat_match = reg_lat.search(message)
    lon_match = reg_lon.search(message)
    # Bug fix: ``search`` returns None on no match, and the original then
    # crashed with AttributeError on ``.group(2)``.
    if lat_match and lon_match:
        print('latitude: {}'.format(lat_match.group(2)))
        print('longitude: {}'.format(lon_match.group(2)))

    print('{} text geolocation'.format(paste.p_name))
    publisher.warning('{} contains geolocation'.format(paste.p_name))
def map_web_locations(web_df, column_name="Paragraphs"):
    '''
    Input: Pandas DataFrame

    Output: the same DataFrame with a 'Para_Locs' column of locations
    parsed from ``column_name`` by ``parse_web_data``
    '''
    geoparser = Geoparser()

    parsed_locations = web_df.apply(
        parse_web_data,
        column_name=column_name,
        geoparser=geoparser,
        axis=1,
    )
    web_df['Para_Locs'] = parsed_locations

    return web_df
示例#7
0
class VCGeotagger:
    """Tag US city/state pairs in free text via Mordecai's Geoparser.

    ``state_dict`` maps two-letter state abbreviations to full state
    names, built from the module-level ``states`` string.
    """

    def __init__(self):
        self.geo = Geoparser()
        long_short_state = states.split('\n')

        long_short_state = [ent.split('-') for ent in long_short_state]

        # NOTE(review): entry 9 is overwritten wholesale — presumably the
        # raw line is malformed; confirm against the ``states`` constant.
        long_short_state[9] = ['Georgia United States', 'GA']

        self.state_dict = {a[1]: a[0] for a in long_short_state}

    def remove_non_ascii(self, text):
        """Return *text* with all non-ASCII characters stripped."""
        return ''.join(i for i in text if ord(i) < 128)

    def geotag(self, text):
        """Return a (city, state) pair for the first USA place found in
        *text*; "None" stands in for any field that could not be found."""
        text = self.remove_non_ascii(text)
        result = self.geo.geoparse(text)
        if not result:
            return "None", "None"
        for r in result:
            if r['country_predicted'] == 'USA' and 'geo' in r:
                state = r['geo']['admin1']
                city = r['geo']['place_name']
                if state != city:
                    return city, state
                else:
                    return "None", state
        # Bug fix: previously fell through and returned None when results
        # existed but none matched USA; use the same sentinel as above.
        return "None", "None"

    def placetag(self, text):
        """Split a "City, ST" string into (city, state), expanding the
        state abbreviation via ``state_dict`` when known."""
        parts = text.split(', ')
        if len(parts) == 2:
            city = parts[0]
            state = self.state_dict.get(parts[1], parts[1])
        else:
            city = 'None'
            state = 'None'
        return city, state
        
示例#8
0
def map_web_locations(web_df, sep="\t", output_dir='', column_name="Paragraphs", port=9200, host='127.0.0.1'):
    '''
    Parse website locations out of *web_df*, write the result to
    'scraped_website_data_locs.csv', and return the augmented DataFrame.

    Parameters
    ----------
    web_df: Pandas DataFrame

    sep: str
            The delimiter for outputted CSV data.

    output_dir: str
            Output directory of files generated.

    column_name: str
            The column name to parse for locations.

    port: int
            Port to run location extractor.

    host: str
            The hostname to run location extractor.

    Returns
    -------
    : Pandas DataFrame
      Pandas DataFrame with 'Web_Locs' as a column, which contain parsed
      website locations.

    '''
    # Enable the progress-bar variant of apply().
    tqdm.pandas()

    geoparser = Geoparser(es_port=int(port), es_host=host)

    locations = web_df.progress_apply(
        parse_web_data,
        column_name=column_name,
        geoparser=geoparser,
        axis=1,
    )
    web_df['Web_Locs'] = locations

    out_path = os.path.join(output_dir, 'scraped_website_data_locs.csv')
    web_df.to_csv(out_path, sep=sep, index=False)
    return web_df
示例#9
0
from mordecai import Geoparser
import pandas as pd
import logging
import glob
import os
import sys
import traceback
logging.basicConfig(level=logging.INFO)



# geo = Geoparser(es_hosts=['192.168.1.187'])
# Module-level geoparser shared by the functions below; its connection
# object is logged at import time.
geo = Geoparser()
logging.info("GEO CONN: " + str(geo.conn))


def parse_geo(sentence, lat, lon, placename, state, country):
    """Build a geolocation dict for an event, geoparsing the sentence only
    when the placename is not yet resolved.

    Parameters
    ----------
    sentence :
        Text handed to ``find_geo`` when no resolved placename exists.
    lat, lon :
        Existing coordinates, passed through when already located.
    placename, state, country :
        Existing location fields; a '$' in *placename* marks an
        unresolved location.

    Returns
    -------
    dict
        Keys 'lon', 'lat', 'placename', 'statename', 'countrycode' — or
        whatever ``find_geo`` returns when the sentence is geoparsed.
    """
    geodict = {
        'lon': lon,
        'lat': lat,
        'placename': placename,
        'statename': state,
        'countrycode': country,
    }

    # Only look if this event has not been already geoprocessed.
    if '$' in str(placename):
        geodict = find_geo(sentence)
    else:
        logging.info("Sentence already located")

    # Bug fix: the result was computed but never returned.
    return geodict
    
示例#10
0
from mordecai import Geoparser
import re
import os

xmlfile = re.compile(r'.*\.xml')
# Matches names containing at least one capital letter.  Bug fix: the
# original pattern '.*[A-Z]*.*' matched every string, because '[A-Z]*'
# can match the empty string — the filter below never filtered anything.
capital = re.compile('.*[A-Z]+.*')
#geo = Geoparser()

for inputfile in os.listdir("../processed_files"):
    name, extension = os.path.splitext(inputfile)
    outfilename = name + "_output.txt"
    inputfile = "../processed_files/" + inputfile
    print("Outfile name: " + outfilename)
    if xmlfile.match(inputfile) and outfilename not in os.listdir(
            "../geoparser_output"):  # Only process XML files not yet done
        geo = Geoparser()
        # The with-statement closes the file; the old explicit close() after
        # the block was redundant and has been dropped.
        with open(inputfile, "r") as infile:
            print("Processing data from " + inputfile + "...")
            data = infile.readlines()

        output = geo.geoparse(str(data))
        outfilename = "../geoparser_output/" + outfilename

        with open(outfilename, "a") as outfile:
            for word in output:
                if capital.match(
                        word['word']
                ):  # Filter out place names that don't contain any capital letter (Comment out to remove filter)
                    outfile.write(str(word))
                    outfile.write("\n")
示例#11
0
from mordecai import Geoparser
# Smoke test: geoparse a short Catalan traffic notice and print the result.
geo = Geoparser()
print(geo.geoparse("Retencions a la B-23, Barcelona."))
示例#12
0
def locations_df(csv_file, sep="\t", directory=False, port=9200, host='127.0.0.1', output_filename='pypack_parsed_locations.csv', output_dir='', df_column="Full_Text"):
    '''
    Pass in a CSV file and receive another CSV file with locations parsed
    from whatever column is selected for 'df_column'.

    Parameters
    ----------
    csv_file: str
              Path to the input CSV file, or — when ``directory`` is True —
              to a directory containing CSV files.

    sep: str
               Field delimiter of the input CSV.
               Examples: '\t' / ','

    directory: boolean
            Treat ``csv_file`` as a directory of CSV files.
            Examples: True / False

    port: int
            Elasticsearch port for the geoparser.
            Examples: 8888, Default: 9200

    host: str
            Elasticsearch hostname for the geoparser.
            Examples: 127.0.0.1

    output_filename: str
            Name of the CSV written out (non-directory mode).
            Examples: 'pypack_parsed_locations.csv'

    output_dir: str
            Directory the output file(s) are written into.
            Examples: '/User/Desktop'

    df_column: str
            Column whose text is parsed for locations.
            Examples: 'Full_Text'


    Returns
    -------
    : dataframe
      Pandas DataFrame with the column 'locs'

    '''

    def parse_tweet(data, geoparser, text=False, USA_Only=False, df_column=df_column):
        """
        A Helper function to locations_df, this function does the parsing
        of locations for a single row (or raw string).

        Parameters
        ----------
        data: str or DataFrame row
                  Text data to be parsed for locations.
                  Examples: "Flagstaff is a city in Arizona."

        geoparser: function references
                   Mordecai Geoparser
                   Examples: geo = Geoparser(es_port=int(port), es_hosts=(host))

        text: boolean
                When True, ``data`` is a raw string rather than a row.
                Examples: True / False

        USA_Only: boolean
                Keep only locations predicted to be in the USA.
                Examples: True / False

        df_column: str
                Column of the row to parse when ``text`` is False.

        Returns
        -------
        : dict or float
          Mapping of place name -> (lat, lon), or NaN when nothing parsed.

        """
        if text:
            locations = geoparser.geoparse(str(data))
        else:
            locations = geoparser.geoparse(str(data[df_column]))

        if not locations:
            return np.nan

        loc_list = {}
        for loc in locations:
            try:
                if USA_Only and loc['country_predicted'] != "USA":
                    continue
                loc_list[loc['geo']['place_name']] = (loc['geo']['lat'], loc['geo']['lon'])
            except Exception:
                # 'geo' may be missing when a place couldn't be resolved.
                continue
        return loc_list

    # Spin up geoparser from mordecai
    try:
        geo = Geoparser(es_port=int(port), es_hosts=(host))
    except Exception as e:
        print(e)
        print('Try running locations.start_docker')
        # Bug fix: the original used a no-op ``assert "<string>"`` and then
        # continued, crashing later with a NameError on ``geo``; re-raise.
        raise

    # Making a progress bar
    tqdm.pandas()

    if directory:
        for file in os.listdir(csv_file):
            # Bug fix: previously read ``csv_file`` (the directory path)
            # instead of each file inside it, and built the output name
            # from the extension (``file[-4:]``, appended twice) rather
            # than the filename stem.
            tweet_df = pd.read_csv(os.path.join(csv_file, file), sep=sep)
            tweet_df['locs'] = tweet_df.progress_apply(parse_tweet, geoparser=geo, axis=1)
            tweet_df.to_csv(os.path.join(output_dir, file[:-4] + "_locations.csv"), sep='\t', index=False)
        return "Process Complete"
    else:
        # Map locations to text
        tweet_df = pd.read_csv(csv_file, sep=sep)
        tweet_df['locs'] = tweet_df.progress_apply(parse_tweet, geoparser=geo, axis=1)
        tweet_df.to_csv(os.path.join(output_dir, output_filename), sep='\t', index=False)
        print("Successfully Parsed: {}".format(csv_file))
        return tweet_df
import csv
from mordecai import Geoparser
from functools import reduce
import json

geo = Geoparser()
ALL_Location = []
run_function = lambda x, y: x if y in x else x + [y]

with open('data/covid_19.csv', errors="ignore") as f:
    Reader = csv.DictReader(f)
    for row in Reader:
        a = row["abstract"]
        t = row["title"]
        j = row["journal"]
        url = row["url"]
        try:
            geoINF = geo.geoparse(a)
        except:
            continue
        for i in range(len(geoINF)):
            location = {
                'word': '',
                'place_name': '',
                'country': '',
                'lat': '',
                'lon': '',
                'title': t,
                'journal': j,
                'url': url
            }