Exemplo n.º 1
0
to finally write it in a CSV file
"""
import pandas as pd
from geonamescache import GeonamesCache
from geonamescache.mappers import country

# GeonamesCache supplies the country and US-state names used below.
gc = GeonamesCache()

# Mapper built with from_key='name' and to_key='iso3' (country name -> ISO3).
mapper = country(from_key='name', to_key='iso3')

# Country names: the keys of the countries dataset re-indexed by 'name'.
countries = list(gc.get_dataset_by_key(gc.get_countries(), 'name'))

# For the US we are going to use the states.
states = list(gc.get_us_states_by_names())

# Any of these keywords could indicate that an article is about a star.
key_words = ['movie', 'film', 'TV', 'television', 'actor', 'actress']

dataset = {}
articles = []

# Each line of the input file becomes one entry in `articles`.
with open('article-per-line.txt', 'r', encoding="utf8") as f:
    articles = f.read().splitlines()

for a in articles:
    dec = a.split('born in', 1)
    proceed = True
    # we still need to optimize and refactor our code for this part
    if len(dec) > 1:
        for s in states:
# with unique city names, 6345 location strings remain unresolved (number is not up to date)
# with largest city name, 4623 location strings remain unresolved
# with manually resolved locations, 3333 location strings remain unresolved

import csv, json, re
from geonamescache import GeonamesCache
from loclists import check_unresolved

# GeoNames lookup tables; test_locs() below checks candidate strings against
# countries_by_names and us_states.
gc = GeonamesCache()
countries = gc.get_countries()
countries_by_names = gc.get_countries_by_names()
us_states = gc.get_us_states()
us_states_by_names = gc.get_us_states_by_names()

# Module-level containers, created empty here.
# NOTE(review): presumably filled while location strings are processed;
# the code that writes to them is outside this section -- confirm.
unresolved_locations = []
commits_by_countries = {}
countries_by_locstr = {}

# Matches any single '.', '(', ')', digit, or '-' character.
re_ignore = re.compile(r'[\.\(\)\d-]')
# Matches a run of two or more whitespace characters.
re_ws = re.compile(r'\s{2,}')


def test_locs(locs):
    for loc in locs:
        loc = loc.strip().lower()
        loctitle = loc.title()
        locupper = loc.upper()
        if loc in countries_by_names:
            return loc
        elif loctitle in countries_by_names:
            return loctitle
        elif 2 == len(loc) and locupper in us_states:
# with unique city names, 6345 location strings remain unresolved (number is not up to date)
# with largest city name, 4623 location strings remain unresolved
# with manually resolved locations, 3333 location strings remain unresolved

import csv, json, re
from geonamescache import GeonamesCache
from loclists import check_unresolved

# GeoNames lookup tables; test_locs() below checks candidate strings against
# countries_by_names and us_states.
gc = GeonamesCache()
countries = gc.get_countries()
countries_by_names = gc.get_countries_by_names()
us_states = gc.get_us_states()
us_states_by_names = gc.get_us_states_by_names()

# Module-level containers, created empty here.
# NOTE(review): presumably filled while location strings are processed;
# the code that writes to them is outside this section -- confirm.
unresolved_locations = []
commits_by_countries = {}
countries_by_locstr = {}

# Matches any single '.', '(', ')', digit, or '-' character.
re_ignore = re.compile(r'[\.\(\)\d-]')
# Matches a run of two or more whitespace characters.
re_ws = re.compile(r'\s{2,}')


def test_locs(locs):
    for loc in locs:
        loc = loc.strip().lower()
        loctitle = loc.title()
        locupper = loc.upper()
        if loc in countries_by_names:
            return loc
        elif loctitle in countries_by_names:
            return loctitle
        elif 2 == len(loc) and locupper in us_states: