to finally write it in a csv file """ import pandas as pd from geonamescache import GeonamesCache from geonamescache.mappers import country gc = GeonamesCache() # we use the GeonamesCache to get the names of countries # creating a mapper between the iso3 code and the country name mapper = country(from_key='name', to_key='iso3') countries = list(gc.get_dataset_by_key( gc.get_countries(), 'name', ).keys()) # for the US we are going to use the states states = list(gc.get_us_states_by_names()) #print(countries) # any of these key words could indicate that we are reading about a star key_words = ['movie', 'film', 'TV', 'television', 'actor', 'actress'] articles = [] dataset = {} with open('article-per-line.txt', 'r', encoding="utf8") as f: articles = f.read().splitlines() for a in articles: dec = a.split('born in', 1) proceed = True # we still need to optimize and factorize our code for this part if len(dec) > 1: for s in states:
# with unique city names 6345 (number is not up to date) location strings remain unresolved # with largest city name 4623 location strings remain unresolved # with manually resolved locations 3333 location strings remain unresolved import csv, json, re from geonamescache import GeonamesCache from loclists import check_unresolved unresolved_locations = [] commits_by_countries = {} countries_by_locstr = {} gc = GeonamesCache() countries = gc.get_countries() countries_by_names = gc.get_countries_by_names() us_states = gc.get_us_states() us_states_by_names = gc.get_us_states_by_names() re_ignore = re.compile(r'[\.\(\)\d-]') re_ws = re.compile(r'\s{2,}') def test_locs(locs): for loc in locs: loc = loc.strip().lower() loctitle = loc.title() locupper = loc.upper() if loc in countries_by_names: return loc elif loctitle in countries_by_names: return loctitle elif 2 == len(loc) and locupper in us_states:
# with unique city names 6345 (number is not up to date) location strings remain unresolved # with largest city name 4623 location strings remain unresolved # with manually resolved locations 3333 location strings remain unresolved import csv, json, re from geonamescache import GeonamesCache from loclists import check_unresolved unresolved_locations = [] commits_by_countries = {} countries_by_locstr = {} gc = GeonamesCache() countries = gc.get_countries() countries_by_names = gc.get_countries_by_names() us_states = gc.get_us_states() us_states_by_names = gc.get_us_states_by_names() re_ignore = re.compile(r'[\.\(\)\d-]') re_ws = re.compile(r'\s{2,}') def test_locs(locs): for loc in locs: loc = loc.strip().lower() loctitle = loc.title() locupper = loc.upper() if loc in countries_by_names: return loc elif loctitle in countries_by_names: return loctitle elif 2 == len(loc) and locupper in us_states: