Example #1
def main():

	country_geojson = {'type': 'FeatureCollection', 'features': []}
	place_geojson = {'type': 'FeatureCollection', 'features': []}
	country_features = {}
	place_features = {}
	for a in articles:
		locs = a['nytd_geo_facet']
		for loc in locs:
			if loc in country_freq:
				if loc not in country_features: 
					country_features[loc] = get_feature(loc)
				feat = country_features[loc] 
				# add article and increment article count
				feat['properties']['articles'].append(a)
				feat['properties']['article_count'] += 1
			elif loc in place_freq:
				if loc not in place_features: 
					place_features[loc] = get_feature(loc)
				feat = place_features[loc] 
				# add article and increment article count
				feat['properties']['articles'].append(a)
				feat['properties']['article_count'] += 1
			else:
				print "could not place " + loc

	print type(country_features)
	country_geojson['features'] = [country_features[key] for key in country_features]
	place_geojson['features'] = [place_features[key] for key in place_features]

	jsonfiles.write('../json/output/countries_2012_v3.json', country_geojson)
	jsonfiles.write('../json/output/places_2012_v3.json', place_geojson)
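Example #1 assumes a get_feature(loc) helper that is not shown. Judging by how its return value is used, it produces a GeoJSON Feature whose properties carry an articles list and an article_count. A minimal sketch of that one-argument form, assuming a hypothetical coords lookup keyed by location name (the shape of the mapping written out in Example #3 would fit):

# Sketch only: get_feature is not part of the snippet above.
# Assumes a hypothetical `coords` dict mapping a location name to
# {'lat': ..., 'lon': ..., 'name': ...}.
def get_feature(loc):
    c = coords[loc]
    return {
        'type': 'Feature',
        'geometry': {
            'type': 'Point',
            # GeoJSON positions are [longitude, latitude]
            'coordinates': [c['lon'], c['lat']],
        },
        'properties': {
            'name': c['name'],
            'articles': [],       # filled in by the loop above
            'article_count': 0,   # incremented per matching article
        },
    }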
Example #2
def main(year_str):
	fields = jsonfiles.read("../json/fields.json")
	year = int(year_str) 
	print 'Getting articles for the year ' + str(year)
	results = []

	# make request once to get the number of pages in the response
	querystring = get_querystring(year, fields, 0)
	r = requests.get(querystring).json()
	results = results + r["results"]
	pages = r["total"] / 10

	# make request for remaining pages
	# write out every 1000 requests
	i = 1
	last_start = 1 
	for p in xrange(last_start, pages + 1):
		# try until request succeeds
		while True:
			try:
				querystring = get_querystring(year, fields, p)
				r = requests.get(querystring).json()
				results = results + r["results"]
				print ("request #%d - first title: %s") % (p, r["results"][0]["title"])
			except ValueError as e:
				print e
				print "retrying ..."
				time.sleep(1.0)
				continue
			except KeyError as e:
				print e
				print 'Skipping ', str(p)
			break

		if p % 1000 == 0:
			print "set of 1000 - first title is:\n%s" % results[0]["title"]
			filename = "".join([
				"../json/output/nyt_articles_", 
				str(year), 
				"_",
				str(i),
				".json"])
			jsonfiles.write(filename, results)
			i += 1

			# reset results
			results = []

		# sleep so we're not locked out of the API
		time.sleep(0.08)

	# write remaining results
	filename = "".join([
		"../json/output/nyt_articles_", 
		str(year), 
		"_",
		str(i),
		".json"])
	jsonfiles.write(filename, results)
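The while True / try / except block above retries a page until its JSON parses and skips it when the "results" key is missing. That logic could be factored into a small helper so the paging loop stays flat; a sketch under the same assumptions (requests is available and get_querystring is defined elsewhere in the module), with an illustrative function name:

import time
import requests

def fetch_page(year, fields, page, wait=1.0):
    # Retry on malformed JSON (ValueError), as the loop above does;
    # return None on a missing "results" key so the caller can skip the page.
    while True:
        try:
            querystring = get_querystring(year, fields, page)
            return requests.get(querystring).json()["results"]
        except ValueError as e:
            print(e)
            time.sleep(wait)
        except KeyError as e:
            print(e)
            return None

The body of the for loop then reduces to a single call plus a None check before extending results.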
Example #3
def main(year):
	locs_and_articles = jsonfiles.read('../json/output/geocoded_locs_%s.json' % year)
	coords = {} 
	for loc in locs_and_articles:
		lat = locs_and_articles[loc]['lat']
		lon = locs_and_articles[loc]['lon']
		name = loc
		coords[loc] = {'lat': lat, 'lon': lon, 'name': name}

	jsonfiles.write('../json/output/place_to_coord_mappings_%s.json' % year, coords)
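Example #3 only reshapes data: each entry of the geocoded file carries at least lat and lon, and the output keeps just the name and coordinates. The same reshaping as a single dict comprehension, assuming the same locs_and_articles dict:

coords = {
    loc: {'lat': rec['lat'], 'lon': rec['lon'], 'name': loc}
    for loc, rec in locs_and_articles.items()
}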
Example #4
def write_output(filtered_articles, countries_geojson, places_geojson, year):
    # write to file
    try:
        # write out countries and the articles that correspond to them
        filename = "../json/output/countries_%s_v2.json" % year
        jsonfiles.write(filename, countries_geojson)
        # write out places and the articles that correspond to them
        filename = "../json/output/places_%s_v2.json" % year
        jsonfiles.write(filename, places_geojson)
    except IOError as e:
        print e
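Every example here calls into a jsonfiles module that is never shown. Given how read and write are used, it is presumably a thin wrapper around the standard json module; a minimal sketch, assuming exactly that:

import json

def read(path):
    # Parse and return the JSON document stored at path.
    with open(path) as f:
        return json.load(f)

def write(path, obj):
    # Serialize obj as JSON to path. IOError propagates to the caller,
    # which is why write_output above wraps its calls in try/except IOError.
    with open(path, 'w') as f:
        json.dump(obj, f)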
Example #5
def main(argv):
	if len(argv) != 2 or int(argv[0]) < 1980:
		print "Invalid args: " + str(argv)
		return

	year = int(argv[0])
	segments = int(argv[1])
	print "Joining %d files for %d" % (segments, year)
	output = []
	for i in xrange(1, segments + 1):
		filename = ("../json/output/nyt_articles_%s_%d.json" % 
					(str(year), i))
		r = jsonfiles.read(filename)
		print type(r)
		output = output + r
		print len(output)

	outfile_name = "../json/output/nyt_articles_%d_all.json" % year
	jsonfiles.write(outfile_name, output)
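Example #5 needs the segment count passed on the command line. An alternative sketch that discovers the segment files itself with glob; jsonfiles is the same assumed helper, and the function name is illustrative:

import glob
import re
import jsonfiles

def join_all(year):
    # Collect ../json/output/nyt_articles_<year>_<n>.json in numeric order,
    # skipping the *_all.json and *_filtered.json outputs (no numeric suffix).
    pattern = "../json/output/nyt_articles_%d_*.json" % year
    segment_re = re.compile(r"_(\d+)\.json$")
    segments = []
    for filename in glob.glob(pattern):
        m = segment_re.search(filename)
        if m:
            segments.append((int(m.group(1)), filename))

    output = []
    for _, filename in sorted(segments):
        output += jsonfiles.read(filename)
    return output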
Example #6
import csv
import jsonfiles

with open('../json/population_2010.csv', 'rb') as f:
	countries = {}
	lines = []
	for row in csv.reader(f):
		if '2010' in row and 'High variant' in row:
			val = row[3]
			val = int(val.replace('.', ''))

			countries[row[0]] = val

	jsonfiles.write('../json/output/country_populations_2010.json',
	 				countries)
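The val.replace('.', '') line suggests the population figures in the CSV use '.' as a thousands separator. A small helper that makes that parsing explicit (the sample value in the comment is illustrative only):

def parse_population(raw):
    # e.g. "1.234.567" -> 1234567; assumes '.' appears only as a thousands separator.
    return int(raw.replace('.', ''))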
Example #7
import jsonfiles

articles = jsonfiles.read('../json/output/nyt_articles_2012_filtered.json')
country_list = jsonfiles.read('../json/output/countries.json')

countries = {}
for a in articles:
	geo_facets = a['nytd_geo_facet']
	for facet in geo_facets:
		if facet not in country_list:
			continue
		if facet not in countries:
			countries[facet] = {'articles': [], 'article_count': 0}

		countries[facet]['articles'].append(a['url'])
		countries[facet]['article_count'] += 1

jsonfiles.write('../json/output/articles_by_country.json', countries)
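From the countries dict built above, a ranking by article count falls out directly, mirroring what Example #8 does with its locations dict. A sketch using the same assumed jsonfiles helper:

# Rank countries by the number of geotagged articles that mention them.
ranked = sorted(countries.items(),
                key=lambda item: item[1]['article_count'],
                reverse=True)
freq = {name: data['article_count'] for name, data in ranked}
jsonfiles.write('../json/output/article_freq_by_country.json', freq)

Since a plain dict does not keep the ranking, the order lives in the sorted list; the dict only maps name to count, as in Example #8.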
Example #8
def main(year):
    if not USE_CACHED:
        print "Beginning ..."
        # load list of all articles
        filename = "../json/output/nyt_articles_%s_all.json" % str(year)
        all_articles = jsonfiles.read(filename)

        # filter out articles with no nytd_geo_facet property
        filtered_articles = get_geotagged(all_articles)

        # write out all articles with a geo_facet
        filename = "../json/output/nyt_articles_" + str(year) + "_filtered.json"
        jsonfiles.write(filename, filtered_articles)

        # get categorized dict of articles
        # {
        #    "China": [{..}, {..} ... {..}], ...
        # 	 ...
        # }
        locations = categorize(filtered_articles)
        # locations = categorize_from_local(filtered_articles)

        # write out articles that have been geocoded
        jsonfiles.write("../json/output/geocoded_locs_" + str(year) + ".json", locations)
    else:
        filtered_articles = jsonfiles.read("../json/output/nyt_articles_" + str(year) + "_filtered.json")
        locations = jsonfiles.read("../json/output/geocoded_locs_" + str(year) + ".json")

    # the countries of the world, used to split locations into countries vs. places
    countries_dict = jsonfiles.read("../json/output/countries.json")

    # get the number of articles in each country in descending order
    counts = [(loc, len(locations[loc]["articles"])) for loc in locations if loc in countries_dict]
    descending = sorted(counts, key=lambda x: x[1], reverse=True)
    freq = {d[0]: d[1] for d in descending}
    jsonfiles.write("../json/output/article_freq_by_country.json", freq)

    # do the same for places
    counts = [(loc, len(locations[loc]["articles"])) for loc in locations if loc not in countries_dict]
    descending = sorted(counts, key=lambda x: x[1], reverse=True)
    freq = {d[0]: d[1] for d in descending}
    jsonfiles.write("../json/output/article_freq_by_place.json", freq)

    countries_geojson = {"type": "FeatureCollection", "features": []}

    places_geojson = {"type": "FeatureCollection", "features": []}

    g = geocoders.GoogleV3()

    for loc in locations:
        for article in locations[loc]["articles"]:
            feature = get_feature(article, locations[loc])
            if not feature:
                continue
            if loc in countries_dict:
                countries_geojson["features"].append(feature)
            else:
                places_geojson["features"].append(feature)

    print (
        "%d article matches for countries and %d matches for places"
        % (len(countries_geojson["features"]), len(places_geojson["features"]))
    )
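get_geotagged is not shown; per the comment above it drops articles that lack the nytd_geo_facet property. A minimal sketch under that assumption (categorize and get_feature likewise live elsewhere in the module and are not sketched here):

def get_geotagged(articles):
    # Keep only articles that carry a non-empty nytd_geo_facet list.
    return [a for a in articles if a.get('nytd_geo_facet')]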