示例#1
0
def main(year_str):
	"""Download every result page for one year and write the articles out in batches.

	Page 0 is requested first to learn the total number of results; the
	remaining pages are then fetched one at a time. Accumulated results are
	flushed to ``../json/output/nyt_articles_<year>_<n>.json`` every 1000
	pages, and whatever is left over is written to a final file at the end.

	A page whose request raises ``ValueError`` is retried (after a one second
	pause) until it succeeds; a page whose response raises ``KeyError`` is
	skipped. A page that comes back with no results is tolerated rather than
	crashing on the "first title" log line.

	Args:
		year_str: The year to fetch; anything ``int()`` accepts.

	Raises:
		ValueError: If ``year_str`` cannot be converted to an integer, or if
			the very first request fails with ``ValueError`` (only the
			follow-up pages are retried).
	"""
	fields = jsonfiles.read("../json/fields.json")
	year = int(year_str)
	print('Getting articles for the year ' + str(year))

	def batch_filename(batch):
		# One output file per batch of pages.
		return "".join([
			"../json/output/nyt_articles_",
			str(year),
			"_",
			str(batch),
			".json"])

	# make request once to get the number of pages in the response
	querystring = get_querystring(year, fields, 0)
	r = requests.get(querystring).json()
	results = []
	results.extend(r["results"])

	# Pages are requested 0-indexed and treated as holding 10 results each,
	# so the last page that can contain data is (total - 1) // 10. Using
	# total // 10 would request one extra, empty page whenever the total is
	# an exact multiple of 10.
	last_page = (r["total"] - 1) // 10

	# make request for remaining pages
	# write out every 1000 requests
	batch = 1
	for p in range(1, last_page + 1):
		# try until request succeeds
		while True:
			try:
				querystring = get_querystring(year, fields, p)
				r = requests.get(querystring).json()
				page_results = r["results"]
				# extend in place instead of rebuilding the whole list each page
				results.extend(page_results)
				if page_results:
					print("request #%d - first title: %s" % (p, page_results[0]["title"]))
				else:
					# an empty page must not abort the whole download
					print("request #%d - no results" % p)
			except ValueError as e:
				print(e)
				print("retrying ...")
				time.sleep(1.0)
				continue
			except KeyError as e:
				print(e)
				print(' '.join(['Skipping ', str(p)]))
			break

		if p % 1000 == 0:
			# results can be empty if every page in this batch was skipped
			if results:
				print("set of 1000 - first title is:\n%s" % results[0]["title"])
			jsonfiles.write(batch_filename(batch), results)
			batch += 1

			# reset results
			results = []

		# sleep so we're not locked out of the API
		time.sleep(0.08)

	# write remaining results
	jsonfiles.write(batch_filename(batch), results)
def main(year):
	"""Extract a place -> coordinates mapping from the geocoded locations file.

	Reads ``../json/output/geocoded_locs_<year>.json`` and writes
	``../json/output/place_to_coord_mappings_<year>.json``, where every place
	maps to a dict holding its ``lat``, its ``lon`` and its own name.

	Args:
		year: Value substituted into both file names.
	"""
	geocoded = jsonfiles.read('../json/output/geocoded_locs_%s.json' % year)

	# Keep only the coordinates of each entry and repeat the key as "name".
	mappings = {
		place: {
			'lat': geocoded[place]['lat'],
			'lon': geocoded[place]['lon'],
			'name': place,
		}
		for place in geocoded
	}

	jsonfiles.write('../json/output/place_to_coord_mappings_%s.json' % year, mappings)
示例#3
0
def main(argv):
	"""Join the per-batch article files of one year into a single JSON file.

	Reads ``../json/output/nyt_articles_<year>_<i>.json`` for ``i`` from 1 to
	``segments`` and writes their concatenation to
	``../json/output/nyt_articles_<year>_all.json``.

	Args:
		argv: Exactly two items, ``[year, segments]``. Both must be
			convertible with ``int()`` and the year must be at least 1980.

	Returns:
		None. When the arguments are invalid a message is printed and nothing
		is read or written.
	"""
	parsed = None
	if len(argv) == 2:
		try:
			parsed = (int(argv[0]), int(argv[1]))
		except (TypeError, ValueError):
			# non-numeric year or segment count: report it instead of crashing
			parsed = None

	if parsed is None or parsed[0] < 1980:
		print("Invalid args: " + str(argv))
		return

	year, segments = parsed
	print("Joining %d files for %d" % (segments, year))
	output = []
	for i in range(1, segments + 1):
		filename = ("../json/output/nyt_articles_%s_%d.json" %
					(str(year), i))
		r = jsonfiles.read(filename)
		print(type(r))
		# extend in place; rebuilding the list for every file is quadratic
		output.extend(r)
		print(len(output))

	outfile_name = "../json/output/nyt_articles_%d_all.json" % year
	jsonfiles.write(outfile_name, output)
示例#4
0
def main(filename):
	"""Write a copy of a feature collection with each point's coordinates swapped.

	Reads ``../json/world/<filename>``, and for every feature emits a new
	"Point" feature whose first two coordinates are exchanged (presumably to
	correct a lat/lon vs. lon/lat mix-up - confirm against the data). Only the
	``date``, ``url``, ``title`` and ``nytd_geo_facet`` fields of the article
	and the feature's ``name`` are carried over. The result is written next to
	the input as ``<stem>_fixed.json``, where ``<stem>`` is the part of
	``filename`` before its first ".".

	Args:
		filename: Name of the input file inside ``../json/world/``. A name
			without any "." is used whole as the stem.
	"""
	world_dir = "../json/world/"
	collection = jsonfiles.read(world_dir + filename)

	new_features = []
	for feature in collection["features"]:
		coordinates = feature["geometry"]["coordinates"]
		first = coordinates[0]
		second = coordinates[1]

		article = feature["properties"]["article"]
		new_features.append({
			"geometry": {
				"type": "Point",
				# swapped on purpose: second coordinate first
				"coordinates": [second, first],
			},
			"type": "Feature",
			"properties": {
				"article": {
					"date": article["date"],
					"url": article["url"],
					"title": article["title"],
					"nytd_geo_facet": article["nytd_geo_facet"],
				},
				"name": feature["properties"]["name"],
			},
		})

	# partition() yields the text before the first "." like index() did, but
	# does not raise when the name has no extension at all.
	stem = filename.partition(".")[0]
	new_fname = world_dir + stem + "_fixed.json"
	jsonfiles.write_min(new_fname, {"type": "FeatureCollection", "features": new_features})
import jsonfiles

# Group the URLs of the filtered 2012 articles by the countries they are
# tagged with, keeping a running count per country.
articles = jsonfiles.read('../json/output/nyt_articles_2012_filtered.json')
country_list = jsonfiles.read('../json/output/countries.json')

countries = {}
for article in articles:
	for facet in article['nytd_geo_facet']:
		# only facets that name a known country are recorded
		if facet in country_list:
			if facet not in countries:
				countries[facet] = {'articles': [], 'article_count': 0}
			entry = countries[facet]
			entry['articles'].append(article['url'])
			entry['article_count'] += 1

jsonfiles.write('../json/output/articles_by_country.json', countries)
示例#6
0
def main(year):
    """Build per-location article statistics and GeoJSON features for one year.

    When ``USE_CACHED`` is false, the year's full article list is loaded,
    reduced with ``get_geotagged``, written out as the "filtered" file,
    grouped with ``categorize`` and written out as the "geocoded_locs" file.
    Otherwise both intermediate files are read back from disk.

    Article counts per location are then written to
    ``article_freq_by_country.json`` (locations listed in ``countries.json``)
    and ``article_freq_by_place.json`` (all other locations). Finally one
    feature per article is collected into a country and a place feature
    collection, and the sizes of both collections are printed.

    Args:
        year: Year used to build the input/output file names.
    """
    year_text = str(year)
    filtered_path = "../json/output/nyt_articles_" + year_text + "_filtered.json"
    geocoded_path = "../json/output/geocoded_locs_" + year_text + ".json"

    if not USE_CACHED:
        print("Beginning ...")
        # load list of all articles
        all_articles = jsonfiles.read("../json/output/nyt_articles_%s_all.json" % year_text)

        # filter out articles with no nytd_geo_facet property
        filtered_articles = get_geotagged(all_articles)

        # write out all articles with a geo_facet
        jsonfiles.write(filtered_path, filtered_articles)

        # group the articles by location; each entry is used below as a
        # record holding an "articles" list
        locations = categorize(filtered_articles)

        # write out articles that have been geocoded
        jsonfiles.write(geocoded_path, locations)
    else:
        filtered_articles = jsonfiles.read(filtered_path)
        locations = jsonfiles.read(geocoded_path)

    # list of the countries in the world
    countries_dict = jsonfiles.read("../json/output/countries.json")

    def article_freq(for_countries):
        # Map each location to its number of articles, inserted in descending
        # order of count; for_countries selects countries vs. other places.
        counts = [
            (loc, len(locations[loc]["articles"]))
            for loc in locations
            if (loc in countries_dict) == for_countries
        ]
        descending = sorted(counts, key=lambda pair: pair[1])
        descending.reverse()
        return {loc: count for loc, count in descending}

    # get the number of articles in each country in descending order
    jsonfiles.write("../json/output/article_freq_by_country.json", article_freq(True))

    # do the same for places
    jsonfiles.write("../json/output/article_freq_by_place.json", article_freq(False))

    countries_geojson = {"type": "FeatureCollection", "features": []}
    places_geojson = {"type": "FeatureCollection", "features": []}

    for loc in locations:
        # countries and other places go into separate collections
        if loc in countries_dict:
            target = countries_geojson["features"]
        else:
            target = places_geojson["features"]
        for article in locations[loc]["articles"]:
            feature = get_feature(article, locations[loc])
            if not feature:
                continue
            target.append(feature)

    print(
        "%d article matches for countries and %d matches for places"
        % (len(countries_geojson["features"]), len(places_geojson["features"]))
    )
import jsonfiles

articles = jsonfiles.read('../json/output/nyt_articles_2012_filtered.json')

# Map every geo facet to the URLs of the articles tagged with it. The facets
# are taken from 'nytd_geo_facet', falling back to 'geo_facet'; an article
# with neither key is skipped (each missing key is printed).
countries = {}
for a in articles:
	try:
		geo_facets = a['nytd_geo_facet']
	except KeyError as e:
		print(e)
		try:
			geo_facets = a['geo_facet']
		except KeyError as e:
			print(e)
			continue
	for facet in geo_facets:
		if facet not in countries:
			countries[facet] = []

		# append is a method call; subscripting it (append[...]) raised TypeError
		countries[facet].append(a['url'])
import jsonfiles
import json
import pprint

# Input data, loaded once when the module is imported and read by main().
# country_freq / place_freq are only used for membership tests on location
# names in the visible code.
country_freq = jsonfiles.read('../json/output/article_freq_by_country.json')
place_freq = jsonfiles.read('../json/output/article_freq_by_place.json')
articles = jsonfiles.read('../json/output/nyt_articles_2012_filtered.json')
# NOTE(review): this path has no year suffix, whereas a mapping writer seen
# elsewhere produces place_to_coord_mappings_<year>.json - confirm that this
# un-suffixed file actually exists.
coords = jsonfiles.read('../json/output/place_to_coord_mappings.json')

def main():

	country_geojson = {'type': 'FeatureCollection', 'features': []}
	place_geojson = {'type': 'FeatureCollection', 'features': []}
	country_features = {}
	place_features = {}
	for a in articles:
		locs = a['nytd_geo_facet']
		for loc in locs:
			if loc in country_freq:
				if loc not in country_features: 
					country_features[loc] = get_feature(loc)
				feat = country_features[loc] 
				# add article and increment article count
				feat['properties']['articles'].append(a)
				feat['properties']['article_count'] += 1
			elif loc in place_freq:
				if loc not in place_features: 
					place_features[loc] = get_feature(loc)
				feat = place_features[loc] 
				# add article and increment article count
				feat['properties']['articles'].append(a)