def main(year_str): fields = jsonfiles.read("../json/fields.json") year = int(year_str) print 'Getting articles for the year ' + str(year) results = [] # make request once to get the number of pages in the response querystring = get_querystring(year, fields, 0) r = requests.get(querystring).json() results = results + r["results"] pages = r["total"] / 10 # make request for remaining pages # write out every 1000 requests i = 1 last_start = 1 for p in xrange(last_start, pages + 1): # try until request succeeds while True: try: querystring = get_querystring(year, fields, p) r = requests.get(querystring).json() results = results + r["results"] print ("request #%d - first title: %s") % (p, r["results"][0]["title"]) except ValueError as e: print e print "retrying ..." time.sleep(1.0) continue except KeyError as e: print e print 'Skipping ', str(p) break if p % 1000 == 0: print "set of 1000 - first title is:\n%s" % results[0]["title"] filename = "".join([ "../json/output/nyt_articles_", str(year), "_", str(i), ".json"]) jsonfiles.write(filename, results) i += 1 # reset results results = [] # sleep so we're not locked out of the API time.sleep(0.08) # write remaining results filename = "".join([ "../json/output/nyt_articles_", str(year), "_", str(i), ".json"]) jsonfiles.write(filename, results)
def main(year):
    """Collapse the geocoded-locations file for *year* into a plain
    place -> {lat, lon, name} mapping and write it back out."""
    source_path = '../json/output/geocoded_locs_%s.json' % year
    locs_and_articles = jsonfiles.read(source_path)
    # keep only the coordinate fields; the place name doubles as key and value
    coords = {
        place: {'lat': entry['lat'], 'lon': entry['lon'], 'name': place}
        for place, entry in locs_and_articles.items()
    }
    target_path = '../json/output/place_to_coord_mappings_%s.json' % year
    jsonfiles.write(target_path, coords)
def main(argv): if len(argv) != 2 or int(argv[0]) < 1980: print "Invalid args: " + str(argv) return year = int(argv[0]) segments = int(argv[1]) print "Joining %d files for %d" % (segments, year) output = [] for i in xrange(1, segments + 1): filename = ("../json/output/nyt_articles_%s_%d.json" % (str(year), i)) r = jsonfiles.read(filename) print type(r) output = output + r print len(output) outfile_name = "../json/output/nyt_articles_%d_all.json" % year jsonfiles.write(outfile_name, output)
def main(filename):
    """Rewrite a GeoJSON file so that each feature's coordinate pair is
    flipped ([a, b] -> [b, a]) and its article is trimmed to a fixed set
    of fields; the result is written alongside as <name>_fixed.json."""
    jsonfile = jsonfiles.read("../json/world/" + filename)
    new_features = []
    for feature in jsonfile["features"]:
        coord_pair = feature["geometry"]["coordinates"]
        props = feature["properties"]
        source_article = props["article"]
        # swap the coordinate order and keep only the article fields we need
        new_features.append({
            "geometry": {
                "type": "Point",
                "coordinates": [coord_pair[1], coord_pair[0]],
            },
            "type": "Feature",
            "properties": {
                "article": {
                    "date": source_article["date"],
                    "url": source_article["url"],
                    "title": source_article["title"],
                    "nytd_geo_facet": source_article["nytd_geo_facet"],
                },
                "name": props["name"],
            },
        })
    new_fname = "../json/world/" + filename[:filename.index(".")] + "_fixed.json"
    jsonfiles.write_min(
        new_fname, {"type": "FeatureCollection", "features": new_features})
import jsonfiles

# Bucket the filtered 2012 articles' URLs by country: any geo facet that
# appears in the countries list gets an article list and a running count.
articles = jsonfiles.read('../json/output/nyt_articles_2012_filtered.json')
country_list = jsonfiles.read('../json/output/countries.json')

countries = {}
for article in articles:
    for facet in article['nytd_geo_facet']:
        # non-country facets (cities, regions, ...) are ignored
        if facet in country_list:
            bucket = countries.setdefault(
                facet, {'articles': [], 'article_count': 0})
            bucket['articles'].append(article['url'])
            bucket['article_count'] += 1

jsonfiles.write('../json/output/articles_by_country.json', countries)
def main(year):
    # Build article-frequency files (by country and by place) and GeoJSON
    # feature collections for every geotagged article in *year*.
    # USE_CACHED (module-level flag) switches between recomputing the
    # filtered/geocoded intermediates and reloading them from disk.
    if not USE_CACHED:
        print "Beginning ..."
        # load list of all articles
        filename = "../json/output/nyt_articles_%s_all.json" % str(year)
        all_articles = jsonfiles.read(filename)
        # filter out articles with no nytd_geo_facet property
        filtered_articles = get_geotagged(all_articles)
        # write out all articles with a geo_facet
        filename = "../json/output/nyt_articles_" + str(year) + "_filtered.json"
        jsonfiles.write(filename, filtered_articles)
        # get categorized dict of articles
        # {
        #   "China": [{..}, {..} ... {..}], ...
        #   ...
        # }
        locations = categorize(filtered_articles)
        # locations = categorize_from_local(filtered_articles)
        # write out articles that have been geocoded
        jsonfiles.write("../json/output/geocoded_locs_" + str(year) + ".json", locations)
    else:
        # reuse the intermediates written by a previous uncached run
        filtered_articles = jsonfiles.read("../json/output/nyt_articles_" + str(year) + "_filtered.json")
        locations = jsonfiles.read("../json/output/geocoded_locs_" + str(year) + ".json")
    # list of the countries in the world
    countries_dict = jsonfiles.read("../json/output/countries.json")
    # get the number of articles in each country in descending order
    counts = [(loc, len(locations[loc]["articles"])) for loc in locations if loc in countries_dict]
    descending = sorted(counts, key=lambda x: x[1])
    descending.reverse()
    # dicts are unordered here; the sort only fixes which pairs went in first
    freq = {d[0]: d[1] for d in descending}
    jsonfiles.write("../json/output/article_freq_by_country.json", freq)
    # do the same for places (locations that are not countries)
    counts = [(loc, len(locations[loc]["articles"])) for loc in locations if loc not in countries_dict]
    descending = sorted(counts, key=lambda x: x[1])
    descending.reverse()
    freq = {d[0]: d[1] for d in descending}
    jsonfiles.write("../json/output/article_freq_by_place.json", freq)
    countries_geojson = {"type": "FeatureCollection", "features": []}
    places_geojson = {"type": "FeatureCollection", "features": []}
    # NOTE(review): g is never used in this function -- presumably
    # get_feature relies on a geocoder elsewhere; confirm before removing.
    g = geocoders.GoogleV3()
    for loc in locations:
        for article in locations[loc]["articles"]:
            # get_feature may return a falsy value when the article
            # cannot be turned into a GeoJSON feature; skip those
            feature = get_feature(article, locations[loc])
            if not feature:
                continue
            if loc in countries_dict:
                countries_geojson["features"].append(feature)
            else:
                places_geojson["features"].append(feature)
    print (
        "%d article matches for countries and %d matches for places"
        % (len(countries_geojson["features"]), len(places_geojson["features"]))
    )
import jsonfiles articles = jsonfiles.read('../json/output/nyt_articles_2012_filtered.json') countries = {} for a in articles: try: geo_facets = a['nytd_geo_facet'] except KeyError as e: print e try: geo_facets = a['geo_facet'] except KeyError as e: print e continue for facet in geo_facets: if facet not in countries: countries[facet] = [] countries[facet].append[a['url']]
import jsonfiles import json import pprint country_freq = jsonfiles.read('../json/output/article_freq_by_country.json') place_freq = jsonfiles.read('../json/output/article_freq_by_place.json') articles = jsonfiles.read('../json/output/nyt_articles_2012_filtered.json') coords = jsonfiles.read('../json/output/place_to_coord_mappings.json') def main(): country_geojson = {'type': 'FeatureCollection', 'features': []} place_geojson = {'type': 'FeatureCollection', 'features': []} country_features = {} place_features = {} for a in articles: locs = a['nytd_geo_facet'] for loc in locs: if loc in country_freq: if loc not in country_features: country_features[loc] = get_feature(loc) feat = country_features[loc] # add article and increment article count feat['properties']['articles'].append(a) feat['properties']['article_count'] += 1 elif loc in place_freq: if loc not in place_features: place_features[loc] = get_feature(loc) feat = place_features[loc] # add article and increment article count feat['properties']['articles'].append(a)