def get_issues_by_titles(titles=None):
    '''
    For each title calculate the total number of issues on Trove,
    and the number of issues for each year.

    >>> get_issues_by_titles(['32'])
    The Hobart Town Mercury (Tas. : 1857): 142 issues
    [{'total_issues': 142, 'title_id': u'32', 'title_name': u'The Hobart Town Mercury (Tas. : 1857)', 'issues_by_year': {u'1857': 142}}]
    '''
    if titles is None:
        # Avoid a TypeError on 'in None' when no title ids are supplied.
        titles = []
    issue_totals = []
    title_list = json.load(get_url(TITLES_URL))
    for title in title_list:
        if title['id'] in titles:
            title_url = '%s%s' % (TITLE_HOLDINGS_URL, title['id'])
            holdings = json.load(get_url(title_url))
            current_year = holdings[0]['y']
            totals = {}
            total = 0
            for month in holdings:
                if current_year != month['y']:
                    current_year = month['y']
                try:
                    totals[current_year] += int(month['c'])
                except KeyError:
                    totals[current_year] = int(month['c'])
                total += int(month['c'])
            issue_totals.append({'title_id': title['id'],
                                 'title_name': title['name'],
                                 'total_issues': total,
                                 'issues_by_year': totals})
            print '%s: %s issues' % (title['name'], total)
    return issue_totals

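# All of the functions in this module call a shared get_url() helper and a set
# of URL constants (TITLES_URL, TITLE_HOLDINGS_URL, etc.) defined elsewhere.
# The commented-out snippet below is only a minimal stand-in sketch, not the
# original helper: it assumes get_url() opens the URL with urllib2 and returns
# the file-like response that json.load() then reads from. The real helper may
# add headers, retries or throttling.
#
# import urllib2
#
# def get_url(url):
#     # Hypothetical stand-in for the module's get_url() helper.
#     return urllib2.urlopen(url)
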
def get_titles(locate=False):
    '''
    Retrieves a list of current newspaper titles from Trove.
    Retrieves current holdings details about each title.
    Saves details of newspapers with holdings to a list.
    Returns a list of dictionaries with the following fields:
    name, id, state, start_year, start_month, end_year, end_month.
    '''
    title_list = json.load(get_url(TITLES_URL))
    titles = []
    for title in title_list:
        name = title['name']
        print unicode(name).encode('utf-8')
        # Extract place and state from the title name,
        # e.g. 'The Hobart Town Mercury (Tas. : 1857)'
        try:
            place, state = re.search(
                r'\(([a-zA-Z \.]+, )*?(National|ACT|NSW|NT|Qld|QLD|SA|Tas|TAS|Vic|VIC|WA)\.*?',
                name).groups()
        except AttributeError:
            place = None
            state = 'national'
        if locate and place is None and state != 'national':
            locate_title(name)
        url = '%s%s' % (TITLE_HOLDINGS_URL, title['id'])
        holdings = json.load(get_url(url))
        # Only save those that have holdings online
        if len(holdings) > 0:
            titles.append({'name': name,
                           'id': title['id'],
                           'state': state,
                           'place': place,
                           'start_year': holdings[0]['y'],
                           'start_month': holdings[0]['m'],
                           'end_year': holdings[-1]['y'],
                           'end_month': holdings[-1]['m'],
                           })
    return titles

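# Example (assumed usage, not part of the original module): harvest the title
# list with get_titles() above and group the titles by state. Only the
# get_titles() call is from this module; the grouping is an illustrative sketch.

def example_titles_by_state():
    # Build a dict mapping each state to the names of titles published there.
    by_state = {}
    for title in get_titles():
        by_state.setdefault(title['state'], []).append(title['name'])
    for state, names in by_state.items():
        print '%s: %s titles' % (state, len(names))
    return by_state
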
def get_title_issues(title, year):
    title_url = '%s%s' % (TITLE_HOLDINGS_URL, title)
    holdings = json.load(get_url(title_url))
    issues = []
    for month in holdings:
        if month['y'] == str(year):
            month_url = '%s%s/%s' % (MONTH_ISSUES_URL, month['y'], month['m'])
            print month_url
            month_issues = json.load(get_url(month_url))
            for issue in month_issues:
                if issue['t'] == str(title):
                    issue_date = get_issue_date(issue['iss'])
                    issues.append({'id': issue['iss'],
                                   'date': issue_date.isoformat()})
    return issues

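# Example (assumed usage): list the 1857 issues of title '32' using
# get_title_issues() above. The title id and year are taken from the doctest
# for get_issues_by_titles(); the rest is an illustrative sketch.

def example_list_issues():
    for issue in get_title_issues('32', 1857):
        print '%s (issue %s)' % (issue['date'], issue['id'])
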
def get_issue_url(date, title_id):
    '''
    Gets the issue url given a title and date.

    >>> get_issue_url(datetime.date(1925,1,1), '35')
    u'http://trove.nla.gov.au/ndp/del/issue/120168'
    '''
    if type(date) is datetime.date:
        year, month, day = date.timetuple()[:3]
    else:
        year, month, day = (int(num) for num in date.split('-'))
    # Use a locally cached copy of the month's issue list if one exists,
    # otherwise retrieve it from Trove and cache it for next time.
    data_file = os.path.join(ISSUE_DATA_DIR, '%s-%s.js' % (year, month))
    if os.path.exists(data_file):
        with open(data_file, 'rb') as issue_data:
            issues = json.load(issue_data)
    else:
        url = '%s%s/%02d' % (MONTH_ISSUES_URL, year, month)
        issues = json.load(get_url(url))
        with open(data_file, 'wb') as issue_data:
            json.dump(issues, issue_data)
    issue_id = None
    issue_url = None
    for issue in issues:
        if issue['t'] == title_id and int(issue['p']) == day:
            issue_id = issue['iss']
            break
    if issue_id:
        issue_url = '%s%s' % (ISSUE_URL, issue_id)
    else:
        raise IssueError
    return issue_url

def get_issue_date(issue_id):
    issue_url = '%s%s' % (ISSUE_URL, issue_id)
    response = get_url(issue_url)
    page = BeautifulSoup(response.read())
    issue_date = page.find('div', 'issue').strong.string
    issue_datetime = parse_date(issue_date)
    return issue_datetime

def get_issue_totals_years(title_id):
    url = '%s%s/?encoding=json&key=%s&include=years' % (TROVE_TITLE_URL, title_id, TROVE_KEY)
    print url
    results = json.load(get_url(url))
    issues = {}
    for year in results['newspaper']['year']:
        issues[int(year['date'])] = int(year['issuecount'])
    return issues

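# Example (assumed usage): get_issue_totals_years() above returns a dict
# mapping each year to the number of issues held for that year, so the total
# holdings for a title can be summed like this. The title id '35' is just the
# one used in the doctests elsewhere in this module.

def example_total_issues(title_id='35'):
    issues_by_year = get_issue_totals_years(title_id)
    total = sum(issues_by_year.values())
    print 'Title %s: %s issues across %s years' % (title_id, total, len(issues_by_year))
    return total
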
def sample_front_pages(size='thumb'):
    '''
    Retrieve a front page image for every title at monthly intervals.
    '''
    titles = json.load(get_url(TITLES_URL))
    for title in titles:
        print 'Processing: %s' % title['name']
        directory = '%ssamples/%s-%s' % (HARVEST_DIR, title['id'], title['name'])
        if not os.path.exists(directory):
            os.makedirs(directory)
        title_url = '%s%s' % (TITLE_HOLDINGS_URL, title['id'])
        holdings = json.load(get_url(title_url))
        for month in holdings:
            month_url = '%s%s/%s' % (MONTH_ISSUES_URL, month['y'], month['m'])
            issues = json.load(get_url(month_url))
            # Find the first issue of this title in the month.
            for issue in issues:
                if issue['t'] == title['id']:
                    first_issue = issue
                    break
            first_issue_id = first_issue['iss']
            first_issue_date = datetime.date(int(month['y']), int(month['m']), int(first_issue['p']))
            print 'Checking date: %s' % first_issue_date.isoformat()
            page_id = get_front_page_id(first_issue_date, title['id'])
            filename = '%s/%s-%s-%s-%s-p1.jpg' % (directory,
                                                  first_issue_id,
                                                  first_issue_date.isoformat(),
                                                  page_id,
                                                  size)
            if not os.path.exists(filename):
                image = get_front_page_image(None, None, page_id, size=size)
                if image:
                    print 'Saving: %s' % filename
                    with open(filename, 'wb') as f:
                        f.write(image)

def get_front_page_url(date, title_id):
    '''
    Gets the url of the front page given a date and a title

    >>> get_front_page_url(datetime.date(1925,1,1), '35')
    'http://trove.nla.gov.au/ndp/del/page/1223077'
    >>> get_front_page_url('1925-01-01', '35')
    'http://trove.nla.gov.au/ndp/del/page/1223077'
    '''
    issue_url = get_issue_url(date, title_id)
    response = get_url(issue_url)
    return response.geturl()

def get_front_page_image(date, title_id, page_id=None, size='small'):
    '''
    Retrieves jpg of front page.
    Small images are about 300px wide. Thumbs are 150px high.
    '''
    if not page_id:
        page_id = get_front_page_id(date, title_id)
    if size == 'small':
        image_url = '%s%s' % (scrape.IMAGE_PATH, page_id)
    elif size == 'thumb':
        image_url = '%s%s/thumb' % (scrape.IMAGE_PATH, page_id)
    try:
        response = get_url(image_url)
    except HTTPError:
        return None
    else:
        return response.read()

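# Example (assumed usage): fetch the front page of title '35' for 1 January
# 1925 (the date used in the doctests above) and save it to disk. The output
# filename is hypothetical; only the get_front_page_image() call is from this
# module.

def example_save_front_page():
    image = get_front_page_image(datetime.date(1925, 1, 1), '35', size='small')
    if image:
        with open('frontpage-1925-01-01-35.jpg', 'wb') as f:
            f.write(image)
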
def get_front_page_totals():
    categories = {'Article': 'article',
                  'Advertising': 'advertising',
                  'Detailed lists, results, guides': 'lists',
                  'Family Notices': 'family',
                  'Literature': 'literature'}
    output_dir = os.path.join(HARVEST_DIR, 'frontpages')
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    #newspapers = []
    titles = []
    titles_file = os.path.join(output_dir, 'titles.js')
    results = json.load(get_url('%s?encoding=json&key=%s' % (TROVE_TITLES_URL, TROVE_KEY)))
    for newspaper_result in results['response']['records']['newspaper']:
        titles.append([newspaper_result['id'], newspaper_result['title']])
    with open(titles_file, 'wb') as titles_js:
        titles_js.write('var titles = %s;' % json.dumps(titles))
    for newspaper_result in results['response']['records']['newspaper']:
        id = newspaper_result['id']
        print 'Processing: %s' % newspaper_result['title']
        newspaper_dir = os.path.join(output_dir, id)
        if not os.path.exists(newspaper_dir):
            os.makedirs(newspaper_dir)
        years_file = os.path.join(newspaper_dir, 'year_totals.js')
        if not os.path.exists(years_file):
            issues_years = get_issue_totals_years(id)
            #newspaper['years'] = {}
            start_date = datetime.date(*map(int, re.split('[^\d]', newspaper_result['startDate'])))
            end_date = datetime.date(*map(int, re.split('[^\d]', newspaper_result['endDate'])))
            # for each year get month summaries
            year_totals = {}
            num_issues_year = {}
            for year in range(start_date.year, end_date.year + 1):
                print 'Year: %s' % year
                year_totals[year] = {}
                num_issues_year[year] = 0
                num_issues_month = {}
                year_dir = os.path.join(newspaper_dir, str(year))
                if not os.path.exists(year_dir):
                    os.makedirs(year_dir)
                '''
                # First we need to get the number of issues per month
                url = '%s%s/?encoding=json&key=%s&include=years&range=%s0101-%s1231' % (TROVE_TITLE_URL, newspaper['id'], TROVE_KEY, year, year)
                results = json.load(get_url(url))
                for year_issues in results['newspaper']['year']:
                    if year_issues['date'] == str(year):
                        issues_months = {}
                        for issue in year_issues['issue']:
                            issue_date = datetime.date(*map(int, re.split('[^\d]', issue['date'])))
                            try:
                                issues_months[issue_date.month] += 1
                            except KeyError:
                                issues_months[issue_date.month] = 1
                '''
                # Then we can get article details per month
                year_file = os.path.join(newspaper_dir, '%s.js' % year)
                if not os.path.exists(year_file):
                    print 'Getting article details...'
                    month_totals = {}
                    for month in range(1, 13):
                        month_totals[month] = {}
                        issue_totals = {}
                        article_list = {}
                        print 'Month: %s' % month
                        url = '%s&encoding=json&key=%s&q=firstpageseq:1&l-title=%s&l-year=%s&l-month=%02d&reclevel=full&n=100' % (TROVE_API_URL, TROVE_KEY, id, year, month)
                        results = json.load(get_url(url))
                        total = int(results['response']['zone'][0]['records']['total'])
                        if total > 0:
                            articles = results['response']['zone'][0]['records']['article']
                            if total > 100:
                                n = 100
                                s = 0
                                while n == 100:
                                    next_url = '%s&s=%s' % (url, n + s)
                                    print next_url
                                    results = json.load(get_url(next_url))
                                    s = int(results['response']['zone'][0]['records']['s'])
                                    n = int(results['response']['zone'][0]['records']['n'])
                                    if n > 0:
                                        articles.extend(results['response']['zone'][0]['records']['article'])
                            for article in articles:
                                article_date = datetime.date(*map(int, re.split('[^\d]', article['date'])))
                                # Calculate totals for the month
                                if article['category'] != 'Other':
                                    cat = categories[article['category']]
                                    try:
                                        year_totals[year][cat]['total'] += 1
                                        year_totals[year][cat]['words'] += article['wordCount']
                                    except KeyError:
                                        year_totals[year][cat] = {}
                                        year_totals[year][cat]['total'] = 1
                                        year_totals[year][cat]['words'] = article['wordCount']
                                    try:
                                        month_totals[month][cat]['total'] += 1
                                        month_totals[month][cat]['words'] += article['wordCount']
                                    except KeyError:
                                        month_totals[month][cat] = {}
                                        month_totals[month][cat]['total'] = 1
                                        month_totals[month][cat]['words'] = article['wordCount']
                                    # Calculate totals for each issue
                                    try:
                                        issue_totals[article['date']][cat]['total'] += 1
                                        issue_totals[article['date']][cat]['words'] += article['wordCount']
                                    except KeyError:
                                        try:
                                            issue_totals[article['date']][cat] = {}
                                            issue_totals[article['date']][cat]['total'] = 1
                                            issue_totals[article['date']][cat]['words'] = article['wordCount']
                                        except KeyError:
                                            issue_totals[article['date']] = {}
                                            issue_totals[article['date']][cat] = {}
                                            issue_totals[article['date']][cat]['total'] = 1
                                            issue_totals[article['date']][cat]['words'] = article['wordCount']
                                    article_details = {'date': article['date'],
                                                       'heading': article['heading'],
                                                       'category': article['category'],
                                                       'word_count': article['wordCount'],
                                                       'url': article['identifier']}
                                    try:
                                        article_list[article['date']]['page_url'] = article['trovePageUrl']
                                    except KeyError:
                                        article_list[article['date']] = {}
                                        article_list[article['date']]['page_url'] = article['trovePageUrl']
                                    try:
                                        article_list[article['date']]['articles'].append(article_details)
                                    except KeyError:
                                        article_list[article['date']]['articles'] = []
                                        article_list[article['date']]['articles'].append(article_details)
                            for date, details in article_list.items():
                                with open(os.path.join(year_dir, '%s.js' % date), 'wb') as date_js:
                                    json.dump(details, date_js)
                            num_issues_month[month] = len(article_list)
                            num_issues_year[year] += len(article_list)
                            month_file = os.path.join(year_dir, '%s.js' % month)
                            with open(month_file, 'wb') as month_js:
                                json.dump(issue_totals, month_js)
                            '''
                            for category in categories.values():
                                total_list = []
                                words_list = []
                                for issue, values in issue_totals.items():
                                    try:
                                        total_list.append(('Date.UTC(%s, %s, %s)' % (issue.year, issue.month, issue.day), values[category]['total']))
                                        words_list.append(('Date.UTC(%s, %s, %s)' % (issue.year, issue.month, issue.day), values[category]['words']))
                                    except KeyError:
                                        total_list.append(('Date.UTC(%s, %s, %s)' % (issue.year, issue.month, issue.day), 0))
                                        words_list.append(('Date.UTC(%s, %s, %s)' % (issue.year, issue.month, issue.day), 0))
                                month_js.write('var %s_totals = %s;\n' % (category, json.dumps(total_list)))
                                month_js.write('var %s_words = %s;\n' % (category, json.dumps(words_list)))
                            month_js.write('var articles = %s;\n' % json.dumps(article_list))
                            '''
                    for month, values in month_totals.items():
                        # Months with no front page articles never get an entry in
                        # num_issues_month, so default to 0 to avoid a KeyError.
                        num_issues = num_issues_month.get(month, 0)
                        for cat, totals in values.items():
                            total = totals['total']
                            words = totals['words']
                            if total > 0:
                                totals['total'] = float(total) / num_issues
                            if words > 0:
                                totals['words'] = float(words) / num_issues
                    with open(year_file, 'wb') as year_js:
                        json.dump(month_totals, year_js)
                        '''
                        for category in categories.values():
                            total_list = []
                            words_list = []
                            for month, values in month_totals.items():
                                try:
                                    total = values[category]['total']
                                    words = values[category]['words']
                                except KeyError:
                                    total = 0
                                    words = 0
                                num_issues = num_issues_month[month]
                                if total > 0:
                                    total = float(total) / num_issues
                                if words > 0:
                                    words = float(words) / num_issues
                                total_list.append((month, total))
                                words_list.append((month, words))
                            year_js.write('var %s_totals = %s;\n' % (category, json.dumps(total_list)))
                            year_js.write('var %s_words = %s;\n' % (category, json.dumps(words_list)))
                        '''
                        #print 'No %s' % category
            # Then we can get articles by month facets
            '''
            print 'Getting totals by month...'
            newspaper['years'][year]['months'] = {}
            for category, label in categories.items():
                #print url
                url = '%s&encoding=json&key=%s&q=firstpageseq:1&l-title=%s&l-category=%s&l-year=%s&facet=month&n=0' % (TROVE_API_URL, TROVE_KEY, newspaper['id'], quote_plus(category), year)
                results = json.load(get_url(url))
                try:
                    months = results['response']['zone'][0]['facets']['facet']['term']
                except TypeError:
                    months = []
                for month_result in months:
                    month = int(month_result['search'])
                    count = float(month_result['count'])
                    if count != 0:
                        try:
                            count = count / issues[month]
                        except KeyError:
                            count = 0
                    try:
                        newspaper['years'][year]['months'][month][label]['total'] = count
                    except KeyError:
                        try:
                            newspaper['years'][year]['months'][month][label] = {}
                            newspaper['years'][year]['months'][month][label]['total'] = count
                        except KeyError:
                            newspaper['years'][year]['months'][month] = {}
                            newspaper['years'][year]['months'][month][label] = {}
                            newspaper['years'][year]['months'][month][label]['total'] = count
                    try:
                        newspaper['years'][year]['months'][month][label]['words'] = month_totals[month][label]['words']
                    except KeyError:
                        newspaper['years'][year]['months'][month][label]['words'] = 0
            year_file = os.path.join(newspaper_dir, '%s.js' % year)
            print 'Writing %s' % year_file
            with open(year_file, 'wb') as year_js:
                for category in categories.values():
                    try:
                        totals = [(month, values[category]['total']) for month, values in newspaper['years'][year]['months'].items()]
                        print totals
                        year_js.write('var %s_totals = %s;\n' % (category, json.dumps(totals)))
                        words = [(month, values[category]['words']) for month, values in newspaper['years'][year]['months'].items()]
                        year_js.write('var %s_words = %s;\n' % (category, json.dumps(words)))
                    except KeyError:
                        print 'No %s' % category
            # for each decade get year summaries
            print 'Getting totals by year...'
            start_decade = str(start_date.year)[:3]
            end_decade = str(end_date.year)[:3]
            for decade in range(int(start_decade), int(end_decade) + 1):
                for category, label in categories.items():
                    url = '%s&encoding=json&key=%s&q=firstpageseq:1&l-title=%s&l-category=%s&l-decade=%s&facet=year&n=0' % (TROVE_API_URL, TROVE_KEY, newspaper['id'], quote_plus(category), decade)
                    for num in range(0, 10):
                        year = int('%s%s' % (decade, num))
                        try:
                            newspaper['years'][year][label] = {}
                        except KeyError:
                            newspaper['years'][year] = {}
                            newspaper['years'][year][label] = {}
                    results = json.load(get_url(url))
                    try:
                        years = results['response']['zone'][0]['facets']['facet']['term']
                    except TypeError:
                        years = []
                    for year_result in years:
                        year = int(year_result['display'])
                        count = float(year_result['count'])
                        if count != 0:
                            count = count / newspaper['issues'][year]
                        newspaper['years'][year][label]['total'] = count
                        try:
                            newspaper['years'][year][label]['words'] = year_totals[year][label]['words']
                        except KeyError:
                            newspaper['years'][year][label]['words'] = 0
            print 'Writing %s' % years_file
            with open(years_file, 'wb') as years_js:
                for category in categories.values():
                    try:
                        totals = [(year, values[category]['total']) for year, values in newspaper['years'].items()]
                        years_js.write('var %s_totals = %s;\n' % (category, json.dumps(totals)))
                        words = [(year, values[category]['words']) for year, values in newspaper['years'].items()]
                        years_js.write('var %s_words = %s;\n' % (category, json.dumps(words)))
                    except KeyError:
                        print 'No %s' % category
            '''
            print 'Getting totals for this year...'
            for year, values in year_totals.items():
                num_issues = num_issues_year[year]
                for cat, totals in values.items():
                    total = totals['total']
                    words = totals['words']
                    if total > 0:
                        totals['total'] = float(total) / num_issues
                    if words > 0:
                        totals['words'] = float(words) / num_issues
            with open(years_file, 'wb') as years_js:
                json.dump(year_totals, years_js)

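# Example (assumed usage): get_front_page_totals() above writes its output
# under HARVEST_DIR/frontpages/<title id>/ -- a 'year_totals.js' file plus one
# file per year and per month, each written with json.dump(). The sketch below
# just reads one of those files back; the title id '35' is hypothetical.

def example_read_year_totals(title_id='35'):
    totals_file = os.path.join(HARVEST_DIR, 'frontpages', title_id, 'year_totals.js')
    with open(totals_file, 'rb') as totals_js:
        year_totals = json.load(totals_js)
    for year, cats in sorted(year_totals.items()):
        print year, cats
    return year_totals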