def check_csv(file_name, year=None, exclude=None):
    '''
    Check a harvested CSV file for duplicate and missing editorials.

    file_name: path to the CSV of articles (column 6 holds the issue date,
               parsed with parse_date()).
    year: check every day in the specified year; otherwise the span between
          the earliest and latest article dates found in the file is used.
    exclude: list of weekday numbers to exclude from checking
             (eg. to exclude Sunday [6]).

    Side effects:
      * appends a placeholder row (issue_date only) to file_name for each
        missing date;
      * writes an HTML report 'csv_check.html' next to file_name.
    '''
    # Avoid the shared mutable-default-argument pitfall.
    if exclude is None:
        exclude = []
    article_dates = []
    missing_dates = []
    # 'rb' mode: this module targets Python 2's csv, which wants binary files.
    with open(file_name, 'rb') as csv_in:
        articles = csv.reader(csv_in, delimiter=',', quotechar='"')
        for article in articles:
            # Column 6 holds the issue date string.
            article_dates.append(parse_date(article[6]))
    article_dates.sort()
    duplicates = find_duplicates(article_dates)
    print('Duplicates: %s' % len(duplicates))
    if year:
        start_date = datetime.date(year, 1, 1)
        end_date = datetime.date(year, 12, 31)
    elif article_dates:
        start_date = article_dates[0]
        end_date = article_dates[-1]
    else:
        # Empty CSV and no year supplied: nothing to scan (the original
        # crashed with IndexError here).
        start_date = end_date = None
    one_day = datetime.timedelta(days=1)
    # Set membership is O(1) per lookup; the sorted list would be O(n).
    known_dates = set(article_dates)
    this_day = start_date
    # Loop through each day in the specified period to see if there's an
    # article. If not, add to the missing_dates list.
    while this_day is not None and this_day <= end_date:
        if this_day.weekday() not in exclude:  # eg. exclude Sunday
            if this_day not in known_dates:
                missing_dates.append(this_day)
        this_day += one_day
    print('Missing: %s' % len(missing_dates))
    fieldnames = ['id', 'title', 'url', 'newspaper_title',
                  'newspaper_details', 'newspaper_id', 'issue_date',
                  'page', 'page_url', 'corrections', 'ftext']
    results_path = os.path.join(os.path.dirname(file_name), 'csv_check.html')
    # Placeholder rows for missing dates are appended to the source CSV
    # while the HTML report is written alongside it.
    with open(file_name, 'ab') as csv_append, \
            open(results_path, 'wb') as results:
        csv_out = csv.DictWriter(csv_append, extrasaction='ignore',
                                 fieldnames=fieldnames, dialect=csv.excel)
        results.write('<html>\n<head>\n <title>Results</title>\n</head>\n<body>')
        results.write('<h2>Duplicates:</h2>\n<table>\n')
        for dup in duplicates:
            results.write(
                '<tr><td>%s</td><td><a href="%s">View issue</a></td></tr>\n'
                % (format_date(dup), get_issue_url(dup, '35')))
        results.write('</table>\n')
        results.write('<h2>Missing:</h2>\n<table>\n')
        for missing in missing_dates:
            results.write(
                '<tr><td>%s</td><td><a href="%s">View issue</a></td></tr>\n'
                % (format_date(missing), get_issue_url(missing, '35')))
            csv_out.writerow({'issue_date': format_date(missing)})
        results.write('</table>\n')
        results.write('</body>\n</html>')
def check_csv(file_name, year=None, exclude=[]):
    '''
    Report duplicate and missing editorial dates found in a harvest CSV.

    NOTE(review): this is a duplicate of the check_csv defined earlier in
    this file; being second, it is the definition that takes effect.

    year: check every day in the specified year; otherwise use the span of
          dates present in the file.
    exclude: weekday numbers to skip, eg. [6] to ignore Sundays.
    '''
    reader = csv.reader(open(file_name, 'rb'), delimiter=',', quotechar='"')
    # Column 6 of each row holds the issue date.
    article_dates = sorted(parse_date(row[6]) for row in reader)
    duplicates = find_duplicates(article_dates)
    print('Duplicates: %s' % len(duplicates))
    if year:
        first = datetime.date(year, 1, 1)
        last = datetime.date(year, 12, 31)
    else:
        first = article_dates[0]
        last = article_dates[-1]
    missing_dates = []
    # Visit every day of the period and note those without an article.
    for offset in range((last - first).days + 1):
        day = first + datetime.timedelta(days=offset)
        if day.weekday() not in exclude and day not in article_dates:
            missing_dates.append(day)
    print('Missing: %s' % len(missing_dates))
    csv_out = csv.DictWriter(open(file_name, 'ab'), extrasaction='ignore',
                             fieldnames=['id', 'title', 'url',
                                         'newspaper_title',
                                         'newspaper_details', 'newspaper_id',
                                         'issue_date', 'page', 'page_url',
                                         'corrections', 'ftext'],
                             dialect=csv.excel)
    report_path = os.path.join(os.path.dirname(file_name), 'csv_check.html')
    row_template = ('<tr><td>%s</td><td><a href="%s">View issue</a>'
                    '</td></tr>\n')
    # Write a results file with nicely formatted dates.
    with open(report_path, 'wb') as report:
        report.write('<html>\n<head>\n <title>Results</title>\n</head>\n<body>')
        report.write('<h2>Duplicates:</h2>\n<table>\n')
        for day in duplicates:
            report.write(row_template % (format_date(day),
                                         get_issue_url(day, '35')))
        report.write('</table>\n')
        report.write('<h2>Missing:</h2>\n<table>\n')
        for day in missing_dates:
            report.write(row_template % (format_date(day),
                                         get_issue_url(day, '35')))
            # A placeholder row is appended to the source CSV for each gap.
            csv_out.writerow({'issue_date': format_date(day)})
        report.write('</table>\n')
        report.write('</body>\n</html>')
def _article_record(url, title, date_iso, page_number, word_count):
    # Helper: build the summary dict recorded for every categorised article.
    return {'url': url, 'title': title, 'date': date_iso,
            'page': page_number, 'length': word_count}


def check_editorial(csv_file, year):
    '''
    Audit one newspaper's editorial harvest for a given year.

    Checks for:
      * duplicate issue dates
      * suspicious word counts (< 100 or > 1500 words)
      * odd page numbers (editorials are expected on even pages)
      * issues with no matching article (missing)

    CSV columns used (by position): 1 title, 2 url, 3 newspaper title,
    5 newspaper id, 6 issue date, 7 page, 10 full text.

    Dumps the results as JSON under NMA_FOLDER/json/ and returns the
    results dict.
    '''
    print(csv_file)
    title_id = None
    title_name = None
    article_count = 0
    article_dates = []
    articles_filtered = []
    odd_pages = []
    short_articles = []
    long_articles = []
    missing_dates = []
    # 'rb' mode: this module targets Python 2's csv module.
    with open(csv_file, 'rb') as csv_in:
        articles = csv.reader(csv_in, delimiter=',', quotechar='"')
        for article in articles:
            article_count += 1
            # Title id/name repeat on every row; keep the first one seen.
            if not title_id:
                title_id = article[5]
            if not title_name:
                title_name = article[3]
            try:
                page_number = int(article[7])
            except ValueError:
                # Fall back to the first run of digits, eg. '2 S' -> 2.
                # NOTE(review): still raises if no digit at all is present.
                page_number = int(re.search(r'(\d+)', article[7]).group(1))
            date_iso = parse_date(article[6]).isoformat()
            word_count = len(article[10].split())
            record = _article_record(article[2], article[1], date_iso,
                                     page_number, word_count)
            # Editorials are expected on even pages with a sane word count;
            # anything else is bucketed for review.
            if page_number % 2:
                odd_pages.append(record)
            elif word_count < 100:
                short_articles.append(record)
            elif word_count > 1500:
                long_articles.append(record)
            else:
                article_dates.append(date_iso)
                articles_filtered.append(record)
    # Duplicated dates, each reported once.
    duplicate_dates = sorted(set(find_duplicates(article_dates)))
    # Remove duplicates from the found dates.
    article_dates = sorted(set(article_dates))
    issues = get_title_issues(title_id, year)
    # Set membership is O(1) per issue; the list would be O(n).
    found_dates = set(article_dates)
    for issue in issues:
        if issue['date'] not in found_dates:
            missing_dates.append(issue['date'])
    print('Total articles: %s' % article_count)
    print('Odd pages: %s' % len(odd_pages))
    print('Short articles: %s' % len(short_articles))
    print('Long articles: %s' % len(long_articles))
    print('Duplicate dates: %s' % len(duplicate_dates))
    print('Found articles: %s' % len(articles_filtered))
    print('Missing articles: %s' % len(missing_dates))
    results = {'id': title_id,
               'name': title_name,
               'odd': odd_pages,
               'short': short_articles,
               'long': long_articles,
               'articles': articles_filtered,
               'duplicates': duplicate_dates,
               'missing': missing_dates,
               'issues': len(issues)}
    # splitext is safer than [:-4] for stripping the file extension.
    base = os.path.splitext(os.path.basename(csv_file))[0]
    json_file = '%s/json/%s.js' % (NMA_FOLDER, base)
    print(json_file)
    with open(json_file, 'wb') as json_data:
        json.dump(results, json_data, indent=2)
    return results
def check_editorial(csv_file, year):
    '''
    Audit one newspaper's editorial harvest: duplicate issue dates, word
    counts (< 100 or > 1500 words flagged), odd page numbers, and issues
    with no matching article; dump the results to JSON under
    NMA_FOLDER/json/ and return the results dict.

    NOTE(review): duplicate of the check_editorial defined earlier in this
    file; being later, this definition is the one in effect.
    '''
    print csv_file
    articles = csv.reader(open(csv_file, 'rb'), delimiter=',', quotechar='"')
    # Things to check:
    # Duplicate dates
    # Number of words
    # Page Number
    # Missing -- article for each issue
    title_id = None
    title_name = None
    article_count = 0
    article_dates = []
    articles_filtered = []
    odd_pages = []
    short_articles = []
    long_articles = []
    missing_dates = []
    for article in articles:
        article_count += 1
        # Title id (col 5) and name (col 3) repeat on every row;
        # keep the first value seen.
        if not title_id:
            title_id = article[5]
        if not title_name:
            title_name = article[3]
        # Get the date
        try:
            page_number = int(article[7])
        except ValueError:
            # Fall back to the first run of digits in the page field.
            # NOTE(review): raises AttributeError if no digit is present.
            page_number = int(re.search(r'(\d+)', article[7]).group(1))
        article_url = article[2]
        article_title = article[1]
        article_date = parse_date(article[6])
        text = article[10]
        word_count = len(text.split())
        # Check if page number is odd or even
        if page_number % 2:
            # Editorials are expected on even pages; odd pages are suspect.
            odd_pages.append({
                'url': article_url,
                'title': article_title,
                'date': article_date.isoformat(),
                'page': page_number,
                'length': word_count
            })
        else:
            if word_count < 100:
                short_articles.append({
                    'url': article_url,
                    'title': article_title,
                    'date': article_date.isoformat(),
                    'page': page_number,
                    'length': word_count
                })
            elif word_count > 1500:
                long_articles.append({
                    'url': article_url,
                    'title': article_title,
                    'date': article_date.isoformat(),
                    'page': page_number,
                    'length': word_count
                })
            else:
                # Article passed all filters: count its date as "found".
                article_dates.append(article_date.isoformat())
                articles_filtered.append({
                    'url': article_url,
                    'title': article_title,
                    'date': article_date.isoformat(),
                    'page': page_number,
                    'length': word_count
                })
    # Return duplicates
    duplicate_dates = sorted(set(find_duplicates(article_dates)))
    # Remove duplicates
    article_dates = sorted(set(article_dates))
    issues = get_title_issues(title_id, year)
    issue_dates = [issue['date'] for issue in issues]
    # Any issue date with no surviving article is reported as missing.
    for issue_date in issue_dates:
        if not issue_date in article_dates:
            missing_dates.append(issue_date)
    print 'Total articles: %s' % article_count
    print 'Odd pages: %s' % len(odd_pages)
    print 'Short articles: %s' % len(short_articles)
    print 'Long articles: %s' % len(long_articles)
    print 'Duplicate dates: %s' % len(duplicate_dates)
    print 'Found articles: %s' % len(articles_filtered)
    print 'Missing articles: %s' % len(missing_dates)
    results = {
        'id': title_id,
        'name': title_name,
        'odd': odd_pages,
        'short': short_articles,
        'long': long_articles,
        'articles': articles_filtered,
        'duplicates': duplicate_dates,
        'missing': missing_dates,
        'issues': len(issues)
    }
    # Output path: NMA_FOLDER/json/<csv basename minus 4-char extension>.js
    json_file = '%s/json/%s.js' % (NMA_FOLDER, os.path.basename(csv_file)[:-4])
    print json_file
    with open(json_file, 'wb') as json_data:
        json.dump(results, json_data, indent=2)
    return results