def writeCSV(unique_rows, clustered_rows, file_path):
    """Write deduplication output to two CSV files derived from *file_path*.

    :param unique_rows: list of dicts, one per unique record; assumed
        non-empty, with the first row's keys used as the CSV header
    :param clustered_rows: list of dicts for the clustered (deduped)
        records; same non-empty assumption
    :param file_path: base path the output filenames are built from
    :returns: ``(deduped_path, unique_path)`` — the two files written
    """
    u_path = '%s-deduped_unique.csv' % file_path
    d_path = '%s-deduped.csv' % file_path
    # 'with' guarantees each handle is closed even when a write raises;
    # the original open()/close() pairs leaked the file on error.
    with open(u_path, 'wb') as unique:
        writer = UnicodeCSVDictWriter(unique, unique_rows[0].keys())
        writer.writeheader()
        writer.writerows(unique_rows)
    with open(d_path, 'wb') as clusters:
        writer = UnicodeCSVDictWriter(clusters, clustered_rows[0].keys())
        writer.writeheader()
        writer.writerows(clustered_rows)
    return d_path, u_path
# Pull the previously uploaded committee list down from S3 to learn which
# committee ids need their officer pages scraped.
AWS_SECRET = os.environ['AWS_SECRET_KEY']
inp = StringIO()
s3_conn = S3Connection(AWS_KEY, AWS_SECRET)
bucket = s3_conn.get_bucket('il-elections')
k = Key(bucket)
k.key = 'Committees.csv'
committee_file = k.get_contents_to_file(inp)  # NOTE(review): boto returns None here; value is unused
inp.seek(0)  # rewind so the CSV reader sees the downloaded bytes from the start
reader = UnicodeCSVDictReader(inp)
comm_ids = [c['id'] for c in list(reader)]

# Now scrape Officer pages
officer_pattern = '/CommitteeDetailOfficers.aspx?id=%s'
officer_scraper = OfficerScraper(url_pattern=officer_pattern)
# Serve repeat requests from the on-disk cache instead of re-fetching.
officer_scraper.cache_storage = scrapelib.cache.FileCache(cache_dir)
officer_scraper.cache_write_only = False
officer_header = ['id', 'committee_id', 'name', 'title', 'address']
officer_outp = StringIO()
officer_writer = UnicodeCSVDictWriter(officer_outp, officer_header)
officer_writer.writeheader()
officers = []
for comm_id in comm_ids:
    # scrape_one presumably yields one dict per officer on the page -- TODO confirm
    for officer in officer_scraper.scrape_one(comm_id):
        officer['committee_id'] = comm_id  # tag each officer with its committee
        officers.append(officer)
officer_writer.writerows(officers)

# Upload the assembled CSV back to the same bucket, world-readable.
k.key = 'Officers.csv'
k.set_contents_from_string(officer_outp.getvalue())
k.make_public()
existing = c.fetchall() if not existing: insert = sql_table.insert() headers = t.headers() rows = [dict(zip(headers, row)) for row in t.to_rows()] for row in rows: c.execute(str(insert), row) conn.commit() else: print 'Already saved report %s' % report_data['detail_url'] c.execute('select date_filed from reports order by date_filed limit 1') oldest_year = parser.parse(c.fetchone()[0]).year c.execute('select date_filed from reports order by date_filed desc limit 1') newest_year = parser.parse(c.fetchone()[0]).year c.execute('select * from reports limit 1') header = list(map(lambda x: x[0], c.description)) for year in range(oldest_year, newest_year + 1): oldest_date = '%s-01-01' % year newest_date = '%s-12-31' % year c.execute('select * from reports where date_filed >= ? and date_filed <= ?', (oldest_date, newest_date)) rows = c.fetchall() outp = StringIO() writer = UnicodeCSVWriter(outp) writer.writerow(header) writer.writerows(rows) outp.seek(0) k.key = 'Reports/%s.csv' % year k.set_contents_from_file(outp) k.make_public()
# Download the committee list from S3 to learn which committee ids need
# their officer pages scraped.
inp = StringIO()
s3_conn = S3Connection(AWS_KEY, AWS_SECRET)
bucket = s3_conn.get_bucket('il-elections')
k = Key(bucket)
k.key = 'Committees.csv'
committee_file = k.get_contents_to_file(inp)  # NOTE(review): boto returns None here; value is unused
inp.seek(0)  # rewind so the CSV reader sees the downloaded bytes from the start
reader = UnicodeCSVDictReader(inp)
comm_ids = [c['id'] for c in list(reader)]

# Now scrape Officer pages
officer_pattern = '/CommitteeDetailOfficers.aspx?id=%s'
officer_scraper = OfficerScraper(url_pattern=officer_pattern)
# Serve repeat requests from the on-disk cache instead of re-fetching.
officer_scraper.cache_storage = scrapelib.cache.FileCache(cache_dir)
officer_scraper.cache_write_only = False
officer_header = ['id', 'committee_id', 'name', 'title', 'address']
officer_outp = StringIO()
officer_writer = UnicodeCSVDictWriter(officer_outp, officer_header)
officer_writer.writeheader()
officers = []
for comm_id in comm_ids:
    # scrape_one presumably yields one dict per officer on the page -- TODO confirm
    for officer in officer_scraper.scrape_one(comm_id):
        officer['committee_id'] = comm_id  # tag each officer with its committee
        officers.append(officer)
officer_writer.writerows(officers)

# Upload the assembled CSV back to the same bucket, world-readable.
k.key = 'Officers.csv'
k.set_contents_from_string(officer_outp.getvalue())
k.make_public()
from cStringIO import StringIO
import os
from boto.s3.connection import S3Connection
from boto.s3.key import Key
from csvkit.unicsv import UnicodeCSVDictWriter

# S3 credentials come from the environment; a KeyError here means they
# are not set.
AWS_KEY = os.environ['AWS_ACCESS_KEY']
AWS_SECRET = os.environ['AWS_SECRET_KEY']

# Scrape every committee detail page and upload the results to S3 as a
# tab-separated file.  CommitteeScraper is presumably imported elsewhere
# in this file -- not visible in this chunk.
url_pattern = '/CommitteeDetail.aspx?id=%s'
# Marker string used to detect that a fetched page is a real detail page.
string_on_page = 'ctl00_ContentPlaceHolder1_CommitteeResultsLayout'
comm_scraper = CommitteeScraper(url_pattern=url_pattern, string_on_page=string_on_page)
# Cache toggles left disabled on purpose, it seems -- enable for local reruns.
# comm_scraper.cache_storage = scrapelib.cache.FileCache('cache')
# comm_scraper.cache_write_only = False
committees = []
comms_outp = StringIO()
comm_header = ['id', 'name', 'type', 'url', 'address', 'status', 'purpose', 'state_id', 'local_id', 'creation_date']
# Tab delimiter to match the .tsv key uploaded below.
comm_writer = UnicodeCSVDictWriter(comms_outp, comm_header, delimiter='\t')
comm_writer.writeheader()
for committee in comm_scraper.scrape_all():
    # Save to DB and maybe write as JSON?
    committees.append(committee)
comm_writer.writerows(committees)

# Upload the TSV to the bucket, world-readable.
s3_conn = S3Connection(AWS_KEY, AWS_SECRET)
bucket = s3_conn.get_bucket('il-elections')
k = Key(bucket)
k.key = 'Committees.tsv'
k.set_contents_from_string(comms_outp.getvalue())
k.make_public()
from boto.s3.key import Key
from csvkit.unicsv import UnicodeCSVDictWriter

# S3 credentials come from the environment; a KeyError here means they
# are not set.  (os and S3Connection are imported earlier in the file.)
AWS_KEY = os.environ['AWS_ACCESS_KEY']
AWS_SECRET = os.environ['AWS_SECRET_KEY']

# Scrape every committee detail page and upload the results to S3 as a
# tab-separated file.
url_pattern = '/CommitteeDetail.aspx?id=%s'
# Marker string used to detect that a fetched page is a real detail page.
string_on_page = 'ctl00_ContentPlaceHolder1_CommitteeResultsLayout'
comm_scraper = CommitteeScraper(url_pattern=url_pattern, string_on_page=string_on_page)
# Cache toggles left disabled on purpose, it seems -- enable for local reruns.
# comm_scraper.cache_storage = scrapelib.cache.FileCache('cache')
# comm_scraper.cache_write_only = False
committees = []
comms_outp = StringIO()
comm_header = [
    'id', 'name', 'type', 'url', 'address', 'status', 'purpose',
    'state_id', 'local_id', 'creation_date'
]
# Tab delimiter to match the .tsv key uploaded below.
comm_writer = UnicodeCSVDictWriter(comms_outp, comm_header, delimiter='\t')
comm_writer.writeheader()
for committee in comm_scraper.scrape_all():
    # Save to DB and maybe write as JSON?
    committees.append(committee)
comm_writer.writerows(committees)

# Upload the TSV to the bucket, world-readable.
s3_conn = S3Connection(AWS_KEY, AWS_SECRET)
bucket = s3_conn.get_bucket('il-elections')
k = Key(bucket)
k.key = 'Committees.tsv'
k.set_contents_from_string(comms_outp.getvalue())
k.make_public()
    # Continuation of an `if not existing:` branch whose header sits above
    # this chunk (out of view); these statements bulk-insert the parsed
    # report rows before the per-year export below.
    insert = sql_table.insert()
    headers = t.headers()
    rows = [dict(zip(headers, row)) for row in t.to_rows()]
    for row in rows:
        c.execute(str(insert), row)
    conn.commit()
else:
    print 'Already saved report %s' % report_data['detail_url']

# Determine the span of years covered by the reports table.
# NOTE(review): date_filed is ordered as text -- this is only correct if
# the stored dates are ISO formatted (YYYY-MM-DD...); confirm.
c.execute('select date_filed from reports order by date_filed limit 1')
oldest_year = parser.parse(c.fetchone()[0]).year
c.execute(
    'select date_filed from reports order by date_filed desc limit 1')
newest_year = parser.parse(c.fetchone()[0]).year

# Column names for the CSV header, taken from the cursor description.
c.execute('select * from reports limit 1')
header = list(map(lambda x: x[0], c.description))

# Publish one CSV per calendar year to S3 under Reports/<year>.csv.
for year in range(oldest_year, newest_year + 1):
    oldest_date = '%s-01-01' % year
    newest_date = '%s-12-31' % year
    c.execute(
        'select * from reports where date_filed >= ? and date_filed <= ?',
        (oldest_date, newest_date))
    rows = c.fetchall()
    outp = StringIO()
    writer = UnicodeCSVWriter(outp)
    writer.writerow(header)
    writer.writerows(rows)
    outp.seek(0)  # rewind before handing the buffer to boto
    k.key = 'Reports/%s.csv' % year
    k.set_contents_from_file(outp)
    k.make_public()