import cStringIO

from csvkit.unicsv import UnicodeCSVDictWriter

# Expenditure (with its related Lobbyist, Legislator, Organization and Group
# models) is assumed to be the app's ORM model, defined elsewhere.


def download_csv():
    """ Generate a data download. """
    f = cStringIO.StringIO()

    writer = UnicodeCSVDictWriter(f, [
        'lobbyist_first_name',
        'lobbyist_last_name',
        'report_period',
        'recipient_name',
        'recipient_type',
        'legislator_first_name',
        'legislator_last_name',
        'legislator_office',
        'legislator_party',
        'legislator_district',
        'event_date',
        'category',
        'description',
        'cost',
        'organization_name',
        'organization_industry',
        'group',
        'ethics_board_id',
        'is_solicitation'
    ])

    writer.writeheader()

    expenditures = Expenditure.select()

    for ex in expenditures:
        row = {
            'lobbyist_first_name': ex.lobbyist.first_name,
            'lobbyist_last_name': ex.lobbyist.last_name,
            'report_period': ex.report_period,
            'recipient_name': ex.recipient,
            'recipient_type': ex.recipient_type,
            'legislator_first_name': ex.legislator.first_name if ex.legislator else None,
            'legislator_last_name': ex.legislator.last_name if ex.legislator else None,
            'legislator_office': ex.legislator.office if ex.legislator else None,
            'legislator_party': ex.legislator.party if ex.legislator else None,
            'legislator_district': ex.legislator.district if ex.legislator else None,
            'event_date': ex.event_date,
            'category': ex.category,
            'description': ex.description,
            'cost': ex.cost,
            'organization_name': ex.organization.name,
            'organization_industry': ex.organization.category,
            'group': ex.group.name if ex.group else None,
            'ethics_board_id': ex.ethics_id,
            'is_solicitation': ex.is_solicitation
        }

        writer.writerow(row)

    return f.getvalue().decode('utf-8')
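One plausible way to serve the download, assuming this lives in a Flask app; the app object, route and filename below are hypothetical, not part of the original:

from flask import Flask, make_response

app = Flask(__name__)

@app.route('/expenditures.csv')
def expenditures_csv():
    # download_csv() returns the whole file as a unicode string
    resp = make_response(download_csv())
    resp.headers['Content-Type'] = 'text/csv; charset=utf-8'
    resp.headers['Content-Disposition'] = 'attachment; filename=expenditures.csv'
    return resp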
def writeCSV(unique_rows, clustered_rows, file_path):
    u_path = '%s-deduped_unique.csv' % file_path
    d_path = '%s-deduped.csv' % file_path

    unique = open(u_path, 'wb')
    writer = UnicodeCSVDictWriter(unique, unique_rows[0].keys())
    writer.writeheader()
    writer.writerows(unique_rows)
    unique.close()

    clusters = open(d_path, 'wb')
    writer = UnicodeCSVDictWriter(clusters, clustered_rows[0].keys())
    writer.writeheader()
    writer.writerows(clustered_rows)
    clusters.close()

    return d_path, u_path
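A hypothetical call, for illustration; the field names and the 'output/companies' prefix are made up:

unique_rows = [
    {'id': u'1', 'name': u'Acme Corp'},
    {'id': u'2', 'name': u'Globex Inc'},
]
clustered_rows = [
    {'id': u'1', 'name': u'Acme Corp', 'cluster_id': u'0'},
    {'id': u'3', 'name': u'ACME Corp.', 'cluster_id': u'0'},
]

d_path, u_path = writeCSV(unique_rows, clustered_rows, 'output/companies')
# -> ('output/companies-deduped.csv', 'output/companies-deduped_unique.csv')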
import sys

from csvkit.unicsv import UnicodeCSVDictWriter

# name, gender, title_department, hire_date, salary, entity and collect_cells
# are cleaning helpers assumed to be defined elsewhere in this module.


def process_csv(csv_data, template):
    standard_header = [
        'name',
        'gender',
        'title',
        'department',
        'hire_date',
        'salary',
        'entity',
        'type',
        'received_date',
    ]

    if 'entity_name' in template:
        file_name = template['entity_name'].lower().replace(' ', '_')
    else:
        file_name = raw_input('What should we call this file? (xxx.csv)\n')

    with open('{0}-ready.csv'.format(file_name), 'wb') as fo:
        writer = UnicodeCSVDictWriter(fo, standard_header)
        writer.writeheader()

        for row in csv_data['rows']:
            if 'agency' in template:
                entity_entry = entity(collect_cells(row, template['agency']['columns']))
            else:
                entity_entry = template['entity_name']

            writer.writerow({
                'name': name(collect_cells(row, template['name']['columns']),
                             **template['name']['options']),
                'gender': gender(collect_cells(row, template['gender']['columns']),
                                 **template['gender']['options']),
                'title': title_department(collect_cells(row, template['title']['columns']),
                                          **template['title']['options']),
                'department': title_department(collect_cells(row, template['department']['columns']),
                                               **template['department']['options']),
                'hire_date': hire_date(collect_cells(row, template['hire_date']['columns']),
                                       **template['hire_date']['options']),
                'salary': salary(collect_cells(row, template['salary']['columns']),
                                 **template['salary']['options']),
                'entity': entity_entry,
                'type': template['entity_type'],
                'received_date': template['received_date'],
            })

    sys.stdout.write('File processed.\n')
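The template argument maps source-sheet columns onto the standard header. A sketch of what one might look like; the column indexes, option names and values here are invented for illustration:

template = {
    'entity_name': 'Cook County',
    'entity_type': 'county',
    'received_date': '2013-05-01',
    'name': {'columns': [0, 1], 'options': {}},
    'gender': {'columns': [2], 'options': {}},
    # title and department share a source column, split by an option
    'title': {'columns': [3], 'options': {'side': 'title'}},
    'department': {'columns': [3], 'options': {'side': 'department'}},
    'hire_date': {'columns': [4], 'options': {}},
    'salary': {'columns': [5], 'options': {}},
}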
import os

from cStringIO import StringIO

import scrapelib
from boto.s3.connection import S3Connection
from boto.s3.key import Key
from csvkit.unicsv import UnicodeCSVDictReader, UnicodeCSVDictWriter

# OfficerScraper and cache_dir are assumed to be defined earlier in this script.

AWS_KEY = os.environ['AWS_ACCESS_KEY']
AWS_SECRET = os.environ['AWS_SECRET_KEY']

inp = StringIO()

s3_conn = S3Connection(AWS_KEY, AWS_SECRET)
bucket = s3_conn.get_bucket('il-elections')
k = Key(bucket)
k.key = 'Committees.csv'

# get_contents_to_file leaves the buffer positioned at the end,
# so rewind before handing it to the reader.
committee_file = k.get_contents_to_file(inp)
inp.seek(0)

reader = UnicodeCSVDictReader(inp)
comm_ids = [c['id'] for c in list(reader)]

# Now scrape Officer pages
officer_pattern = '/CommitteeDetailOfficers.aspx?id=%s'
officer_scraper = OfficerScraper(url_pattern=officer_pattern)
officer_scraper.cache_storage = scrapelib.cache.FileCache(cache_dir)
officer_scraper.cache_write_only = False

officer_header = ['id', 'committee_id', 'name', 'title', 'address']

officer_outp = StringIO()
officer_writer = UnicodeCSVDictWriter(officer_outp, officer_header)
officer_writer.writeheader()

officers = []
for comm_id in comm_ids:
    for officer in officer_scraper.scrape_one(comm_id):
        officer['committee_id'] = comm_id
        officers.append(officer)

officer_writer.writerows(officers)

k.key = 'Officers.csv'
k.set_contents_from_string(officer_outp.getvalue())
k.make_public()
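Since the committee file fits in memory anyway, the same fetch could be done with boto's get_contents_as_string, skipping the seek bookkeeping; a sketch:

committee_csv = k.get_contents_as_string()
reader = UnicodeCSVDictReader(StringIO(committee_csv))
comm_ids = [c['id'] for c in reader]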
import os
import sqlite3

from cStringIO import StringIO
from urlparse import parse_qs, urlparse

import scrapelib
from csvkit.table import Table
from csvkit.sql import make_table, make_create_table_statement
from csvkit.unicsv import UnicodeCSVDictWriter

# ReportScraper, DB_NAME, cache_dir and comm_urls are assumed to be
# defined earlier in this script.

if os.path.exists(DB_NAME):
    os.remove(DB_NAME)

report_pattern = '/CommitteeDetail.aspx?id=%s&pageindex=%s'
report_scraper = ReportScraper(url_pattern=report_pattern)
report_scraper.cache_storage = scrapelib.cache.FileCache(cache_dir)
report_scraper.cache_write_only = False

conn = sqlite3.connect(DB_NAME)
c = conn.cursor()

for comm_url in comm_urls:
    for report_data in report_scraper.scrape_one(comm_url):
        comm_id = parse_qs(urlparse(comm_url).query)['id'][0]
        report_data['committee_id'] = comm_id

        # Round-trip the row through csvkit so it can infer column
        # types for the CREATE TABLE statement.
        outp = StringIO()
        writer = UnicodeCSVDictWriter(outp, fieldnames=report_data.keys())
        writer.writeheader()
        writer.writerow(report_data)
        outp.seek(0)

        t = Table.from_csv(outp, name='reports')
        sql_table = make_table(t)

        # Create the reports table on first contact.
        try:
            c.execute('select * from reports limit 1')
        except sqlite3.OperationalError:
            create_st = make_create_table_statement(sql_table)
            c.execute(create_st)
            conn.commit()

        c.execute('select * from reports where id = ?', (int(report_data['id']),))
        existing = c.fetchall()

        if not existing:
            insert = sql_table.insert()
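The excerpt stops right after building the insert statement. A hedged sketch of one way the branch might be finished, reusing the sqlite3 cursor already in scope instead of the SQLAlchemy-style statement; relying on report_data's key/value ordering is an assumption:

        if not existing:
            columns = ', '.join(report_data.keys())
            placeholders = ', '.join(['?'] * len(report_data))
            c.execute(
                'insert into reports (%s) values (%s)' % (columns, placeholders),
                report_data.values()
            )
            conn.commit()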
from cStringIO import StringIO
import os

from boto.s3.connection import S3Connection
from boto.s3.key import Key
from csvkit.unicsv import UnicodeCSVDictWriter

# CommitteeScraper is assumed to be defined elsewhere in this scraper module.

AWS_KEY = os.environ['AWS_ACCESS_KEY']
AWS_SECRET = os.environ['AWS_SECRET_KEY']

url_pattern = '/CommitteeDetail.aspx?id=%s'
string_on_page = 'ctl00_ContentPlaceHolder1_CommitteeResultsLayout'
comm_scraper = CommitteeScraper(url_pattern=url_pattern,
                                string_on_page=string_on_page)
# comm_scraper.cache_storage = scrapelib.cache.FileCache('cache')
# comm_scraper.cache_write_only = False

committees = []
comms_outp = StringIO()
comm_header = [
    'id',
    'name',
    'type',
    'url',
    'address',
    'status',
    'purpose',
    'state_id',
    'local_id',
    'creation_date'
]
comm_writer = UnicodeCSVDictWriter(comms_outp, comm_header, delimiter='\t')
comm_writer.writeheader()

for committee in comm_scraper.scrape_all():
    # Save to DB and maybe write as JSON?
    committees.append(committee)

comm_writer.writerows(committees)

s3_conn = S3Connection(AWS_KEY, AWS_SECRET)
bucket = s3_conn.get_bucket('il-elections')
k = Key(bucket)
k.key = 'Committees.tsv'
k.set_contents_from_string(comms_outp.getvalue())
k.make_public()
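For a quick sanity check before uploading, the buffer can be read back with the matching reader. A sketch; it assumes UnicodeCSVDictReader passes the delimiter keyword through to the underlying csv reader, as the writer does:

from csvkit.unicsv import UnicodeCSVDictReader

comms_outp.seek(0)
reader = UnicodeCSVDictReader(comms_outp, delimiter='\t')
for committee in reader:
    print committee['id'], committee['name']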