Example #1
import cStringIO

from csvkit.unicsv import UnicodeCSVDictWriter


def download_csv():
    """
    Generate a data download.
    """
    f = cStringIO.StringIO()

    writer = UnicodeCSVDictWriter(f, [
        'lobbyist_first_name',
        'lobbyist_last_name',
        'report_period',
        'recipient_name',
        'recipient_type',
        'legislator_first_name',
        'legislator_last_name',
        'legislator_office',
        'legislator_party',
        'legislator_district',
        'event_date',
        'category',
        'description',
        'cost',
        'organization_name',
        'organization_industry',
        'group',
        'ethics_board_id',
        'is_solicitation'
    ])

    writer.writeheader()

    expenditures = Expenditure.select()

    for ex in expenditures:
        row = {
            'lobbyist_first_name': ex.lobbyist.first_name,
            'lobbyist_last_name': ex.lobbyist.last_name,
            'report_period': ex.report_period,
            'recipient_name': ex.recipient,
            'recipient_type': ex.recipient_type,
            'legislator_first_name': ex.legislator.first_name if ex.legislator else None,
            'legislator_last_name': ex.legislator.last_name if ex.legislator else None,
            'legislator_office': ex.legislator.office if ex.legislator else None,
            'legislator_party': ex.legislator.party if ex.legislator else None,
            'legislator_district': ex.legislator.district if ex.legislator else None,
            'event_date': ex.event_date,
            'category': ex.category,
            'description': ex.description,
            'cost': ex.cost,
            'organization_name': ex.organization.name,
            'organization_industry': ex.organization.category,
            'group': ex.group.name if ex.group else None,
            'ethics_board_id': ex.ethics_id,
            'is_solicitation': ex.is_solicitation
        }

        writer.writerow(row)

    return f.getvalue().decode('utf-8')
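
All of the examples on this page follow the same basic pattern: open a byte-oriented buffer or file, construct UnicodeCSVDictWriter with an explicit list of field names, call writeheader(), then feed it one dict per record. A minimal standalone sketch of that pattern (the column names and sample row are made up for illustration, and it assumes csvkit's Python 2 unicsv module):

import cStringIO

from csvkit.unicsv import UnicodeCSVDictWriter

buf = cStringIO.StringIO()
writer = UnicodeCSVDictWriter(buf, ['name', 'cost'])  # hypothetical columns
writer.writeheader()
# values may contain non-ASCII text; the writer encodes them (UTF-8 by default)
writer.writerow({'name': u'Caf\xe9 fundraiser', 'cost': '42.50'})
print buf.getvalue().decode('utf-8')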
Example #2
from csvkit.unicsv import UnicodeCSVDictWriter


def writeCSV(unique_rows, clustered_rows, file_path):
    u_path = '%s-deduped_unique.csv' % file_path
    d_path = '%s-deduped.csv' % file_path
    unique = open(u_path, 'wb')
    writer = UnicodeCSVDictWriter(unique, unique_rows[0].keys())
    writer.writeheader()
    writer.writerows(unique_rows)
    unique.close()
    clusters = open(d_path, 'wb')
    writer = UnicodeCSVDictWriter(clusters, clustered_rows[0].keys())
    writer.writeheader()
    writer.writerows(clustered_rows)
    clusters.close()
    return d_path, u_path
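
For illustration, a hypothetical call to writeCSV: it expects two non-empty lists of dicts sharing the same keys, plus a base path from which the two output filenames are derived:

unique = [{'id': u'1', 'name': u'ACME Corp'}]       # hypothetical rows
clustered = [{'id': u'1', 'name': u'ACME Corp.'}]
d_path, u_path = writeCSV(unique, clustered, 'contributions')
# d_path == 'contributions-deduped.csv'
# u_path == 'contributions-deduped_unique.csv'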

Example #3

def process_csv(csv_data, template):
    # collect_cells() and the field cleaners (name, gender, title_department,
    # hire_date, salary, entity) are helpers defined elsewhere in this project.

    standard_header = [
        'name',
        'gender',
        'title',
        'department',
        'hire_date',
        'salary',
        'entity',
        'type',
        'received_date',
    ]

    if 'entity_name' in template:
        file_name = template['entity_name'].lower().replace(' ', '_')
    else:
        file_name = raw_input('What should we call this file? (xxx.csv)\n')

    with open('{0}-ready.csv'.format(file_name), 'wb') as fo:
        writer = UnicodeCSVDictWriter(fo, standard_header)
        writer.writeheader()

        for row in csv_data['rows']:

            if 'agency' in template:
                entity_entry = entity(collect_cells(row, template['agency']['columns']))
            else:
                entity_entry = template['entity_name']

            writer.writerow({
                'name': name(collect_cells(row, template['name']['columns']), **template['name']['options']),
                'gender': gender(collect_cells(row, template['gender']['columns']), **template['gender']['options']),
                'title': title_department(collect_cells(row, template['title']['columns']), **template['title']['options']),
                'department': title_department(collect_cells(row, template['department']['columns']), **template['department']['options']),
                'hire_date': hire_date(collect_cells(row, template['hire_date']['columns']), **template['hire_date']['options']),
                'salary': salary(collect_cells(row, template['salary']['columns']), **template['salary']['options']),
                'entity': entity_entry,
                'type': template['entity_type'],
                'received_date': template['received_date'],
            })

        sys.stdout.write('File processed.\n')
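
process_csv leans entirely on the shape of its template argument. A hypothetical template showing the keys the function reads (column indexes and option dicts are invented for illustration; 'agency' is optional and only consulted when present):

template = {
    'entity_name': 'Example County',
    'entity_type': 'county',
    'received_date': '2013-05-01',
    'name': {'columns': [0, 1], 'options': {}},
    'gender': {'columns': [2], 'options': {}},
    'title': {'columns': [3], 'options': {}},
    'department': {'columns': [4], 'options': {}},
    'hire_date': {'columns': [5], 'options': {}},
    'salary': {'columns': [6], 'options': {}},
}
process_csv({'rows': raw_rows}, template)  # raw_rows: the parsed source CSV cells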
Example #4
    AWS_KEY = os.environ['AWS_ACCESS_KEY']
    AWS_SECRET = os.environ['AWS_SECRET_KEY']

    inp = StringIO()
    s3_conn = S3Connection(AWS_KEY, AWS_SECRET)
    bucket = s3_conn.get_bucket('il-elections')
    k = Key(bucket)
    k.key = 'Committees.csv'
    k.get_contents_to_file(inp)  # writes Committees.csv into the buffer; returns None
    inp.seek(0)
    reader = UnicodeCSVDictReader(inp)
    comm_ids = [c['id'] for c in list(reader)]

    # Now scrape Officer pages
    officer_pattern = '/CommitteeDetailOfficers.aspx?id=%s'
    officer_scraper = OfficerScraper(url_pattern=officer_pattern)
    officer_scraper.cache_storage = scrapelib.cache.FileCache(cache_dir)
    officer_scraper.cache_write_only = False
    officer_header = ['id', 'committee_id', 'name', 'title', 'address']
    officer_outp = StringIO()
    officer_writer = UnicodeCSVDictWriter(officer_outp, officer_header)
    officer_writer.writeheader()
    officers = []
    for comm_id in comm_ids:
        for officer in officer_scraper.scrape_one(comm_id):
            officer['committee_id'] = comm_id
            officers.append(officer)
    officer_writer.writerows(officers)
    k.key = 'Officers.csv'
    k.set_contents_from_string(officer_outp.getvalue())
    k.make_public()
    if os.path.exists(DB_NAME):
        os.remove(DB_NAME)
    
    report_pattern = '/CommitteeDetail.aspx?id=%s&pageindex=%s'
    report_scraper = ReportScraper(url_pattern=report_pattern)
    report_scraper.cache_storage = scrapelib.cache.FileCache(cache_dir)
    report_scraper.cache_write_only = False
    conn = sqlite3.connect(DB_NAME)
    c = conn.cursor()
    for comm_url in comm_urls:
        for report_data in report_scraper.scrape_one(comm_url):
            comm_id = parse_qs(urlparse(comm_url).query)['id'][0]
            report_data['committee_id'] = comm_id
            outp = StringIO()
            writer = UnicodeCSVDictWriter(outp, fieldnames=report_data.keys())
            writer.writeheader()
            writer.writerow(report_data)
            outp.seek(0)
            t = Table.from_csv(outp, name='reports')
            sql_table = make_table(t)
            try:
                c.execute('select * from reports limit 1')
            except sqlite3.OperationalError:
                create_st = make_create_table_statement(sql_table)
                c.execute(create_st)
                conn.commit()
            c.execute('select * from reports where id = ?', (int(report_data['id']),))
            existing = c.fetchall()
            if not existing:
                insert = sql_table.insert()

Example #7
def download_csv():
    """
    Generate a data download.
    """
    f = cStringIO.StringIO()

    writer = UnicodeCSVDictWriter(f, [
        'lobbyist_first_name', 'lobbyist_last_name', 'report_period',
        'recipient_name', 'recipient_type', 'legislator_first_name',
        'legislator_last_name', 'legislator_office', 'legislator_party',
        'legislator_district', 'event_date', 'category', 'description', 'cost',
        'organization_name', 'organization_industry', 'group',
        'ethics_board_id', 'is_solicitation'
    ])

    writer.writeheader()

    expenditures = Expenditure.select()

    for ex in expenditures:
        row = {
            'lobbyist_first_name': ex.lobbyist.first_name,
            'lobbyist_last_name': ex.lobbyist.last_name,
            'report_period': ex.report_period,
            'recipient_name': ex.recipient,
            'recipient_type': ex.recipient_type,
            'legislator_first_name': ex.legislator.first_name if ex.legislator else None,
            'legislator_last_name': ex.legislator.last_name if ex.legislator else None,
            'legislator_office': ex.legislator.office if ex.legislator else None,
            'legislator_party': ex.legislator.party if ex.legislator else None,
            'legislator_district': ex.legislator.district if ex.legislator else None,
            'event_date': ex.event_date,
            'category': ex.category,
            'description': ex.description,
            'cost': ex.cost,
            'organization_name': ex.organization.name,
            'organization_industry': ex.organization.category,
            'group': ex.group.name if ex.group else None,
            'ethics_board_id': ex.ethics_id,
            'is_solicitation': ex.is_solicitation
        }

        writer.writerow(row)

    return f.getvalue().decode('utf-8')

Example #8
    from cStringIO import StringIO
    import os
    from boto.s3.connection import S3Connection
    from boto.s3.key import Key
    from csvkit.unicsv import UnicodeCSVDictWriter

    AWS_KEY = os.environ['AWS_ACCESS_KEY']
    AWS_SECRET = os.environ['AWS_SECRET_KEY']

    url_pattern = '/CommitteeDetail.aspx?id=%s'
    string_on_page = 'ctl00_ContentPlaceHolder1_CommitteeResultsLayout'
    comm_scraper = CommitteeScraper(url_pattern=url_pattern, string_on_page=string_on_page)
    # comm_scraper.cache_storage = scrapelib.cache.FileCache('cache')
    # comm_scraper.cache_write_only = False
    committees = []
    comms_outp = StringIO()
    comm_header = ['id', 'name', 'type', 'url', 'address', 'status', 'purpose', 'state_id', 'local_id', 'creation_date']
    comm_writer = UnicodeCSVDictWriter(comms_outp, comm_header, delimiter='\t')
    comm_writer.writeheader()
    for committee in comm_scraper.scrape_all():
        # Save to DB and maybe write as JSON?
        committees.append(committee)
    comm_writer.writerows(committees)
    s3_conn = S3Connection(AWS_KEY, AWS_SECRET)
    bucket = s3_conn.get_bucket('il-elections')
    k = Key(bucket)
    k.key = 'Committees.tsv'
    k.set_contents_from_string(comms_outp.getvalue())
    k.make_public()

Example #9
    from cStringIO import StringIO
    import os
    from boto.s3.connection import S3Connection
    from boto.s3.key import Key
    from csvkit.unicsv import UnicodeCSVDictWriter

    AWS_KEY = os.environ['AWS_ACCESS_KEY']
    AWS_SECRET = os.environ['AWS_SECRET_KEY']

    url_pattern = '/CommitteeDetail.aspx?id=%s'
    string_on_page = 'ctl00_ContentPlaceHolder1_CommitteeResultsLayout'
    comm_scraper = CommitteeScraper(url_pattern=url_pattern,
                                    string_on_page=string_on_page)
    # comm_scraper.cache_storage = scrapelib.cache.FileCache('cache')
    # comm_scraper.cache_write_only = False
    committees = []
    comms_outp = StringIO()
    comm_header = [
        'id', 'name', 'type', 'url', 'address', 'status', 'purpose',
        'state_id', 'local_id', 'creation_date'
    ]
    comm_writer = UnicodeCSVDictWriter(comms_outp, comm_header, delimiter='\t')
    comm_writer.writeheader()
    for committee in comm_scraper.scrape_all():
        # Save to DB and maybe write as JSON?
        committees.append(committee)
    comm_writer.writerows(committees)
    s3_conn = S3Connection(AWS_KEY, AWS_SECRET)
    bucket = s3_conn.get_bucket('il-elections')
    k = Key(bucket)
    k.key = 'Committees.tsv'
    k.set_contents_from_string(comms_outp.getvalue())
    k.make_public()
    if os.path.exists(DB_NAME):
        os.remove(DB_NAME)

    report_pattern = '/CommitteeDetail.aspx?id=%s&pageindex=%s'
    report_scraper = ReportScraper(url_pattern=report_pattern)
    report_scraper.cache_storage = scrapelib.cache.FileCache(cache_dir)
    report_scraper.cache_write_only = False
    conn = sqlite3.connect(DB_NAME)
    c = conn.cursor()
    for comm_url in comm_urls:
        for report_data in report_scraper.scrape_one(comm_url):
            comm_id = parse_qs(urlparse(comm_url).query)['id'][0]
            report_data['committee_id'] = comm_id
            outp = StringIO()
            writer = UnicodeCSVDictWriter(outp, fieldnames=report_data.keys())
            writer.writeheader()
            writer.writerow(report_data)
            outp.seek(0)
            t = Table.from_csv(outp, name='reports')
            sql_table = make_table(t)
            try:
                c.execute('select * from reports limit 1')
            except sqlite3.OperationalError:
                create_st = make_create_table_statement(sql_table)
                c.execute(create_st)
                conn.commit()
            c.execute('select * from reports where id = ?',
                      (int(report_data['id']), ))
            existing = c.fetchall()
            if not existing:
                insert = sql_table.insert()