Example #1
from csvkit.unicsv import UnicodeCSVDictWriter

def writeCSV(unique_rows, clustered_rows, file_path):
    # Write the dedupe output alongside the input file: one CSV of
    # unique rows and one of clustered (matched) rows. Both arguments
    # are lists of dicts that share the same keys.
    u_path = '%s-deduped_unique.csv' % file_path
    d_path = '%s-deduped.csv' % file_path
    unique = open(u_path, 'wb')
    writer = UnicodeCSVDictWriter(unique, unique_rows[0].keys())
    writer.writeheader()
    writer.writerows(unique_rows)
    unique.close()
    clusters = open(d_path, 'wb')
    writer = UnicodeCSVDictWriter(clusters, clustered_rows[0].keys())
    writer.writeheader()
    writer.writerows(clustered_rows)
    clusters.close()
    return d_path, u_path
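
A minimal call sketch with hypothetical values, assuming both arguments are lists of dicts that share the same keys:

    unique_rows = [{'id': u'1', 'name': u'ACME Corp'}]
    clustered_rows = [{'id': u'2', 'name': u'ACME Corporation'}]
    deduped_path, unique_path = writeCSV(unique_rows, clustered_rows, 'input.csv')
    # Writes 'input.csv-deduped.csv' and 'input.csv-deduped_unique.csv'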
Example #2
    from cStringIO import StringIO
    import os

    import scrapelib.cache
    from boto.s3.connection import S3Connection
    from boto.s3.key import Key
    from csvkit.unicsv import (UnicodeCSVDictReader, UnicodeCSVDictWriter,
                               UnicodeCSVWriter)
    from dateutil import parser

    AWS_KEY = os.environ['AWS_ACCESS_KEY']
    AWS_SECRET = os.environ['AWS_SECRET_KEY']

    # Pull the committee IDs out of the Committees.csv file that an
    # earlier stage of the scrape uploaded to S3.
    inp = StringIO()
    s3_conn = S3Connection(AWS_KEY, AWS_SECRET)
    bucket = s3_conn.get_bucket('il-elections')
    k = Key(bucket)
    k.key = 'Committees.csv'
    k.get_contents_to_file(inp)
    inp.seek(0)
    reader = UnicodeCSVDictReader(inp)
    comm_ids = [c['id'] for c in reader]

    # Now scrape Officer pages
    officer_pattern = '/CommitteeDetailOfficers.aspx?id=%s'
    officer_scraper = OfficerScraper(url_pattern=officer_pattern)
    # Reuse cached responses from earlier runs instead of re-fetching;
    # cache_dir is defined earlier in the original script.
    officer_scraper.cache_storage = scrapelib.cache.FileCache(cache_dir)
    officer_scraper.cache_write_only = False
    officer_header = ['id', 'committee_id', 'name', 'title', 'address']
    officer_outp = StringIO()
    officer_writer = UnicodeCSVDictWriter(officer_outp, officer_header)
    officer_writer.writeheader()
    officers = []
    for comm_id in comm_ids:
        for officer in officer_scraper.scrape_one(comm_id):
            officer['committee_id'] = comm_id
            officers.append(officer)
    officer_writer.writerows(officers)
    k.key = 'Officers.csv'
    k.set_contents_from_string(officer_outp.getvalue())
    k.make_public()
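
The shape of each officer record is implied by officer_header; scrape_one is assumed to yield dicts with these keys (committee_id is filled in by the loop above). A purely illustrative row:

    officer = {
        'id': u'12345',
        'committee_id': u'665',
        'name': u'Jane Q. Public',
        'title': u'Treasurer',
        'address': u'123 W Example St, Springfield, IL',
    }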

    # This block sits inside a loop over scraped reports in the original
    # script: a SELECT (presumably keyed on the report's detail_url) runs
    # just before it and its rows land in `existing`. `c` and `conn` are
    # a sqlite3 cursor and connection, `sql_table` a SQLAlchemy table
    # built by csvkit, and `t` a csvkit Table holding the report's rows.
            existing = c.fetchall()
            if not existing:
                insert = sql_table.insert()
                headers = t.headers()
                rows = [dict(zip(headers, row)) for row in t.to_rows()]
                for row in rows:
                    c.execute(str(insert), row)
                conn.commit()
            else:
                print 'Already saved report %s' % report_data['detail_url']
    # Work out the span of years covered by the reports table.
    c.execute('select date_filed from reports order by date_filed limit 1')
    oldest_year = parser.parse(c.fetchone()[0]).year
    c.execute('select date_filed from reports order by date_filed desc limit 1')
    newest_year = parser.parse(c.fetchone()[0]).year
    c.execute('select * from reports limit 1')
    header = [col[0] for col in c.description]
    # Write one CSV of reports per year and upload each to S3.
    for year in range(oldest_year, newest_year + 1):
        oldest_date = '%s-01-01' % year
        newest_date = '%s-12-31' % year
        c.execute('select * from reports where date_filed >= ? and date_filed <= ?', (oldest_date, newest_date))
        rows = c.fetchall()
        outp = StringIO()
        writer = UnicodeCSVWriter(outp)
        writer.writerow(header)
        writer.writerows(rows)
        outp.seek(0)
        k.key = 'Reports/%s.csv' % year
        k.set_contents_from_file(outp)
        k.make_public()
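
A minimal read-back sketch for one of the per-year files, assuming the same bucket and Key object as above (the year is hypothetical):

    k.key = 'Reports/2012.csv'
    csv_text = k.get_contents_as_string()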

Example #6
    from cStringIO import StringIO
    import os

    from boto.s3.connection import S3Connection
    from boto.s3.key import Key
    from csvkit.unicsv import UnicodeCSVDictWriter

    AWS_KEY = os.environ['AWS_ACCESS_KEY']
    AWS_SECRET = os.environ['AWS_SECRET_KEY']

    # CommitteeScraper is defined elsewhere in the original module; it
    # walks the committee detail pages, with string_on_page presumably
    # used to verify that each page actually loaded.
    url_pattern = '/CommitteeDetail.aspx?id=%s'
    string_on_page = 'ctl00_ContentPlaceHolder1_CommitteeResultsLayout'
    comm_scraper = CommitteeScraper(url_pattern=url_pattern,
                                    string_on_page=string_on_page)
    # comm_scraper.cache_storage = scrapelib.cache.FileCache('cache')
    # comm_scraper.cache_write_only = False
    committees = []
    comms_outp = StringIO()
    comm_header = [
        'id', 'name', 'type', 'url', 'address', 'status', 'purpose',
        'state_id', 'local_id', 'creation_date'
    ]
    comm_writer = UnicodeCSVDictWriter(comms_outp, comm_header, delimiter='\t')
    comm_writer.writeheader()
    for committee in comm_scraper.scrape_all():
        # Save to DB and maybe write as JSON?
        committees.append(committee)
    comm_writer.writerows(committees)
    # Upload the finished TSV to S3 and make it publicly readable.
    s3_conn = S3Connection(AWS_KEY, AWS_SECRET)
    bucket = s3_conn.get_bucket('il-elections')
    k = Key(bucket)
    k.key = 'Committees.tsv'
    k.set_contents_from_string(comms_outp.getvalue())
    k.make_public()
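
A minimal sketch of reading the uploaded TSV back, assuming the bucket layout above; UnicodeCSVDictReader passes extra keyword arguments (like delimiter) through to the underlying csv reader:

    from csvkit.unicsv import UnicodeCSVDictReader

    inp = StringIO()
    k.key = 'Committees.tsv'
    k.get_contents_to_file(inp)
    inp.seek(0)
    for row in UnicodeCSVDictReader(inp, delimiter='\t'):
        print row['name']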