Example #1
def makeRawTable(contents):
    inp = StringIO(contents)
    reader = UnicodeCSVReader(inp)
    header = reader.next()
    header = [slugify(h) for h in header]
    outp = StringIO()
    writer = UnicodeCSVWriter(outp)
    writer.writerow(header)
    writer.writerows([[preProcess(unicode(i)) for i in r] for r in reader])
    outp.seek(0)
    conn = sqlite3.connect(':memory:')
    t = Table.from_csv(outp, 
                       name='raw_table', 
                       blanks_as_nulls=False, 
                       infer_types=False)
    sql_table = make_table(t)
    create_st = make_create_table_statement(sql_table)
    parts = create_st.split('raw_table (')
    create_st = '{0} raw_table ( record_id INTEGER PRIMARY KEY,{1}'.format(*parts)
    insert = sql_table.insert()
    curs = conn.cursor()
    curs.execute(create_st)
    rows = [dict(zip(header, row)) for row in t.to_rows()]
    for row in rows:
        curs.execute(str(insert), row)
    dump = StringIO()
    for line in conn.iterdump():
        dump.write(unidecode(line))
    dump.seek(0)
    return dump.getvalue(), header
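The dump at the end relies on sqlite3's iterdump(), which serializes the whole in-memory database, schema and rows, as SQL statements. A minimal sketch of that step on its own (the table and data here are hypothetical):

import sqlite3

conn = sqlite3.connect(':memory:')
conn.execute('CREATE TABLE raw_table (record_id INTEGER PRIMARY KEY, name TEXT)')
conn.execute("INSERT INTO raw_table (name) VALUES ('example')")
# iterdump() yields the dump as a sequence of SQL statements
for line in conn.iterdump():
    print line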
Example #2
def clean(f):
    reader = UnicodeCSVReader(f)
    good = []
    bad = []
    header = reader.next()
    for row in reader:
        try:
            row[0] = int(row[0])
            row[3] = int(row[3])
            row[5] = int(row[5])
            row[7] = int(row[7])
            row[4] = row[4].replace(',', '')
            if len(row) == 12:
                good.append(row)
            else:
                bad.append(row)
        except (TypeError, ValueError):
            bad.append(row)
    goodf = open('data/trips_cleaned.csv', 'wb')
    badf = open('data/trips_dirty.csv', 'wb')
    goodwriter = UnicodeCSVWriter(goodf)
    goodwriter.writerow(header)
    goodwriter.writerows(good)
    badwriter = UnicodeCSVWriter(badf)
    badwriter.writerow(header)
    badwriter.writerows(bad)
    goodf.close()
    badf.close()
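A usage sketch for clean(), assuming an input CSV whose columns match the hardcoded indices above; the path is hypothetical:

# hypothetical driver; expects a 12-column trips CSV
with open('data/trips.csv', 'rb') as f:
    clean(f)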
Example #3
def _transform(self):
    reader = UnicodeCSVReader(self.station_raw_info)
    header = ['wban_code', 'station_name', 'country',
              'state', 'call_sign', 'location', 'elevation',
              'begin', 'end']
    reader.next()
    self.clean_station_info = StringIO()
    all_rows = []
    wbans = []
    for row in reader:
        # '99999' is the placeholder for a missing WBAN code; skip those rows
        if row[1] == '99999':
            continue
        # skip WBAN codes that have already been seen
        elif row[1] in wbans:
            continue
        elif row[5] and row[6]:
            row.pop(0)
            row.pop(3)
            # strip the explicit '+' sign from coordinates and elevation
            lat = row[5].replace('+', '')
            lon = row[6].replace('+', '')
            elev = row[7].replace('+', '')
            begin = parser.parse(row[8]).isoformat()
            end = parser.parse(row[9]).isoformat()
            # coordinates come in thousandths of a degree; build a PostGIS
            # EWKT point, longitude first
            row[5] = 'SRID=4326;POINT(%s %s)' % ((float(lon) / 1000), (float(lat) / 1000))
            # raw elevation is scaled by a factor of 10
            row[6] = float(elev) / 10
            row[7] = begin
            row[8] = end
            row.pop()
            wbans.append(row[0])
            all_rows.append(row)
    writer = UnicodeCSVWriter(self.clean_station_info)
    writer.writerow(header)
    writer.writerows(all_rows)
    self.clean_station_info.seek(0)
Example #4
def _transform(self):
    reader = UnicodeCSVReader(self.station_raw_info)
    header = [
        'wban_code', 'station_name', 'country', 'state', 'call_sign',
        'location', 'elevation', 'begin', 'end'
    ]
    reader.next()
    self.clean_station_info = StringIO()
    all_rows = []
    wbans = []
    for row in reader:
        if row[1] == '99999':
            continue
        elif row[1] in wbans:
            continue
        elif row[5] and row[6]:
            row.pop(0)
            row.pop(3)
            lat = row[5].replace('+', '')
            lon = row[6].replace('+', '')
            elev = row[7].replace('+', '')
            begin = parser.parse(row[8]).isoformat()
            end = parser.parse(row[9]).isoformat()
            row[5] = 'SRID=4326;POINT(%s %s)' % ((float(lon) / 1000),
                                                 (float(lat) / 1000))
            row[6] = float(elev) / 10
            row[7] = begin
            row[8] = end
            row.pop()
            wbans.append(row[0])
            all_rows.append(row)
    writer = UnicodeCSVWriter(self.clean_station_info)
    writer.writerow(header)
    writer.writerows(all_rows)
    self.clean_station_info.seek(0)
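The 'SRID=4326;POINT(...)' strings built in Examples #3 and #4 are PostGIS EWKT, longitude first. A minimal sketch of the same conversion on hypothetical raw values:

# raw coordinates carry a leading '+' and are in thousandths of a degree
lat = '+41877'.replace('+', '')
lon = '-87623'
point = 'SRID=4326;POINT(%s %s)' % (float(lon) / 1000, float(lat) / 1000)
print point  # SRID=4326;POINT(-87.623 41.877)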
Example #5
def writeCSV(fpath, output):
    with open(fpath, 'wb') as f:
        writer = UnicodeCSVWriter(f)
        writer.writerows(output)
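A usage sketch for the helper above. The path and rows are hypothetical, and UnicodeCSVWriter is assumed to come from the Python 2-era csvkit.unicsv module:

from csvkit.unicsv import UnicodeCSVWriter  # assumed import

rows = [
    [u'id', u'name'],
    [u'1', u'Ana'],
]
writeCSV('/tmp/example.csv', rows)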
Example #6
def writeBlockingMap(session_id, block_data, canonical=False):
    pk_type = Integer
    if canonical:
        session_id = '{0}_cr'.format(session_id)
        pk_type = String
    metadata = MetaData()
    engine = worker_session.bind
    bkm = Table('block_{0}'.format(session_id), metadata,
        Column('block_key', Text),
        Column('record_id', pk_type)
    )
    bkm.drop(engine, checkfirst=True)
    bkm.create(engine)
    with open('/tmp/{0}.csv'.format(session_id), 'wb') as s:
        writer = UnicodeCSVWriter(s)
        writer.writerows(block_data)
    conn = engine.raw_connection()
    cur = conn.cursor()
    with open('/tmp/{0}.csv'.format(session_id), 'rb') as s:
        cur.copy_expert('COPY "block_{0}" FROM STDIN CSV'.format(session_id), s)
    conn.commit()
    
    os.remove('/tmp/{0}.csv'.format(session_id))

    block_key_idx = Index('bk_{0}_idx'.format(session_id), bkm.c.block_key)
    block_key_idx.create(engine)

    plural_key = Table('plural_key_{0}'.format(session_id), metadata,
        Column('block_key', Text),
        Column('block_id', Integer, primary_key=True)
    )
    plural_key.drop(engine, checkfirst=True)
    plural_key.create(engine)
    bkm_sel = select([bkm.c.block_key], from_obj=bkm)\
        .group_by(bkm.c.block_key)\
        .having(func.count(bkm.c.block_key) > 1)
    pl_ins = plural_key.insert()\
        .from_select([plural_key.c.block_key], bkm_sel)
    with engine.begin() as c:
        c.execute(pl_ins)
    
    pl_key_idx = Index('pk_{0}_idx'.format(session_id), plural_key.c.block_key)
    pl_key_idx.create(engine)

    with engine.begin() as c:
        c.execute('DROP TABLE IF EXISTS "plural_block_{0}"'.format(session_id))
    pl_bk_stmt = '''
        CREATE TABLE "plural_block_{0}" AS (
            SELECT p.block_id, b.record_id 
                FROM "block_{0}" AS b
                INNER JOIN "plural_key_{0}" AS p
                USING (block_key)
            )'''.format(session_id)
    with engine.begin() as c:
        c.execute(pl_bk_stmt)
    with engine.begin() as c:
        c.execute('''
            CREATE INDEX "pl_bk_idx_{0}" 
            ON "plural_block_{0}" (record_id)'''.format(session_id)
        )
    with engine.begin() as c:
        c.execute('DROP INDEX IF EXISTS "pl_bk_id_idx_{0}"'.format(session_id))
    with engine.begin() as c:
        c.execute(''' 
            CREATE UNIQUE INDEX "pl_bk_id_idx_{0}" on "plural_block_{0}" 
            (block_id, record_id) '''.format(session_id)
        )

    cov_bks_stmt = ''' 
        CREATE TABLE "covered_{0}" AS (
            SELECT record_id, 
            string_agg(CAST(block_id AS TEXT), ',' ORDER BY block_id) 
                AS sorted_ids
            FROM "plural_block_{0}"
            GROUP BY record_id
        )
    '''.format(session_id)
    with engine.begin() as c:
        c.execute('DROP TABLE IF EXISTS "covered_{0}"'.format(session_id))
    with engine.begin() as c:
        c.execute(cov_bks_stmt)
    with engine.begin() as c:
        c.execute(''' 
            CREATE UNIQUE INDEX "cov_bks_id_idx_{0}" ON "covered_{0}" (record_id)
            '''.format(session_id)
        )

    with engine.begin() as c:
        c.execute('DROP TABLE IF EXISTS "small_cov_{0}"'.format(session_id))
    small_cov = ''' 
        CREATE TABLE "small_cov_{0}" AS (
            SELECT record_id, 
                   block_id,
                   TRIM(',' FROM split_part(sorted_ids, CAST(block_id AS TEXT), 1))
                       AS smaller_ids
            FROM "plural_block_{0}"
            INNER JOIN "covered_{0}"
            USING (record_id)
        )
    '''.format(session_id)
    with engine.begin() as c:
        c.execute(small_cov)
    with engine.begin() as c:
        c.execute('''
            CREATE INDEX "sc_idx_{0}" 
            ON "small_cov_{0}" (record_id)'''.format(session_id)
        )
    with engine.begin() as c:
        c.execute('''
            CREATE INDEX "sc_bk_idx_{0}" 
            ON "small_cov_{0}" (block_id)'''.format(session_id)
        )
Example #7
    reader = UnicodeCSVDictReader(inp)
    comm_ids = [i['id'] for i in list(reader)]

    candidate_pattern = '/CommitteeDetailCandidates.aspx?id=%s'
    cand_scraper = CandidateScraper(url_pattern=candidate_pattern)
    cand_scraper.cache_storage = scrapelib.cache.FileCache('/cache/cache')
    cand_scraper.cache_write_only = False
    for comm_id in comm_ids:
        for cand in cand_scraper.scrape_one(comm_id):
            if cand:
                cand['CommitteeID'] = comm_id
                insert = 'insert into candidates("ID", "FullName", "FullAddress", \
                    "PartyName", "OfficeName", "CommitteeID") values (:ID, :FullName, :FullAddress, \
                    :PartyName, :OfficeName, :CommitteeID)'

                c.execute(insert, cand)
                conn.commit()
            else:
                print 'Got a 500 for %s' % comm_id
    c.execute('select * from candidates')
    header = list(map(lambda x: x[0], c.description))
    cands = c.fetchall()
    outp = StringIO()
    writer = UnicodeCSVWriter(outp)
    writer.writerow(header)
    writer.writerows(cands)
    outp.seek(0)
    k.key = 'Candidates.csv'
    k.set_contents_from_file(outp)
    k.make_public()
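The final upload uses the classic boto S3 Key API. A minimal standalone sketch, assuming boto credentials are available in the environment and using a hypothetical bucket name:

from StringIO import StringIO
from boto.s3.connection import S3Connection
from boto.s3.key import Key

conn = S3Connection()  # reads AWS credentials from the environment
bucket = conn.get_bucket('my-data-bucket')  # hypothetical bucket
k = Key(bucket)
k.key = 'Candidates.csv'
k.set_contents_from_file(StringIO('id,name\n1,Example\n'))
k.make_public()  # leave the object world-readable, as above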
Example #8
def getMatchingReady(session_id):
    addRowHash(session_id)
    cleanupTables(session_id)
    engine = worker_session.bind
    with engine.begin() as conn:
        conn.execute('DROP TABLE IF EXISTS "match_blocks_{0}"'\
            .format(session_id))
        conn.execute(''' 
            CREATE TABLE "match_blocks_{0}" (
                block_key VARCHAR, 
                record_id BIGINT
            )
            '''.format(session_id))
    sess = worker_session.query(DedupeSession).get(session_id)
    field_defs = json.loads(sess.field_defs)

    # Save Gazetteer settings
    d = dedupe.Gazetteer(field_defs)

    # Disabling canopy based predicates for now
    for definition in d.data_model.primary_fields:
        for idx, predicate in enumerate(definition.predicates):
            if predicate.type == 'TfidfPredicate':
                definition.predicates.pop(idx)

    d.readTraining(StringIO(sess.training_data))
    d.train()
    g_settings = StringIO()
    d.writeSettings(g_settings)
    g_settings.seek(0)
    sess.gaz_settings_file = g_settings.getvalue()
    worker_session.add(sess)
    worker_session.commit()

    # Write match_block table
    model_fields = list(set([f['field'] for f in field_defs]))
    fields = ', '.join(['p.{0}'.format(f) for f in model_fields])
    sel = ''' 
        SELECT 
          p.record_id, 
          {0}
        FROM "processed_{1}" AS p 
        LEFT JOIN "exact_match_{1}" AS e 
          ON p.record_id = e.match 
        WHERE e.record_id IS NULL;
        '''.format(fields, session_id)
    conn = engine.connect()
    rows = conn.execute(sel)
    data = ((getattr(row, 'record_id'), dict(zip(model_fields, row[1:]))) \
        for row in rows)
    block_gen = d.blocker(data)
    s = StringIO()
    writer = UnicodeCSVWriter(s)
    writer.writerows(block_gen)
    conn.close()
    s.seek(0)
    conn = engine.raw_connection()
    curs = conn.cursor()
    try:
        curs.copy_expert('COPY "match_blocks_{0}" FROM STDIN CSV'\
            .format(session_id), s)
        conn.commit()
    except Exception, e: # pragma: no cover
        conn.rollback()
        raise e
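The COPY step here, like the one in Example #6, streams an in-memory CSV straight into Postgres via psycopg2's copy_expert. A minimal sketch, assuming a hypothetical DSN and a pre-existing two-column target table:

from StringIO import StringIO
import psycopg2

conn = psycopg2.connect('dbname=mydb')  # hypothetical DSN
curs = conn.cursor()
s = StringIO('key1,1\nkey2,2\n')
# COPY ... FROM STDIN CSV parses the file-like object as CSV rows
curs.copy_expert('COPY "match_blocks_demo" FROM STDIN CSV', s)
conn.commit()
conn.close()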
Example #9
def writeCSV(fpath, output):
    with open(fpath, 'wb') as f:
        writer = UnicodeCSVWriter(f)
        writer.writerows(output)
Example #10
def make_db(fname, tblname):
    conn = sqlite3.connect(':memory:')
    t = Table.from_csv(open(fname, 'rb'), name=tblname)
    sql_table = make_table(t)
    create_st = make_create_table_statement(sql_table)
    print create_st
    insert = sql_table.insert()
    curs = conn.cursor()
    curs.execute(create_st)
    headers = t.headers()
    print headers
    rows = [dict(zip(headers, row)) for row in t.to_rows()]
    for row in rows:
        curs.execute(str(insert), row)
    return curs


if __name__ == '__main__':
    curs = make_db(
        'macoupin-budget-update/moucoupin-budget-department-desc.csv',
        'description')
    outp = open('macoupin-budget-update/macoupin-budget-2014-update.csv', 'wb')
    writer = UnicodeCSVWriter(outp)
    with open('macoupin-budget-update/macoupin-budget.csv', 'rb') as f:
        reader = UnicodeCSVReader(f)
        headers = reader.next()
        headers.insert(1, 'Fund ID')
        writer.writerow(headers)
        writer.writerows(add_attrs(reader, curs))
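In make_db above, str(insert) compiles the SQLAlchemy statement to SQL with :name placeholders, which sqlite3 accepts as named parameters bound from a dict. A minimal sketch of that idea without SQLAlchemy:

import sqlite3

conn = sqlite3.connect(':memory:')
curs = conn.cursor()
curs.execute('CREATE TABLE description (id INTEGER, name TEXT)')
# sqlite3 binds :named placeholders from the keys of a dict
curs.execute('INSERT INTO description (id, name) VALUES (:id, :name)',
             {'id': 1, 'name': u'Roads'})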
Example #11
            row[7] = res[0]
            row[6] = res[1]
        yield row

def make_db(fname, tblname):
    conn = sqlite3.connect(':memory:')
    t = Table.from_csv(open(fname, 'rb'), name=tblname)
    sql_table = make_table(t)
    create_st = make_create_table_statement(sql_table)
    print create_st
    insert = sql_table.insert()
    curs = conn.cursor()
    curs.execute(create_st)
    headers = t.headers()
    print headers
    rows = [dict(zip(headers, row)) for row in t.to_rows()]
    for row in rows:
        curs.execute(str(insert), row)
    return curs

if __name__ == '__main__':
    curs = make_db('macoupin-budget-update/moucoupin-budget-department-desc.csv', 'description')
    outp = open('macoupin-budget-update/macoupin-budget-2014-update.csv', 'wb')
    writer = UnicodeCSVWriter(outp)
    with open('macoupin-budget-update/macoupin-budget.csv', 'rb') as f:
        reader = UnicodeCSVReader(f)
        headers = reader.next()
        headers.insert(1, 'Fund ID')
        writer.writerow(headers)
        writer.writerows(add_attrs(reader, curs))