Example #1
 def _transform(self):
     reader = UnicodeCSVReader(self.station_raw_info)
     header = ['wban_code', 'station_name', 'country', 
               'state', 'call_sign', 'location', 'elevation', 
               'begin', 'end']
     reader.next()
     self.clean_station_info = StringIO()
     all_rows = []
     wbans = []
     for row in reader:
         if row[1] == '99999':
             continue
         elif row[1] in wbans:
             continue
         elif row[5] and row[6]:
             row.pop(0)
             row.pop(3)
             lat = row[5].replace('+', '')
             lon = row[6].replace('+', '')
             elev = row[7].replace('+', '')
             begin = parser.parse(row[8]).isoformat()
             end = parser.parse(row[9]).isoformat()
             row[5] = 'SRID=4326;POINT(%s %s)' % ((float(lon) / 1000), (float(lat) / 1000))
             row[6] = float(elev) / 10
             row[7] = begin
             row[8] = end
             row.pop()
             wbans.append(row[0])
             all_rows.append(row)
     writer = UnicodeCSVWriter(self.clean_station_info)
     writer.writerow(header)
     writer.writerows(all_rows)
     self.clean_station_info.seek(0)
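
The reader/writer pair above round-trips cleanly through an in-memory buffer. A minimal sketch, assuming Python 2 and csvkit's legacy csvkit.unicsv module:

from StringIO import StringIO
from csvkit.unicsv import UnicodeCSVReader, UnicodeCSVWriter

buf = StringIO()
writer = UnicodeCSVWriter(buf)
writer.writerow([u'wban_code', u'station_name'])
writer.writerow([u'94846', u'CHICAGO OHARE'])
buf.seek(0)

reader = UnicodeCSVReader(buf)
print reader.next()  # header row
for row in reader:
    print row        # each row comes back as a list of unicode strings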
Example #2
def clean(f):
    reader = UnicodeCSVReader(f)
    good = []
    bad = []
    header = reader.next()
    for row in reader:
        try:
            row[0] = int(row[0])
            row[3] = int(row[3])
            row[5] = int(row[5])
            row[7] = int(row[7])
            row[4] = row[4].replace(',', '')
            if len(row) == 12:
                good.append(row)
            else:
                bad.append(row)
        except (TypeError, ValueError):
            bad.append(row)
    goodf = open('data/trips_cleaned.csv', 'wb')
    badf = open('data/trips_dirty.csv', 'wb')
    goodwriter = UnicodeCSVWriter(goodf)
    goodwriter.writerow(header)
    goodwriter.writerows(good)
    badwriter = UnicodeCSVWriter(badf)
    badwriter.writerow(header)
    badwriter.writerows(bad)
    goodf.close()
    badf.close()
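
A hedged usage sketch for the function above; the input path is illustrative and the data/ output directory is assumed to exist:

with open('data/trips.csv', 'rb') as f:  # hypothetical input file
    clean(f)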
Example #3
def makeRawTable(contents):
    inp = StringIO(contents)
    reader = UnicodeCSVReader(inp)
    header = reader.next()
    header = [slugify(h) for h in header]
    outp = StringIO()
    writer = UnicodeCSVWriter(outp)
    writer.writerow(header)
    writer.writerows([[preProcess(unicode(i)) for i in r] for r in reader])
    outp.seek(0)
    conn = sqlite3.connect(':memory:')
    t = Table.from_csv(outp, 
                       name='raw_table', 
                       blanks_as_nulls=False, 
                       infer_types=False)
    sql_table = make_table(t)
    create_st = make_create_table_statement(sql_table)
    parts = create_st.split('raw_table (')
    create_st = '{0} raw_table ( record_id INTEGER PRIMARY KEY,{1}'.format(*parts)
    insert = sql_table.insert()
    curs = conn.cursor()
    curs.execute(create_st)
    rows = [dict(zip(header, row)) for row in t.to_rows()]
    for row in rows:
        curs.execute(str(insert), row)
    dump = StringIO()
    for line in conn.iterdump():
        dump.write(unidecode(line))
    dump.seek(0)
    return dump.getvalue(), header
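
The record_id primary key is spliced into csvkit's generated CREATE TABLE statement by splitting on the table name. A standalone sketch of just that string manipulation, using a made-up statement:

create_st = 'CREATE TABLE raw_table ( name VARCHAR(32), url VARCHAR(255) )'
parts = create_st.split('raw_table (')
create_st = '{0} raw_table ( record_id INTEGER PRIMARY KEY,{1}'.format(*parts)
print create_st
# CREATE TABLE  raw_table ( record_id INTEGER PRIMARY KEY, name VARCHAR(32), url VARCHAR(255) )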
Example #4
 def __init__(self, f, encoding='utf-8', line_numbers=False, **kwargs):
     self.row_count = 0
     self.line_numbers = line_numbers
     UnicodeCSVWriter.__init__(self,
                               f,
                               encoding,
                               lineterminator='\n',
                               **kwargs)
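
Extra keyword arguments such as lineterminator are forwarded to the underlying csv.writer, which is what this __init__ relies on. A small sketch, again assuming Python 2 csvkit:

from StringIO import StringIO
from csvkit.unicsv import UnicodeCSVWriter

buf = StringIO()
w = UnicodeCSVWriter(buf, encoding='utf-8', delimiter='|', lineterminator='\n')
w.writerow([u'name', u'url'])
print repr(buf.getvalue())  # should print something like 'name|url\n'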
Example #5
    def writerow(self, row):
        if self.line_numbers:
            row = list(row)
            self._append_line_number(row)

        # Convert embedded Mac line endings to unix style line endings so they get quoted
        row = [i.replace('\r', '\n') if isinstance(i, basestring) else i for i in row]

        UnicodeCSVWriter.writerow(self, row)
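
_append_line_number is not shown here; combining this writerow with the __init__ from Example #4 gives a rough picture. A sketch of such a subclass, with a hypothetical class name and an assumed _append_line_number (modeled on csvkit's own writer, not taken from the code above):

from csvkit.unicsv import UnicodeCSVWriter

class NumberingCSVWriter(UnicodeCSVWriter):  # hypothetical name
    def __init__(self, f, encoding='utf-8', line_numbers=False, **kwargs):
        self.row_count = 0
        self.line_numbers = line_numbers
        UnicodeCSVWriter.__init__(self, f, encoding,
                                  lineterminator='\n', **kwargs)

    def _append_line_number(self, row):
        # assumed behaviour: label the header row, then prepend a running count
        if self.row_count == 0:
            row.insert(0, 'line_number')
        else:
            row.insert(0, self.row_count)
        self.row_count += 1

    def writerow(self, row):
        if self.line_numbers:
            row = list(row)
            self._append_line_number(row)
        # convert embedded Mac line endings to unix style so they get quoted
        row = [i.replace('\r', '\n') if isinstance(i, basestring) else i
               for i in row]
        UnicodeCSVWriter.writerow(self, row)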
Example #6
def write_csv(filename,data,*headers):
  f = open(filename,'w')
  writer = UnicodeCSVWriter(f)
  writer.writerow(headers)
  for datum in data:
    row = []
    for h in headers:
      row.append(datum.get(h,''))
    writer.writerow(row)
  # close the file so buffered rows are flushed to disk
  f.close()
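
A hedged usage sketch with dictionaries keyed by the header names (the filename and fields are illustrative):

rows = [{u'name': u'Trenton', u'url': u'http://example.com/trenton'},
        {u'name': u'Newark'}]  # missing keys are written as ''
write_csv('towns.csv', rows, u'name', u'url')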
Example #7
    def writerow(self, row):
        if self.line_numbers:
            row = list(row)
            self._append_line_number(row)

        # Convert embedded Mac line endings to unix style line endings so they get quoted
        row = [
            i.replace('\r', '\n') if isinstance(i, basestring) else i
            for i in row
        ]

        UnicodeCSVWriter.writerow(self, row)
Example #8
    def _transform_daily(self, raw_weather, file_type, start_line=0, end_line=None):
        raw_weather.seek(0)
        reader = UnicodeCSVReader(raw_weather)
        header = reader.next()
        header = [x.strip() for x in header]

        self.clean_observations_daily = StringIO()
        writer = UnicodeCSVWriter(self.clean_observations_daily)
        out_header = ["wban_code","date","temp_max","temp_min",
                      "temp_avg","departure_from_normal",
                      "dewpoint_avg", "wetbulb_avg","weather_types",
                      "snowice_depth", "snowice_waterequiv",
                      "snowfall","precip_total", "station_pressure",
                      "sealevel_pressure", 
                      "resultant_windspeed", "resultant_winddirection", "resultant_winddirection_cardinal",
                      "avg_windspeed",
                      "max5_windspeed", "max5_winddirection","max5_winddirection_cardinal",
                      "max2_windspeed", "max2_winddirection","max2_winddirection_cardinal"]
        writer.writerow(out_header)

        row_count = 0
        for row in reader:
            self.current_row = row
            if (row_count % 100 == 0):
                if (self.debug == True):
                    self.debug_outfile.write("\rdaily parsing: row_count=%06d" % row_count)
                    self.debug_outfile.flush()

            if (start_line > row_count):
                row_count +=1
                continue
            if ((end_line is not None) and (row_count > end_line)):
                break

            row_count += 1
            #print len(header)
            #print len(row)
            #print zip(header,row)

            if (len(row) == 0):
                continue

            row_vals = getattr(self, '_parse_%s_row_daily' % file_type)(row, header)

            writer.writerow(row_vals)
        return self.clean_observations_daily
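
The getattr call builds a method name from file_type, so '_parse_%s_row_daily' % file_type resolves to something like _parse_zipfile_row_daily or _parse_tarfile_row_daily (the hourly counterpart in Example #9 names its zipfile and tarfile variants explicitly). A standalone sketch of the same dispatch pattern, with made-up handlers:

class Demo(object):
    def _parse_zipfile_row_daily(self, row, header):
        return dict(zip(header, row))

    def _parse_tarfile_row_daily(self, row, header):
        return dict(zip(header, [v.strip() for v in row]))

    def parse(self, file_type, row, header):
        # same dynamic dispatch: build the method name, then call it
        return getattr(self, '_parse_%s_row_daily' % file_type)(row, header)

print Demo().parse('zipfile', ['94846', '20140226'], ['wban_code', 'date'])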
Example #9
    def _transform_hourly(self, raw_weather, file_type, start_line=0, end_line=None):
        raw_weather.seek(0)
        reader = UnicodeCSVReader(raw_weather)
        header= reader.next()
        # strip leading and trailing whitespace from header (e.g. from tarfiles)
        header = [x.strip() for x in header]

        self.clean_observations_hourly = StringIO()
        writer = UnicodeCSVWriter(self.clean_observations_hourly)
        out_header = ["wban_code","datetime","old_station_type","station_type", \
                      "sky_condition","sky_condition_top","visibility",\
                      "weather_types","drybulb_fahrenheit","wetbulb_fahrenheit",\
                      "dewpoint_fahrenheit","relative_humidity",\
                      "wind_speed","wind_direction","wind_direction_cardinal",\
                      "station_pressure","sealevel_pressure","report_type",\
                      "hourly_precip"]
        writer.writerow(out_header)

        row_count = 0
        for row in reader:
            if (row_count % 1000 == 0):
                if (self.debug==True):
                    self.debug_outfile.write( "\rparsing: row_count=%06d" % row_count)
                    self.debug_outfile.flush()

            if (start_line > row_count):
                row_count +=1
                continue
            if ((end_line is not None) and (row_count > end_line)):
                break

            row_count += 1

            if (len(row) == 0):
                continue

            # this calls either self._parse_zipfile_row_hourly
            # or self._parse_tarfile_row_hourly
            row_vals = getattr(self, '_parse_%s_row_hourly' % file_type)(row, header)
            if (not row_vals):
                continue

            writer.writerow(row_vals)
        return self.clean_observations_hourly
Example #10
def write_table_data(flo, state_fips, sumlev, table_id):
    """Given a File-Like Object, write a table to it"""
    w = UnicodeCSVWriter(flo)

    metadata = fetch_table_label(table_id)

    header = ['GEOID', 'SUMLEV'] + METADATA_HEADERS + ['POP100.2000','HU100.2000']
    for key in sorted(metadata['labels']):
        header.extend([key,"%s.2000" % key])
    w.writerow(header)

    query = {'sumlev': sumlev, 'metadata.STATE': state_fips }
    collection = utils.get_geography_collection()
    for geography in collection.find(query):
        row = [geography['geoid'],geography['sumlev']]

        for h in METADATA_HEADERS:
            row.append(geography['metadata'][h])

        pop2000,hu2000 = get_2000_top_level_counts(geography)
        row.extend([pop2000,hu2000])

        for key in sorted(metadata['labels']):
            try:
                row.append(geography['data']['2010'][table_id][key])
            except KeyError, e:
                if table_id.startswith('PCO'):
                    print "No data for %s at %s" % (table_id, sumlev)
                    return
                raise e # don't otherwise expect this error, so raise it...
            try:
                row.append(geography['data']['2000'][table_id][key])
            except KeyError:
                row.append('')
        w.writerow(row)
Example #11
def dedupeCanon(session_id, threshold=0.25):
    dd = worker_session.query(DedupeSession).get(session_id)
    engine = worker_session.bind
    metadata = MetaData()
    writeCanonRep(session_id)
    writeProcessedTable(session_id, 
                        proc_table_format='processed_{0}_cr', 
                        raw_table_format='cr_{0}')
    entity_table_name = 'entity_{0}_cr'.format(session_id)
    entity_table = entity_map(entity_table_name, metadata, record_id_type=String)
    entity_table.drop(bind=engine, checkfirst=True)
    entity_table.create(bind=engine)
    block_gen = blockDedupe(session_id, 
        table_name='processed_{0}_cr'.format(session_id), 
        entity_table_name='entity_{0}_cr'.format(session_id), 
        canonical=True)
    writeBlockingMap(session_id, block_gen, canonical=True)
    clustered_dupes = clusterDedupe(session_id, canonical=True, threshold=threshold)
    if clustered_dupes:
        fname = '/tmp/clusters_{0}.csv'.format(session_id)
        with open(fname, 'wb') as f:
            writer = UnicodeCSVWriter(f)
            for ids, scores in clustered_dupes:
                new_ent = unicode(uuid4())
                writer.writerow([
                    new_ent,
                    ids[0],
                    scores[0],
                    None,
                    False,
                    False,
                ])
                for id, score in zip(ids[1:], scores):
                    writer.writerow([
                        new_ent,
                        id,
                        score,
                        ids[0],
                        False,
                        False,
                    ])
        with open(fname, 'rb') as f:
            conn = engine.raw_connection()
            cur = conn.cursor()
            try:
                cur.copy_expert(''' 
                    COPY "entity_{0}_cr" (
                        entity_id,
                        record_id,
                        confidence,
                        target_record_id,
                        clustered,
                        checked_out
                    ) 
                    FROM STDIN CSV'''.format(session_id), f)
                conn.commit()
                os.remove(fname)
            except Exception, e: # pragma: no cover
                conn.rollback()
                raise e
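
Several examples on this page share the same bulk-load pattern: write rows to CSV with UnicodeCSVWriter, then stream them into Postgres with psycopg2's copy_expert. A stripped-down sketch of that pattern; the DSN and table name are hypothetical:

import psycopg2
from StringIO import StringIO
from csvkit.unicsv import UnicodeCSVWriter

buf = StringIO()
writer = UnicodeCSVWriter(buf)
writer.writerows([[u'a', u'1'], [u'b', u'2']])
buf.seek(0)

conn = psycopg2.connect('dbname=test')  # hypothetical DSN
cur = conn.cursor()
# stream the buffer straight into the table; "my_table" is made up
cur.copy_expert('COPY "my_table" (label, value) FROM STDIN CSV', buf)
conn.commit()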
Example #12
 def _transform(self):
     reader = UnicodeCSVReader(self.station_raw_info)
     header = [
         'wban_code', 'station_name', 'country', 'state', 'call_sign',
         'location', 'elevation', 'begin', 'end'
     ]
     reader.next()
     self.clean_station_info = StringIO()
     all_rows = []
     wbans = []
     for row in reader:
         if row[1] == '99999':
             continue
         elif row[1] in wbans:
             continue
         elif row[5] and row[6]:
             row.pop(0)
             row.pop(3)
             lat = row[5].replace('+', '')
             lon = row[6].replace('+', '')
             elev = row[7].replace('+', '')
             begin = parser.parse(row[8]).isoformat()
             end = parser.parse(row[9]).isoformat()
             row[5] = 'SRID=4326;POINT(%s %s)' % ((float(lon) / 1000),
                                                  (float(lat) / 1000))
             row[6] = float(elev) / 10
             row[7] = begin
             row[8] = end
             row.pop()
             wbans.append(row[0])
             all_rows.append(row)
     writer = UnicodeCSVWriter(self.clean_station_info)
     writer.writerow(header)
     writer.writerows(all_rows)
     self.clean_station_info.seek(0)
Example #13
    def _transform_daily(self,
                         raw_weather,
                         file_type,
                         start_line=0,
                         end_line=None):
        raw_weather.seek(0)
        reader = UnicodeCSVReader(raw_weather)
        header = reader.next()
        header = [x.strip() for x in header]

        self.clean_observations_daily = StringIO()
        writer = UnicodeCSVWriter(self.clean_observations_daily)
        out_header = [
            "wban_code", "date", "temp_max", "temp_min", "temp_avg",
            "departure_from_normal", "dewpoint_avg", "wetbulb_avg",
            "weather_types", "snowice_depth", "snowice_waterequiv", "snowfall",
            "precip_total", "station_pressure", "sealevel_pressure",
            "resultant_windspeed", "resultant_winddirection",
            "resultant_winddirection_cardinal", "avg_windspeed",
            "max5_windspeed", "max5_winddirection",
            "max5_winddirection_cardinal", "max2_windspeed",
            "max2_winddirection", "max2_winddirection_cardinal"
        ]
        writer.writerow(out_header)

        row_count = 0
        for row in reader:
            self.current_row = row
            if (row_count % 100 == 0):
                if (self.debug == True):
                    self.debug_outfile.write(
                        "\rdaily parsing: row_count=%06d" % row_count)
                    self.debug_outfile.flush()

            if (start_line > row_count):
                row_count += 1
                continue
            if ((end_line is not None) and (row_count > end_line)):
                break

            row_count += 1
            #print len(header)
            #print len(row)
            #print zip(header,row)

            if (len(row) == 0):
                continue

            row_vals = getattr(self, '_parse_%s_row_daily' % file_type)(row,
                                                                        header)

            writer.writerow(row_vals)
        return self.clean_observations_daily
Example #14
    def _transform_hourly(self,
                          raw_weather,
                          file_type,
                          start_line=0,
                          end_line=None):
        raw_weather.seek(0)
        reader = UnicodeCSVReader(raw_weather)
        header = reader.next()
        # strip leading and trailing whitespace from header (e.g. from tarfiles)
        header = [x.strip() for x in header]

        self.clean_observations_hourly = StringIO()
        writer = UnicodeCSVWriter(self.clean_observations_hourly)
        out_header = ["wban_code","datetime","old_station_type","station_type", \
                      "sky_condition","sky_condition_top","visibility",\
                      "weather_types","drybulb_fahrenheit","wetbulb_fahrenheit",\
                      "dewpoint_fahrenheit","relative_humidity",\
                      "wind_speed","wind_direction","wind_direction_cardinal",\
                      "station_pressure","sealevel_pressure","report_type",\
                      "hourly_precip"]
        writer.writerow(out_header)

        row_count = 0
        for row in reader:
            if (row_count % 1000 == 0):
                if (self.debug == True):
                    self.debug_outfile.write("\rparsing: row_count=%06d" %
                                             row_count)
                    self.debug_outfile.flush()

            if (start_line > row_count):
                row_count += 1
                continue
            if ((end_line is not None) and (row_count > end_line)):
                break

            row_count += 1

            if (len(row) == 0):
                continue

            # this calls either self._parse_zipfile_row_hourly
            # or self._parse_tarfile_row_hourly
            row_vals = getattr(self,
                               '_parse_%s_row_hourly' % file_type)(row, header)
            if (not row_vals):
                continue

            writer.writerow(row_vals)
        return self.clean_observations_hourly
Example #15
def writeCanonRep(session_id, name_pattern='cr_{0}'):
    engine = worker_session.bind
    metadata = MetaData()
    entity = Table('entity_{0}'.format(session_id), metadata,
        autoload=True, autoload_with=engine, keep_existing=True)
    proc_table = Table('processed_{0}'.format(session_id), metadata,
        autoload=True, autoload_with=engine, keep_existing=True)

    cr_cols = [Column('record_id', String, primary_key=True)]
    for col in proc_table.columns:
        if col.name != 'record_id':
            cr_cols.append(Column(col.name, col.type))
    cr = Table(name_pattern.format(session_id), metadata, *cr_cols)
    cr.drop(bind=engine, checkfirst=True)
    cr.create(bind=engine)

    cols = [entity.c.entity_id]
    col_names = [c for c in proc_table.columns.keys() if c != 'record_id']
    for name in col_names:
        cols.append(label(name, func.array_agg(getattr(proc_table.c, name))))
    rows = worker_session.query(*cols)\
        .filter(entity.c.record_id == proc_table.c.record_id)\
        .group_by(entity.c.entity_id)
    names = cr.columns.keys()
    with open('/tmp/{0}.csv'.format(name_pattern.format(session_id)), 'wb') as f:
        writer = UnicodeCSVWriter(f)
        writer.writerow(names)
        for row in rows:
            r = [row.entity_id]
            dicts = [dict(**{n:None for n in col_names}) for i in range(len(row[1]))]
            for idx, dct in enumerate(dicts):
                for name in col_names:
                    dicts[idx][name] = unicode(getattr(row, name)[idx])
            canon_form = dedupe.canonicalize(dicts)
            r.extend([canon_form[k] for k in names if canon_form.get(k) is not None])
            writer.writerow(r)
    canon_table_name = name_pattern.format(session_id)
    copy_st = 'COPY "{0}" ('.format(canon_table_name)
    for idx, name in enumerate(names):
        if idx < len(names) - 1:
            copy_st += '"{0}", '.format(name)
        else:
            copy_st += '"{0}")'.format(name)
    else:
        copy_st += "FROM STDIN WITH (FORMAT CSV, HEADER TRUE, DELIMITER ',', NULL ' ')"
    conn = engine.raw_connection()
    cur = conn.cursor()
    with open('/tmp/{0}.csv'.format(name_pattern.format(session_id)), 'rb') as f:
        cur.copy_expert(copy_st, f)
    conn.commit()
Example #16
#!/usr/bin/env python
#mechanize acts as a browser to collect html response
from mechanize import Browser
#beautifulsoup lets you strip out the html and parse it through its tree
from BeautifulSoup import BeautifulSoup
#csvkit allows you to output to a csv file easily
from csvkit.unicsv import UnicodeCSVWriter
#re handles regular expressions
import re

#open a csvfile to write to it, set a delimiter and write the header row
outfile = open("sitesdirt.csv", "w")
w = UnicodeCSVWriter(outfile, delimiter=",", encoding="Cp1252")
w.writerow(['name', 'url'])

mech = Browser()
url = "http://www.state.nj.us/nj/govinfo/county/localgov.html"
page = mech.open(url)

html = page.read()
soup = BeautifulSoup(html)

#look for the section with the id anchorSection, this is the main body of the url listings
for row in soup.findAll('div', {"id": "anchorSection"}):
    #ignore the rows with anchor tags without an href tag
    for anchor in row.findAll('a', href=True):
        name = anchor.string
        #give me whatever is in the href call, the actual url of the link
        url = anchor['href'].decode()
        record = (name, url)
        w.writerow(record)
Example #17
    reader = UnicodeCSVDictReader(inp)
    comm_ids = [i['id'] for i in list(reader)]

    candidate_pattern = '/CommitteeDetailCandidates.aspx?id=%s'
    cand_scraper = CandidateScraper(url_pattern=candidate_pattern)
    cand_scraper.cache_storage = scrapelib.cache.FileCache('/cache/cache')
    cand_scraper.cache_write_only = False
    for comm_id in comm_ids:
        for cand in cand_scraper.scrape_one(comm_id):
            if cand:
                cand['CommitteeID'] = comm_id
                insert = 'insert into candidates("ID", "FullName", "FullAddress", \
                    "PartyName", "OfficeName", "CommitteeID") values (:ID, :FullName, :FullAddress, \
                    :PartyName, :OfficeName, :CommitteeID)'

                c.execute(insert, cand)
                conn.commit()
            else:
                print 'Got a 500 for %s' % comm_id
    c.execute('select * from candidates')
    header = list(map(lambda x: x[0], c.description))
    cands = c.fetchall()
    outp = StringIO()
    writer = UnicodeCSVWriter(outp)
    writer.writerow(header)
    writer.writerows(cands)
    outp.seek(0)
    k.key = 'Candidates.csv'
    k.set_contents_from_file(outp)
    k.make_public()
Example #18
 def __init__(self, f, encoding='utf-8', line_numbers=False, **kwargs):
     self.row_count = 0
     self.line_numbers = line_numbers
     UnicodeCSVWriter.__init__(self, f, encoding, lineterminator='\n', **kwargs)
Example #19
    return cmp(a_subtype, b_subtype)


if __name__ == '__main__':
    if len(sys.argv) < 2:
        sys.exit(
            'You must provide the filename for the CSV output as an argument to this script.'
        )

    FILENAME = sys.argv[1]
    with open(FILENAME, "w") as f:
        collection = utils.get_label_collection()

        labelset = collection.find_one({'dataset': 'SF1'})

        w = UnicodeCSVWriter(f)
        w.writerow([
            'table_code', 'table_desc', 'table_universe', 'table_size',
            'col_code', 'col_desc', 'indent', 'parent', 'has_children',
            'col_code_2000'
        ])
        for table_code in sorted(labelset['tables'], cmp=compare_table_codes):
            t = labelset['tables'][table_code]
            row_base = [table_code, t['name'], t['universe'], t['size']]
            for label_code in sorted(t['labels']):
                l = t['labels'][label_code]
                row = row_base[:]
                if l['parent'] is None: parent = ''
                else: parent = l['parent']
                if l['key_2000'] is None: key_2000 = ''
                else: key_2000 = l['key_2000']
                row.extend([
                    l['key'], l['text'], l['indent'], parent,
                    l['has_children'], key_2000
                ])
                w.writerow(row)
Example #20
def getMatchingReady(session_id):
    addRowHash(session_id)
    cleanupTables(session_id)
    engine = worker_session.bind
    with engine.begin() as conn:
        conn.execute('DROP TABLE IF EXISTS "match_blocks_{0}"'\
            .format(session_id))
        conn.execute(''' 
            CREATE TABLE "match_blocks_{0}" (
                block_key VARCHAR, 
                record_id BIGINT
            )
            '''.format(session_id))
    sess = worker_session.query(DedupeSession).get(session_id)
    field_defs = json.loads(sess.field_defs)

    # Save Gazetteer settings
    d = dedupe.Gazetteer(field_defs)

    # Disabling canopy based predicates for now
    for definition in d.data_model.primary_fields:
        for idx, predicate in enumerate(definition.predicates):
            if predicate.type == 'TfidfPredicate':
                definition.predicates.pop(idx)

    d.readTraining(StringIO(sess.training_data))
    d.train()
    g_settings = StringIO()
    d.writeSettings(g_settings)
    g_settings.seek(0)
    sess.gaz_settings_file = g_settings.getvalue()
    worker_session.add(sess)
    worker_session.commit()

    # Write match_block table
    model_fields = list(set([f['field'] for f in field_defs]))
    fields = ', '.join(['p.{0}'.format(f) for f in model_fields])
    sel = ''' 
        SELECT 
          p.record_id, 
          {0}
        FROM "processed_{1}" AS p 
        LEFT JOIN "exact_match_{1}" AS e 
          ON p.record_id = e.match 
        WHERE e.record_id IS NULL;
        '''.format(fields, session_id)
    conn = engine.connect()
    rows = conn.execute(sel)
    data = ((getattr(row, 'record_id'), dict(zip(model_fields, row[1:]))) \
        for row in rows)
    block_gen = d.blocker(data)
    s = StringIO()
    writer = UnicodeCSVWriter(s)
    writer.writerows(block_gen)
    conn.close()
    s.seek(0)
    conn = engine.raw_connection()
    curs = conn.cursor()
    try:
        curs.copy_expert('COPY "match_blocks_{0}" FROM STDIN CSV'\
            .format(session_id), s)
        conn.commit()
    except Exception, e: # pragma: no cover
        conn.rollback()
        raise e
Example #21
            row[7] = res[0]
            row[6] = res[1]
        yield row

def make_db(fname, tblname):
    conn = sqlite3.connect(':memory:')
    t = Table.from_csv(open(fname, 'rb'), name=tblname)
    sql_table = make_table(t)
    create_st = make_create_table_statement(sql_table)
    print create_st
    insert = sql_table.insert()
    curs = conn.cursor()
    curs.execute(create_st)
    headers = t.headers()
    print headers
    rows = [dict(zip(headers, row)) for row in t.to_rows()]
    for row in rows:
        curs.execute(str(insert), row)
    return curs

if __name__ == '__main__':
    curs = make_db('macoupin-budget-update/moucoupin-budget-department-desc.csv', 'description')
    outp = open('macoupin-budget-update/macoupin-budget-2014-update.csv', 'wb')
    writer = UnicodeCSVWriter(outp)
    with open('macoupin-budget-update/macoupin-budget.csv', 'rb') as f:
        reader = UnicodeCSVReader(f)
        headers = reader.next()
        headers.insert(1, 'Fund ID')
        writer.writerow(headers)
        writer.writerows(add_attrs(reader, curs))
Example #22
def make_db(fname, tblname):
    conn = sqlite3.connect(':memory:')
    t = Table.from_csv(open(fname, 'rb'), name=tblname)
    sql_table = make_table(t)
    create_st = make_create_table_statement(sql_table)
    print create_st
    insert = sql_table.insert()
    curs = conn.cursor()
    curs.execute(create_st)
    headers = t.headers()
    print headers
    rows = [dict(zip(headers, row)) for row in t.to_rows()]
    for row in rows:
        curs.execute(str(insert), row)
    return curs


if __name__ == '__main__':
    curs = make_db(
        'macoupin-budget-update/moucoupin-budget-department-desc.csv',
        'description')
    outp = open('macoupin-budget-update/macoupin-budget-2014-update.csv', 'wb')
    writer = UnicodeCSVWriter(outp)
    with open('macoupin-budget-update/macoupin-budget.csv', 'rb') as f:
        reader = UnicodeCSVReader(f)
        headers = reader.next()
        headers.insert(1, 'Fund ID')
        writer.writerow(headers)
        writer.writerows(add_attrs(reader, curs))
Example #23
#!/usr/bin/env python
import urllib2
from datetime import date, timedelta
from BeautifulSoup import BeautifulSoup
from csvkit.unicsv import UnicodeCSVWriter

# This creates the csv file using the csvkit module and writes to it, creating the header rows
outfile = open("nicar14sched.csv", "w")
w = UnicodeCSVWriter(outfile, delimiter=",", encoding="Cp1252")
w.writerow([
    'Subject', 'Start Date', 'Start Time', 'End Date', 'End Time',
    'All Day Event', 'Description', 'Location', 'Private'
])

private = False
all_day = False

#update the URL when you reuse the script next year
url = "http://www.ire.org/conferences/nicar-2014/schedule/"

#use urllib2 to send a request to the URL and gather the html response
response = urllib2.urlopen(url)
html = response.read()

#read the html and parse it using Beautiful soup
soup = BeautifulSoup(html)

#update the date of the conference
year = 2014
month = 2
adate = 26
Example #24
def _init(conference, the_date, url):

    # update the date of the conference
    year_num = str(the_date.year)[2:]

    output_file = conference + year_num + "sched.csv"

    # This creates the csv file using the csvkit module and writes to it, creating the header rows
    outfile = open(output_file, "w")
    w = UnicodeCSVWriter(outfile, delimiter=",", encoding="utf-8")
    w.writerow(
        [
            "Topic",
            "Subject",
            "Start Date",
            "Start Time",
            "End Date",
            "End Time",
            "All Day Event",
            "Description",
            "Location",
            "Private",
        ]
    )

    private = False
    all_day = False

    # use urllib2 to send a request to the URL and gather the html response
    response = urllib2.urlopen(url)
    html = response.read()

    # read the html and parse it using Beautiful soup
    soup = BeautifulSoup(html)

    # The first day of the conference is a Wednesday, or 2, since the list starts counting at 0.
    day = 2
    days = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]
    d = timedelta(days=1)

    # find the "ul class 'listview pane'" which wraps around each day's schedule and parse the items in it.
    for row in soup.findAll("ul", {"class": "listview pane"}):
        for row in row.findAll("h3", {"class": "title3"}):
            name = row.find("a").string
            page = row.find("a").get("href")
            url = "http://ire.org" + page

            topic = tag_session_with_topic(name)

            speaker = name.findNext("p")
            descall = speaker.findNext("p")
            desc = descall.findNext("p").contents
            newdesc = ""
            for item in desc:
                newdesc += item.string
            desc = newdesc
            subtree = speaker.strong
            if subtree == None:
                speaker2 = None
            else:
                subtree.extract()
                speaker2 = speaker.string
                speaker2 = speaker2.strip()
            try:
                speaker2 = "Speakers: " + speaker2
            except:
                speaker2 = "Speakers TBA"
            place = row.findNext("div", {"class": "col-15 meta"}).p.string
            time = place.findNext("p").string
            if time == desc:
                desc = None
            else:
                desc = desc

            mytime = time.split("-")
            start_time = mytime[0].strip()
            if len(start_time.split()[0]) < 3:
                start_time = start_time.split()[0] + ":00:00 " + start_time.split()[1]
            else:
                start_time = start_time
            end_time = mytime[1].strip()
            if len(end_time.split()[0]) < 3:
                end_time = end_time.split()[0] + ":00:00 " + end_time.split()[1]
            else:
                end_time = end_time

            dayofweek = str(the_date)
            if desc != None and speaker2 != "Speakers: TBA":
                desc = speaker2 + " - " + desc
            elif desc != None:
                desc = desc
            else:
                desc = speaker2

            desc = desc + " | URL: " + url
            record = (topic, name, the_date, start_time, the_date, end_time, all_day, desc, place, private)

            # write the record for the single class to the csv
            w.writerow(record)
        # at the end of each day's ul item, add 1 to the day of the week and loop through it again.
        the_date = the_date + d

    # always remember to close the file at the end to save it properly
    outfile.close()
Example #25
def main():
    repo_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "../"))
    products_filename = os.path.join(repo_root, "data/products.csv")
    stock_filename = os.path.join(repo_root, "data/stock.csv")

    print "Connecting to S3 bucket."
    conn = S3Connection()
    bucket = conn.get_bucket("flother")

    print "Cleaning products."
    products = bucket.get_key("vinbudin/data/Product.csv", validate=False)
    product_rows = UnicodeCSVDictReader(io.BytesIO(products.read()))
    with open(products_filename, "wb") as fh:
        products_output = UnicodeCSVWriter(fh, lineterminator="\n")
        products_output.writerow(PRODUCT_FIELDS)
        for row in sorted(product_rows, key=lambda r: r["id"]):
            products_output.writerow([row[key] for key in PRODUCT_FIELDS])

    print "Cleaning stock."
    stock = bucket.get_key("vinbudin/data/Stock.csv", validate=False)
    stock_rows = UnicodeCSVDictReader(io.BytesIO(stock.read()))
    with open(stock_filename, "wb") as fh:
        stock_output = UnicodeCSVWriter(fh, lineterminator="\n")
        stock_output.writerow(STOCK_FIELDS)
        for row in sorted(stock_rows,
                          key=lambda r: (int(r["product_id"]), r["store"])):
            stock_output.writerow([row[key] for key in STOCK_FIELDS])

    conn.close()
    print "Finished downloading from S3."

    repo = Repo(repo_root)
    repo.git.reset()
    repo.remotes.origin.pull()
    if repo.is_dirty():
        print "Changes to commit."
        repo.index.add([products_filename, stock_filename])
        repo.index.commit("Add latest inventory data")
        print "Committed locally."
        repo.remotes.origin.push()
        print "Pushed to origin."
    else:
        print "No changes to commit."
Example #26
#!/usr/bin/env python
#mechanize acts as a browser to collect html response
from mechanize import Browser
#beautifulsoup lets you strip out the html and parse it through its tree
from BeautifulSoup import BeautifulSoup
#csvkit allows you to output to a csv file easily
from csvkit.unicsv import UnicodeCSVWriter
#re handles regular expressions
import re

#open a csvfile to write to it, set a delimiter and write the header row
outfile = open("sitesdirt.csv", "w")
w = UnicodeCSVWriter(outfile,delimiter=",",encoding="Cp1252")
w.writerow(['name','url'])

mech = Browser()
url = "http://www.state.nj.us/nj/govinfo/county/localgov.html"
page = mech.open(url)

html = page.read()
soup = BeautifulSoup(html)

#look for the section with the id anchorSection, this is the main body of the url listings
for row in soup.findAll('div', {"id" : "anchorSection"}):
    #ignore the rows with anchor tags without an href tag
    for anchor in row.findAll('a', href=True):
        name = anchor.string
        #give me whatever is in the href call, the actual url of the link
        url = anchor['href'].decode()
        record = (name, url)
        w.writerow(record)
Example #27
def writeBlockingMap(session_id, block_data, canonical=False):
    pk_type = Integer
    if canonical:
        session_id = '{0}_cr'.format(session_id)
        pk_type = String
    metadata = MetaData()
    engine = worker_session.bind
    bkm = Table('block_{0}'.format(session_id), metadata,
        Column('block_key', Text),
        Column('record_id', pk_type)
    )
    bkm.drop(engine, checkfirst=True)
    bkm.create(engine)
    with open('/tmp/{0}.csv'.format(session_id), 'wb') as s:
        writer = UnicodeCSVWriter(s)
        writer.writerows(block_data)
    conn = engine.raw_connection()
    cur = conn.cursor()
    with open('/tmp/{0}.csv'.format(session_id), 'rb') as s:
        cur.copy_expert('COPY "block_{0}" FROM STDIN CSV'.format(session_id), s)
    conn.commit()
    
    os.remove('/tmp/{0}.csv'.format(session_id))

    block_key_idx = Index('bk_{0}_idx'.format(session_id), bkm.c.block_key)
    block_key_idx.create(engine)

    plural_key = Table('plural_key_{0}'.format(session_id), metadata,
        Column('block_key', Text),
        Column('block_id', Integer, primary_key=True)
    )
    plural_key.drop(engine, checkfirst=True)
    plural_key.create(engine)
    bkm_sel = select([bkm.c.block_key], from_obj=bkm)\
        .group_by(bkm.c.block_key)\
        .having(func.count(bkm.c.block_key) > 1)
    pl_ins = plural_key.insert()\
        .from_select([plural_key.c.block_key], bkm_sel)
    with engine.begin() as c:
        c.execute(pl_ins)
    
    pl_key_idx = Index('pk_{0}_idx'.format(session_id), plural_key.c.block_key)
    pl_key_idx.create(engine)

    with engine.begin() as c:
        c.execute('DROP TABLE IF EXISTS "plural_block_{0}"'.format(session_id))
    pl_bk_stmt = '''
        CREATE TABLE "plural_block_{0}" AS (
            SELECT p.block_id, b.record_id 
                FROM "block_{0}" AS b
                INNER JOIN "plural_key_{0}" AS p
                USING (block_key)
            )'''.format(session_id)
    with engine.begin() as c:
        c.execute(pl_bk_stmt)
    with engine.begin() as c:
        c.execute('''
            CREATE INDEX "pl_bk_idx_{0}" 
            ON "plural_block_{0}" (record_id)'''.format(session_id)
        )
    with engine.begin() as c:
        c.execute('DROP INDEX IF EXISTS "pl_bk_id_idx_{0}"'.format(session_id))
    with engine.begin() as c:
        c.execute(''' 
            CREATE UNIQUE INDEX "pl_bk_id_idx_{0}" on "plural_block_{0}" 
            (block_id, record_id) '''.format(session_id)
        )

    cov_bks_stmt = ''' 
        CREATE TABLE "covered_{0}" AS (
            SELECT record_id, 
            string_agg(CAST(block_id AS TEXT), ',' ORDER BY block_id) 
                AS sorted_ids
            FROM "plural_block_{0}"
            GROUP BY record_id
        )
    '''.format(session_id)
    with engine.begin() as c:
        c.execute('DROP TABLE IF EXISTS "covered_{0}"'.format(session_id))
    with engine.begin() as c:
        c.execute(cov_bks_stmt)
    with engine.begin() as c:
        c.execute(''' 
            CREATE UNIQUE INDEX "cov_bks_id_idx_{0}" ON "covered_{0}" (record_id)
            '''.format(session_id)
        )

    with engine.begin() as c:
        c.execute('DROP TABLE IF EXISTS "small_cov_{0}"'.format(session_id))
    small_cov = ''' 
        CREATE TABLE "small_cov_{0}" AS (
            SELECT record_id, 
                   block_id,
                   TRIM(',' FROM split_part(sorted_ids, CAST(block_id AS TEXT), 1))
                       AS smaller_ids
            FROM "plural_block_{0}"
            INNER JOIN "covered_{0}"
            USING (record_id)
        )
    '''.format(session_id)
    with engine.begin() as c:
        c.execute(small_cov)
    with engine.begin() as c:
        c.execute('''
            CREATE INDEX "sc_idx_{0}" 
            ON "small_cov_{0}" (record_id)'''.format(session_id)
        )
    with engine.begin() as c:
        c.execute('''
            CREATE INDEX "sc_bk_idx_{0}" 
            ON "small_cov_{0}" (block_id)'''.format(session_id)
        )
Example #28
def updateEntityMap(clustered_dupes,
                    session_id,
                    raw_table=None,
                    entity_table=None):
    
    """ 
    Add to entity map table after training
    """
    fname = '/tmp/clusters_{0}.csv'.format(session_id)
    with open(fname, 'wb') as f:
        writer = UnicodeCSVWriter(f)
        for ids, scores in clustered_dupes:
            new_ent = unicode(uuid4())
            writer.writerow([
                new_ent,
                ids[0],
                scores[0],
                None,
            ])
            for id, score in zip(ids[1:], scores[1:]):
                writer.writerow([
                    new_ent,
                    id,
                    score,
                    ids[0],
                ])
    engine = worker_session.bind
    metadata = MetaData()
    if not entity_table:
        entity_table = 'entity_{0}'.format(session_id)
    entity = Table(entity_table, metadata,
        autoload=True, autoload_with=engine, keep_existing=True)
    record_id_type = entity.c.record_id.type
    temp_table = Table('temp_{0}'.format(session_id), metadata,
                       Column('entity_id', String),
                       Column('record_id', record_id_type),
                       Column('target_record_id', record_id_type),
                       Column('confidence', Float))
    temp_table.drop(bind=engine, checkfirst=True)
    temp_table.create(bind=engine)
    with open(fname, 'rb') as f:
        conn = engine.raw_connection()
        cur = conn.cursor()
        cur.copy_expert(''' 
            COPY "temp_{0}" (
                entity_id,
                record_id,
                confidence,
                target_record_id
            ) 
            FROM STDIN CSV'''.format(session_id), f)
        conn.commit()

    upd = text(''' 
        UPDATE "{0}" 
          SET entity_id = temp.entity_id, 
            confidence = temp.confidence, 
            clustered = FALSE,
            checked_out = FALSE,
            last_update = :last_update,
            target_record_id = temp.target_record_id
          FROM "temp_{1}" temp 
        WHERE "{0}".record_id = temp.record_id 
    '''.format(entity_table, session_id))
    ins = text('''
        INSERT INTO "{0}" (record_id, entity_id, confidence, clustered, checked_out, target_record_id) 
          SELECT 
            record_id, 
            entity_id, 
            confidence, 
            FALSE AS clustered, 
            FALSE AS checked_out,
            target_record_id
          FROM "temp_{1}" temp 
          LEFT JOIN (
            SELECT record_id 
            FROM "{0}"
            WHERE last_update = :last_update
          ) AS s USING(record_id) 
          WHERE s.record_id IS NULL
          RETURNING record_id
    '''.format(entity_table, session_id))
    last_update = datetime.now().replace(tzinfo=TIME_ZONE)
    with engine.begin() as c:
        c.execute(upd, last_update=last_update)
        c.execute(ins, last_update=last_update)
    temp_table.drop(bind=engine)
    os.remove(fname)
Example #29
def initializeEntityMap(session_id, fields):
    engine = worker_session.bind
    metadata = MetaData()
    create = '''
        CREATE TABLE "exact_match_{0}" AS (
          SELECT 
            s.record_id,
            UNNEST(s.members) as match
          FROM (
            SELECT 
              MIN(record_id) AS record_id, 
              (array_agg(record_id ORDER BY record_id))
                [2:array_upper(array_agg(record_id), 1)] AS members
            FROM "processed_{0}" 
            GROUP BY {1} 
            HAVING (array_length(array_agg(record_id), 1) > 1)
          ) AS s
        )
        '''.format(session_id, ', '.join(fields))
    with engine.begin() as conn:
        conn.execute('DROP TABLE IF EXISTS "exact_match_{0}"'.format(session_id))
        conn.execute(create)
    exact_table = Table('exact_match_{0}'.format(session_id), metadata,
                  autoload=True, autoload_with=engine, keep_existing=True)
    rows = worker_session.query(exact_table)
    entity_table = entity_map('entity_%s' % session_id, metadata)
    entity_table.drop(engine, checkfirst=True)
    entity_table.create(engine)
    s = StringIO()
    writer = UnicodeCSVWriter(s)
    now = datetime.now().replace(tzinfo=TIME_ZONE).isoformat()
    rows = sorted(rows, key=itemgetter(0))
    grouped = {}
    for k, g in groupby(rows, key=itemgetter(0)):
        rs = [r[1] for r in g]
        grouped[k] = rs
    for king,serfs in grouped.items():
        entity_id = unicode(uuid4())
        writer.writerow([
            king, 
            None, 
            entity_id, 
            1.0,
            'raw_{0}'.format(session_id),
            'TRUE',
            'FALSE',
            'exact',
            now,
        ])
        for serf in serfs:
            writer.writerow([
                serf,
                king,
                entity_id,
                1.0,
                'raw_{0}'.format(session_id),
                'TRUE',
                'FALSE',
                'exact',
                now,
            ])
    s.seek(0)
    conn = engine.raw_connection()
    cur = conn.cursor()
    cur.copy_expert('''
        COPY "entity_{0}" (
            record_id, 
            target_record_id, 
            entity_id, 
            confidence,
            source,
            clustered,
            checked_out,
            match_type,
            last_update
        ) 
        FROM STDIN CSV'''.format(session_id), s)
    conn.commit()
Example #30
    if a_number != b_number:
        return cmp(a_number,b_number)
    return cmp(a_subtype,b_subtype)

if __name__ == '__main__':
    if len(sys.argv) < 2:
        sys.exit('You must provide the filename for the CSV output as an argument to this script.')

    FILENAME = sys.argv[1]
    with open(FILENAME,"w") as f:
        connection = Connection()
        db = connection[config.LABELS_DB] 
        collection = db[config.LABELS_COLLECTION]

        labelset = collection.find_one({ 'dataset': 'SF1' })

        w = UnicodeCSVWriter(f)
        w.writerow(['table_code','table_desc','table_universe','table_size','col_code','col_desc','indent','parent','has_children','col_code_2000'])
        for table_code in sorted(labelset['tables'],cmp=compare_table_codes):
            t = labelset['tables'][table_code]
            row_base = [table_code,t['name'],t['universe'],t['size']]
            for label_code in sorted(t['labels']):
                l = t['labels'][label_code]
                row = row_base[:]
                if l['parent'] is None: parent = ''
                else: parent = l['parent']
                if l['key_2000'] is None: key_2000 = ''
                else: key_2000 = l['key_2000']
                row.extend([l['key'],l['text'],l['indent'],parent,l['has_children'],key_2000])
                w.writerow(row)
Example #31
#!/usr/bin/env python
import urllib2
from datetime import date, timedelta
from BeautifulSoup import BeautifulSoup
from csvkit.unicsv import UnicodeCSVWriter

# This creates the csv file using the csvkit module and writes to it, creating the header rows
outfile = open("nicar14sched.csv", "w")
w = UnicodeCSVWriter(outfile,delimiter=",",encoding="Cp1252")
w.writerow(['Subject','Start Date','Start Time','End Date','End Time','All Day Event','Description','Location','Private'])

private = False
all_day = False

#update the URL when you reuse the script next year
url = "http://www.ire.org/conferences/nicar-2014/schedule/"

#use urllib2 to send a request to the URL and gather the html response
response = urllib2.urlopen(url)
html = response.read()

#read the html and parse it using Beautiful soup
soup = BeautifulSoup(html)

#update the date of the conference
year = 2014
month = 2
adate = 26
the_date=date(year,month,adate)
d = timedelta(days=1)
Example #32
def writeCSV(fpath, output):
    with open(fpath, 'wb') as f:
        writer = UnicodeCSVWriter(f)
        writer.writerows(output)
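
A hedged usage sketch for the helper above (the path and rows are illustrative):

writeCSV('/tmp/example.csv',
         [[u'name', u'url'],
          [u'NICAR', u'http://www.ire.org/conferences/']])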
Example #33
def writeCSV(fpath, output):
    with open(fpath, 'wb') as f:
        writer = UnicodeCSVWriter(f)
        writer.writerows(output)
                insert = sql_table.insert()
                headers = t.headers()
                rows = [dict(zip(headers, row)) for row in t.to_rows()]
                for row in rows:
                    c.execute(str(insert), row)
                conn.commit()
            else:
                print 'Already saved report %s' % report_data['detail_url']
    c.execute('select date_filed from reports order by date_filed limit 1')
    oldest_year = parser.parse(c.fetchone()[0]).year
    c.execute(
        'select date_filed from reports order by date_filed desc limit 1')
    newest_year = parser.parse(c.fetchone()[0]).year
    c.execute('select * from reports limit 1')
    header = list(map(lambda x: x[0], c.description))
    for year in range(oldest_year, newest_year + 1):
        oldest_date = '%s-01-01' % year
        newest_date = '%s-12-31' % year
        c.execute(
            'select * from reports where date_filed >= ? and date_filed <= ?',
            (oldest_date, newest_date))
        rows = c.fetchall()
        outp = StringIO()
        writer = UnicodeCSVWriter(outp)
        writer.writerow(header)
        writer.writerows(rows)
        outp.seek(0)
        k.key = 'Reports/%s.csv' % year
        k.set_contents_from_file(outp)
        k.make_public()