def _transform(self):
    reader = UnicodeCSVReader(self.station_raw_info)
    header = ['wban_code', 'station_name', 'country', 'state', 'call_sign',
              'location', 'elevation', 'begin', 'end']
    reader.next()
    self.clean_station_info = StringIO()
    all_rows = []
    wbans = []
    for row in reader:
        if row[1] == '99999':
            continue
        elif row[1] in wbans:
            continue
        elif row[5] and row[6]:
            row.pop(0)
            row.pop(3)
            lat = row[5].replace('+', '')
            lon = row[6].replace('+', '')
            elev = row[7].replace('+', '')
            begin = parser.parse(row[8]).isoformat()
            end = parser.parse(row[9]).isoformat()
            row[5] = 'SRID=4326;POINT(%s %s)' % ((float(lon) / 1000), (float(lat) / 1000))
            row[6] = float(elev) / 10
            row[7] = begin
            row[8] = end
            row.pop()
            wbans.append(row[0])
            all_rows.append(row)
    writer = UnicodeCSVWriter(self.clean_station_info)
    writer.writerow(header)
    writer.writerows(all_rows)
    self.clean_station_info.seek(0)
def clean(f):
    reader = UnicodeCSVReader(f)
    good = []
    bad = []
    header = reader.next()
    for row in reader:
        try:
            row[0] = int(row[0])
            row[3] = int(row[3])
            row[5] = int(row[5])
            row[7] = int(row[7])
            row[4] = row[4].replace(',', '')
            if len(row) == 12:
                good.append(row)
            else:
                bad.append(row)
        except (TypeError, ValueError):
            bad.append(row)
    goodf = open('data/trips_cleaned.csv', 'wb')
    badf = open('data/trips_dirty.csv', 'wb')
    goodwriter = UnicodeCSVWriter(goodf)
    goodwriter.writerow(header)
    goodwriter.writerows(good)
    badwriter = UnicodeCSVWriter(badf)
    badwriter.writerow(header)
    badwriter.writerows(bad)
    goodf.close()
    badf.close()
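# Usage sketch (assumption): clean() expects an open file-like object holding the raw
# trips CSV; 'data/trips.csv' is a hypothetical input path. The cleaned and rejected
# rows are written to data/trips_cleaned.csv and data/trips_dirty.csv as a side effect.
with open('data/trips.csv', 'rb') as raw_trips:
    clean(raw_trips)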
def makeRawTable(contents):
    inp = StringIO(contents)
    reader = UnicodeCSVReader(inp)
    header = reader.next()
    header = [slugify(h) for h in header]
    outp = StringIO()
    writer = UnicodeCSVWriter(outp)
    writer.writerow(header)
    writer.writerows([[preProcess(unicode(i)) for i in r] for r in reader])
    outp.seek(0)
    conn = sqlite3.connect(':memory:')
    t = Table.from_csv(outp,
                       name='raw_table',
                       blanks_as_nulls=False,
                       infer_types=False)
    sql_table = make_table(t)
    create_st = make_create_table_statement(sql_table)
    parts = create_st.split('raw_table (')
    create_st = '{0} raw_table ( record_id INTEGER PRIMARY KEY,{1}'.format(*parts)
    insert = sql_table.insert()
    curs = conn.cursor()
    curs.execute(create_st)
    rows = [dict(zip(header, row)) for row in t.to_rows()]
    for row in rows:
        curs.execute(str(insert), row)
    dump = StringIO()
    for line in conn.iterdump():
        dump.write(unidecode(line))
    dump.seek(0)
    return dump.getvalue(), header
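# Usage sketch (assumption): makeRawTable() takes raw CSV text and returns a SQLite
# dump string plus the slugified header; the input file name below is hypothetical.
with open('incoming.csv', 'rb') as f:
    dump_sql, columns = makeRawTable(f.read())
print columns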
def __init__(self, f, encoding='utf-8', line_numbers=False, **kwargs):
    self.row_count = 0
    self.line_numbers = line_numbers
    UnicodeCSVWriter.__init__(self, f, encoding, lineterminator='\n', **kwargs)
def writerow(self, row):
    if self.line_numbers:
        row = list(row)
        self._append_line_number(row)
    # Convert embedded Mac line endings to unix style line endings so they get quoted
    row = [i.replace('\r', '\n') if isinstance(i, basestring) else i for i in row]
    UnicodeCSVWriter.writerow(self, row)
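# Context sketch (assumption): the __init__ and writerow methods above read like parts
# of a UnicodeCSVWriter subclass that can prepend a running line number to each row.
# The class name and the body of the _append_line_number helper are hypothetical,
# written only to show how the two methods might fit together.
from csvkit.unicsv import UnicodeCSVWriter

class LineNumberCSVWriter(UnicodeCSVWriter):
    def __init__(self, f, encoding='utf-8', line_numbers=False, **kwargs):
        self.row_count = 0
        self.line_numbers = line_numbers
        UnicodeCSVWriter.__init__(self, f, encoding, lineterminator='\n', **kwargs)

    def _append_line_number(self, row):
        # Hypothetical helper: label the header row, number the data rows.
        if self.row_count == 0:
            row.insert(0, 'line_number')
        else:
            row.insert(0, self.row_count)
        self.row_count += 1

    def writerow(self, row):
        if self.line_numbers:
            row = list(row)
            self._append_line_number(row)
        # Convert embedded Mac line endings to unix style line endings so they get quoted
        row = [i.replace('\r', '\n') if isinstance(i, basestring) else i for i in row]
        UnicodeCSVWriter.writerow(self, row)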
def write_csv(filename, data, *headers):
    f = open(filename, 'w')
    writer = UnicodeCSVWriter(f)
    writer.writerow(headers)
    for datum in data:
        row = []
        for h in headers:
            row.append(datum.get(h, ''))
        writer.writerow(row)
    # close to flush buffered rows to disk
    f.close()
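# Usage sketch (assumption): write_csv() pulls the named keys out of each dict and
# writes one row per dict; the file name and field names below are hypothetical.
people = [
    {'name': 'Ada', 'city': 'Reykjavik'},
    {'name': 'Grace'},   # missing keys fall back to ''
]
write_csv('people.csv', people, 'name', 'city')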
def _transform_daily(self, raw_weather, file_type, start_line=0, end_line=None):
    raw_weather.seek(0)
    reader = UnicodeCSVReader(raw_weather)
    header = reader.next()
    header = [x.strip() for x in header]
    self.clean_observations_daily = StringIO()
    writer = UnicodeCSVWriter(self.clean_observations_daily)
    out_header = ["wban_code", "date", "temp_max", "temp_min",
                  "temp_avg", "departure_from_normal",
                  "dewpoint_avg", "wetbulb_avg", "weather_types",
                  "snowice_depth", "snowice_waterequiv",
                  "snowfall", "precip_total", "station_pressure",
                  "sealevel_pressure",
                  "resultant_windspeed", "resultant_winddirection",
                  "resultant_winddirection_cardinal",
                  "avg_windspeed",
                  "max5_windspeed", "max5_winddirection",
                  "max5_winddirection_cardinal",
                  "max2_windspeed", "max2_winddirection",
                  "max2_winddirection_cardinal"]
    writer.writerow(out_header)
    row_count = 0
    for row in reader:
        self.current_row = row
        if (row_count % 100 == 0):
            if (self.debug == True):
                self.debug_outfile.write("\rdaily parsing: row_count=%06d" % row_count)
                self.debug_outfile.flush()
        if (start_line > row_count):
            row_count += 1
            continue
        if ((end_line is not None) and (row_count > end_line)):
            break
        row_count += 1
        #print len(header)
        #print len(row)
        #print zip(header,row)
        if (len(row) == 0):
            continue
        row_vals = getattr(self, '_parse_%s_row_daily' % file_type)(row, header)
        writer.writerow(row_vals)
    return self.clean_observations_daily
def _transform_hourly(self, raw_weather, file_type, start_line=0, end_line=None):
    raw_weather.seek(0)
    reader = UnicodeCSVReader(raw_weather)
    header = reader.next()
    # strip leading and trailing whitespace from header (e.g. from tarfiles)
    header = [x.strip() for x in header]
    self.clean_observations_hourly = StringIO()
    writer = UnicodeCSVWriter(self.clean_observations_hourly)
    out_header = ["wban_code", "datetime", "old_station_type", "station_type",
                  "sky_condition", "sky_condition_top", "visibility",
                  "weather_types", "drybulb_fahrenheit", "wetbulb_fahrenheit",
                  "dewpoint_fahrenheit", "relative_humidity",
                  "wind_speed", "wind_direction", "wind_direction_cardinal",
                  "station_pressure", "sealevel_pressure", "report_type",
                  "hourly_precip"]
    writer.writerow(out_header)
    row_count = 0
    for row in reader:
        if (row_count % 1000 == 0):
            if (self.debug == True):
                self.debug_outfile.write("\rparsing: row_count=%06d" % row_count)
                self.debug_outfile.flush()
        if (start_line > row_count):
            row_count += 1
            continue
        if ((end_line is not None) and (row_count > end_line)):
            break
        row_count += 1
        if (len(row) == 0):
            continue
        # this calls either self._parse_zipfile_row_hourly
        # or self._parse_tarfile_row_hourly
        row_vals = getattr(self, '_parse_%s_row_hourly' % file_type)(row, header)
        if (not row_vals):
            continue
        writer.writerow(row_vals)
    return self.clean_observations_hourly
def write_table_data(flo, state_fips, sumlev, table_id):
    """Given a File-Like Object, write a table to it"""
    w = UnicodeCSVWriter(flo)
    metadata = fetch_table_label(table_id)
    header = ['GEOID', 'SUMLEV'] + METADATA_HEADERS + ['POP100.2000', 'HU100.2000']
    for key in sorted(metadata['labels']):
        header.extend([key, "%s.2000" % key])
    w.writerow(header)
    query = {'sumlev': sumlev, 'metadata.STATE': state_fips}
    collection = utils.get_geography_collection()
    for geography in collection.find(query):
        row = [geography['geoid'], geography['sumlev']]
        for h in METADATA_HEADERS:
            row.append(geography['metadata'][h])
        pop2000, hu2000 = get_2000_top_level_counts(geography)
        row.extend([pop2000, hu2000])
        for key in sorted(metadata['labels']):
            try:
                row.append(geography['data']['2010'][table_id][key])
            except KeyError, e:
                if table_id.startswith('PCO'):
                    print "No data for %s at %s" % (table_id, sumlev)
                    return
                raise e  # don't otherwise expect this error, so raise it...
            try:
                row.append(geography['data']['2000'][table_id][key])
            except KeyError:
                row.append('')
        w.writerow(row)
def dedupeCanon(session_id, threshold=0.25):
    dd = worker_session.query(DedupeSession).get(session_id)
    engine = worker_session.bind
    metadata = MetaData()
    writeCanonRep(session_id)
    writeProcessedTable(session_id,
                        proc_table_format='processed_{0}_cr',
                        raw_table_format='cr_{0}')
    entity_table_name = 'entity_{0}_cr'.format(session_id)
    entity_table = entity_map(entity_table_name, metadata, record_id_type=String)
    entity_table.drop(bind=engine, checkfirst=True)
    entity_table.create(bind=engine)
    block_gen = blockDedupe(session_id,
                            table_name='processed_{0}_cr'.format(session_id),
                            entity_table_name='entity_{0}_cr'.format(session_id),
                            canonical=True)
    writeBlockingMap(session_id, block_gen, canonical=True)
    clustered_dupes = clusterDedupe(session_id, canonical=True, threshold=threshold)
    if clustered_dupes:
        fname = '/tmp/clusters_{0}.csv'.format(session_id)
        with open(fname, 'wb') as f:
            writer = UnicodeCSVWriter(f)
            for ids, scores in clustered_dupes:
                new_ent = unicode(uuid4())
                writer.writerow([
                    new_ent,
                    ids[0],
                    scores[0],
                    None,
                    False,
                    False,
                ])
                for id, score in zip(ids[1:], scores):
                    writer.writerow([
                        new_ent,
                        id,
                        score,
                        ids[0],
                        False,
                        False,
                    ])
        with open(fname, 'rb') as f:
            conn = engine.raw_connection()
            cur = conn.cursor()
            try:
                cur.copy_expert('''
                    COPY "entity_{0}_cr" (
                        entity_id,
                        record_id,
                        confidence,
                        target_record_id,
                        clustered,
                        checked_out
                    )
                    FROM STDIN CSV'''.format(session_id), f)
                conn.commit()
                os.remove(fname)
            except Exception, e:  # pragma: no cover
                conn.rollback()
                raise e
def writeCanonRep(session_id, name_pattern='cr_{0}'):
    engine = worker_session.bind
    metadata = MetaData()
    entity = Table('entity_{0}'.format(session_id), metadata,
                   autoload=True, autoload_with=engine, keep_existing=True)
    proc_table = Table('processed_{0}'.format(session_id), metadata,
                       autoload=True, autoload_with=engine, keep_existing=True)
    cr_cols = [Column('record_id', String, primary_key=True)]
    for col in proc_table.columns:
        if col.name != 'record_id':
            cr_cols.append(Column(col.name, col.type))
    cr = Table(name_pattern.format(session_id), metadata, *cr_cols)
    cr.drop(bind=engine, checkfirst=True)
    cr.create(bind=engine)
    cols = [entity.c.entity_id]
    col_names = [c for c in proc_table.columns.keys() if c != 'record_id']
    for name in col_names:
        cols.append(label(name, func.array_agg(getattr(proc_table.c, name))))
    rows = worker_session.query(*cols)\
        .filter(entity.c.record_id == proc_table.c.record_id)\
        .group_by(entity.c.entity_id)
    names = cr.columns.keys()
    with open('/tmp/{0}.csv'.format(name_pattern.format(session_id)), 'wb') as f:
        writer = UnicodeCSVWriter(f)
        writer.writerow(names)
        for row in rows:
            r = [row.entity_id]
            dicts = [dict(**{n: None for n in col_names}) for i in range(len(row[1]))]
            for idx, dct in enumerate(dicts):
                for name in col_names:
                    dicts[idx][name] = unicode(getattr(row, name)[idx])
            canon_form = dedupe.canonicalize(dicts)
            r.extend([canon_form[k] for k in names if canon_form.get(k) is not None])
            writer.writerow(r)
    canon_table_name = name_pattern.format(session_id)
    copy_st = 'COPY "{0}" ('.format(canon_table_name)
    for idx, name in enumerate(names):
        if idx < len(names) - 1:
            copy_st += '"{0}", '.format(name)
        else:
            copy_st += '"{0}")'.format(name)
    else:
        copy_st += "FROM STDIN WITH (FORMAT CSV, HEADER TRUE, DELIMITER ',', NULL ' ')"
    conn = engine.raw_connection()
    cur = conn.cursor()
    with open('/tmp/{0}.csv'.format(name_pattern.format(session_id)), 'rb') as f:
        cur.copy_expert(copy_st, f)
    conn.commit()
#!/usr/bin/env python
# mechanize acts as a browser to collect the html response
from mechanize import Browser
# BeautifulSoup lets you strip out the html and parse it through its tree
from BeautifulSoup import BeautifulSoup
# csvkit allows you to output to a csv file easily
from csvkit.unicsv import UnicodeCSVWriter
# re handles regular expressions
import re

# open a csv file to write to, set a delimiter and write the header row
outfile = open("sitesdirt.csv", "w")
w = UnicodeCSVWriter(outfile, delimiter=",", encoding="Cp1252")
w.writerow(['name', 'url'])

mech = Browser()
url = "http://www.state.nj.us/nj/govinfo/county/localgov.html"
page = mech.open(url)
html = page.read()
soup = BeautifulSoup(html)

# look for the section with the id anchorSection, this is the main body of the url listings
for row in soup.findAll('div', {"id": "anchorSection"}):
    # ignore the rows with anchor tags without an href attribute
    for anchor in row.findAll('a', href=True):
        name = anchor.string
        # give me whatever is in the href call, the actual url of the link
        url = anchor['href'].decode()
        record = (name, url)
        w.writerow(record)
reader = UnicodeCSVDictReader(inp)
comm_ids = [i['id'] for i in list(reader)]
candidate_pattern = '/CommitteeDetailCandidates.aspx?id=%s'
cand_scraper = CandidateScraper(url_pattern=candidate_pattern)
cand_scraper.cache_storage = scrapelib.cache.FileCache('/cache/cache')
cand_scraper.cache_write_only = False
for comm_id in comm_ids:
    for cand in cand_scraper.scrape_one(comm_id):
        if cand:
            cand['CommitteeID'] = comm_id
            insert = 'insert into candidates("ID", "FullName", "FullAddress", \
                "PartyName", "OfficeName", "CommitteeID") values (:ID, :FullName, :FullAddress, \
                :PartyName, :OfficeName, :CommitteeID)'
            c.execute(insert, cand)
            conn.commit()
        else:
            print 'Got a 500 for %s' % comm_id
c.execute('select * from candidates')
header = list(map(lambda x: x[0], c.description))
cands = c.fetchall()
outp = StringIO()
writer = UnicodeCSVWriter(outp)
writer.writerow(header)
writer.writerows(cands)
outp.seek(0)
k.key = 'Candidates.csv'
k.set_contents_from_file(outp)
k.make_public()
    return cmp(a_subtype, b_subtype)


if __name__ == '__main__':
    if len(sys.argv) < 2:
        sys.exit('You must provide the filename for the CSV output as an argument to this script.')
    FILENAME = sys.argv[1]
    with open(FILENAME, "w") as f:
        collection = utils.get_label_collection()
        labelset = collection.find_one({'dataset': 'SF1'})
        w = UnicodeCSVWriter(f)
        w.writerow(['table_code', 'table_desc', 'table_universe', 'table_size',
                    'col_code', 'col_desc', 'indent', 'parent', 'has_children',
                    'col_code_2000'])
        for table_code in sorted(labelset['tables'], cmp=compare_table_codes):
            t = labelset['tables'][table_code]
            row_base = [table_code, t['name'], t['universe'], t['size']]
            for label_code in sorted(t['labels']):
                l = t['labels'][label_code]
                row = row_base[:]
                if l['parent'] is None:
                    parent = ''
                else:
                    parent = l['parent']
                if l['key_2000'] is None:
                    key_2000 = ''
                else:
                    key_2000 = l['key_2000']
def getMatchingReady(session_id):
    addRowHash(session_id)
    cleanupTables(session_id)
    engine = worker_session.bind
    with engine.begin() as conn:
        conn.execute('DROP TABLE IF EXISTS "match_blocks_{0}"'\
            .format(session_id))
        conn.execute('''
            CREATE TABLE "match_blocks_{0}" (
                block_key VARCHAR,
                record_id BIGINT
            )
            '''.format(session_id))
    sess = worker_session.query(DedupeSession).get(session_id)
    field_defs = json.loads(sess.field_defs)
    # Save Gazetteer settings
    d = dedupe.Gazetteer(field_defs)
    # Disabling canopy based predicates for now
    for definition in d.data_model.primary_fields:
        for idx, predicate in enumerate(definition.predicates):
            if predicate.type == 'TfidfPredicate':
                definition.predicates.pop(idx)
    d.readTraining(StringIO(sess.training_data))
    d.train()
    g_settings = StringIO()
    d.writeSettings(g_settings)
    g_settings.seek(0)
    sess.gaz_settings_file = g_settings.getvalue()
    worker_session.add(sess)
    worker_session.commit()
    # Write match_block table
    model_fields = list(set([f['field'] for f in field_defs]))
    fields = ', '.join(['p.{0}'.format(f) for f in model_fields])
    sel = '''
        SELECT p.record_id, {0}
        FROM "processed_{1}" AS p
        LEFT JOIN "exact_match_{1}" AS e
            ON p.record_id = e.match
        WHERE e.record_id IS NULL;
        '''.format(fields, session_id)
    conn = engine.connect()
    rows = conn.execute(sel)
    data = ((getattr(row, 'record_id'), dict(zip(model_fields, row[1:]))) \
        for row in rows)
    block_gen = d.blocker(data)
    s = StringIO()
    writer = UnicodeCSVWriter(s)
    writer.writerows(block_gen)
    conn.close()
    s.seek(0)
    conn = engine.raw_connection()
    curs = conn.cursor()
    try:
        curs.copy_expert('COPY "match_blocks_{0}" FROM STDIN CSV'\
            .format(session_id), s)
        conn.commit()
    except Exception, e:  # pragma: no cover
        conn.rollback()
        raise e
        row[7] = res[0]
        row[6] = res[1]
        yield row


def make_db(fname, tblname):
    conn = sqlite3.connect(':memory:')
    t = Table.from_csv(open(fname, 'rb'), name=tblname)
    sql_table = make_table(t)
    create_st = make_create_table_statement(sql_table)
    print create_st
    insert = sql_table.insert()
    curs = conn.cursor()
    curs.execute(create_st)
    headers = t.headers()
    print headers
    rows = [dict(zip(headers, row)) for row in t.to_rows()]
    for row in rows:
        curs.execute(str(insert), row)
    return curs


if __name__ == '__main__':
    curs = make_db('macoupin-budget-update/moucoupin-budget-department-desc.csv', 'description')
    outp = open('macoupin-budget-update/macoupin-budget-2014-update.csv', 'wb')
    writer = UnicodeCSVWriter(outp)
    with open('macoupin-budget-update/macoupin-budget.csv', 'rb') as f:
        reader = UnicodeCSVReader(f)
        headers = reader.next()
        headers.insert(1, 'Fund ID')
        writer.writerow(headers)
        writer.writerows(add_attrs(reader, curs))
def _init(conference, the_date, url):
    # update the date of the conference
    year_num = str(the_date.year)[2:]
    output_file = conference + year_num + "sched.csv"

    # This creates the csv file using the csvkit module and writes to it, creating the header rows
    outfile = open(output_file, "w")
    w = UnicodeCSVWriter(outfile, delimiter=",", encoding="utf-8")
    w.writerow([
        "Topic",
        "Subject",
        "Start Date",
        "Start Time",
        "End Date",
        "End Time",
        "All Day Event",
        "Description",
        "Location",
        "Private",
    ])
    private = False
    all_day = False

    # use urllib2 to send a request to the URL and gather the html response
    response = urllib2.urlopen(url)
    html = response.read()
    # read the html and parse it using Beautiful soup
    soup = BeautifulSoup(html)

    # The first day of the conference is a Wednesday, or 2, since the list starts counting at 0.
    day = 2
    days = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]
    d = timedelta(days=1)

    # find the "ul class 'listview pane'" which wraps around each day's schedule and parse the items in it.
    for row in soup.findAll("ul", {"class": "listview pane"}):
        for row in row.findAll("h3", {"class": "title3"}):
            name = row.find("a").string
            page = row.find("a").get("href")
            url = "http://ire.org" + page
            topic = tag_session_with_topic(name)
            speaker = name.findNext("p")
            descall = speaker.findNext("p")
            desc = descall.findNext("p").contents
            newdesc = ""
            for item in desc:
                newdesc += item.string
            desc = newdesc
            subtree = speaker.strong
            if subtree == None:
                speaker2 = None
            else:
                subtree.extract()
                speaker2 = speaker.string
                speaker2 = speaker2.strip()
            try:
                speaker2 = "Speakers: " + speaker2
            except:
                speaker2 = "Speakers TBA"
            place = row.findNext("div", {"class": "col-15 meta"}).p.string
            time = place.findNext("p").string
            if time == desc:
                desc = None
            else:
                desc = desc
            mytime = time.split("-")
            start_time = mytime[0].strip()
            if len(start_time.split()[0]) < 3:
                start_time = start_time.split()[0] + ":00:00 " + start_time.split()[1]
            else:
                start_time = start_time
            end_time = mytime[1].strip()
            if len(end_time.split()[0]) < 3:
                end_time = end_time.split()[0] + ":00:00 " + end_time.split()[1]
            else:
                end_time = end_time
            dayofweek = str(the_date)
            if desc != None and speaker2 != "Speakers: TBA":
                desc = speaker2 + " - " + desc
            elif desc != None:
                desc = desc
            else:
                desc = speaker2
            desc = desc + " | URL: " + url
            record = (topic, name, the_date, start_time, the_date, end_time, all_day, desc, place, private)
            # write the record for the single class to the csv
            w.writerow(record)
        # at the end of each day's ul item, add 1 to the day of the week and loop through it again.
        the_date = the_date + d

    # always remember to close the file at the end to save it properly
    outfile.close()
def main():
    repo_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "../"))
    products_filename = os.path.join(repo_root, "data/products.csv")
    stock_filename = os.path.join(repo_root, "data/stock.csv")

    print "Connecting to S3 bucket."
    conn = S3Connection()
    bucket = conn.get_bucket("flother")

    print "Cleaning products."
    products = bucket.get_key("vinbudin/data/Product.csv", validate=False)
    product_rows = UnicodeCSVDictReader(io.BytesIO(products.read()))
    with open(products_filename, "wb") as fh:
        products_output = UnicodeCSVWriter(fh, lineterminator="\n")
        products_output.writerow(PRODUCT_FIELDS)
        for row in sorted(product_rows, key=lambda r: r["id"]):
            products_output.writerow([row[key] for key in PRODUCT_FIELDS])

    print "Cleaning stock."
    stock = bucket.get_key("vinbudin/data/Stock.csv", validate=False)
    stock_rows = UnicodeCSVDictReader(io.BytesIO(stock.read()))
    with open(stock_filename, "wb") as fh:
        stock_output = UnicodeCSVWriter(fh, lineterminator="\n")
        stock_output.writerow(STOCK_FIELDS)
        for row in sorted(stock_rows, key=lambda r: (int(r["product_id"]), r["store"])):
            stock_output.writerow([row[key] for key in STOCK_FIELDS])
    conn.close()
    print "Finished downloading from S3."

    repo = Repo(repo_root)
    repo.git.reset()
    repo.remotes.origin.pull()
    if repo.is_dirty():
        print "Changes to commit."
        repo.index.add([products_filename, stock_filename])
        repo.index.commit("Add latest inventory data")
        print "Committed locally."
        repo.remotes.origin.push()
        print "Pushed to origin."
    else:
        print "No changes to commit."
def writeBlockingMap(session_id, block_data, canonical=False):
    pk_type = Integer
    if canonical:
        session_id = '{0}_cr'.format(session_id)
        pk_type = String
    metadata = MetaData()
    engine = worker_session.bind
    bkm = Table('block_{0}'.format(session_id), metadata,
                Column('block_key', Text),
                Column('record_id', pk_type))
    bkm.drop(engine, checkfirst=True)
    bkm.create(engine)
    with open('/tmp/{0}.csv'.format(session_id), 'wb') as s:
        writer = UnicodeCSVWriter(s)
        writer.writerows(block_data)
    conn = engine.raw_connection()
    cur = conn.cursor()
    with open('/tmp/{0}.csv'.format(session_id), 'rb') as s:
        cur.copy_expert('COPY "block_{0}" FROM STDIN CSV'.format(session_id), s)
    conn.commit()
    os.remove('/tmp/{0}.csv'.format(session_id))
    block_key_idx = Index('bk_{0}_idx'.format(session_id), bkm.c.block_key)
    block_key_idx.create(engine)
    plural_key = Table('plural_key_{0}'.format(session_id), metadata,
                       Column('block_key', Text),
                       Column('block_id', Integer, primary_key=True))
    plural_key.drop(engine, checkfirst=True)
    plural_key.create(engine)
    bkm_sel = select([bkm.c.block_key], from_obj=bkm)\
        .group_by(bkm.c.block_key)\
        .having(func.count(bkm.c.block_key) > 1)
    pl_ins = plural_key.insert()\
        .from_select([plural_key.c.block_key], bkm_sel)
    with engine.begin() as c:
        c.execute(pl_ins)
    pl_key_idx = Index('pk_{0}_idx'.format(session_id), plural_key.c.block_key)
    pl_key_idx.create(engine)
    with engine.begin() as c:
        c.execute('DROP TABLE IF EXISTS "plural_block_{0}"'.format(session_id))
    pl_bk_stmt = '''
        CREATE TABLE "plural_block_{0}" AS (
            SELECT p.block_id, b.record_id
            FROM "block_{0}" AS b
            INNER JOIN "plural_key_{0}" AS p
            USING (block_key)
        )'''.format(session_id)
    with engine.begin() as c:
        c.execute(pl_bk_stmt)
    with engine.begin() as c:
        c.execute('''
            CREATE INDEX "pl_bk_idx_{0}"
            ON "plural_block_{0}" (record_id)'''.format(session_id))
    with engine.begin() as c:
        c.execute('DROP INDEX IF EXISTS "pl_bk_id_idx_{0}"'.format(session_id))
    with engine.begin() as c:
        c.execute('''
            CREATE UNIQUE INDEX "pl_bk_id_idx_{0}"
            ON "plural_block_{0}" (block_id, record_id)
            '''.format(session_id))
    cov_bks_stmt = '''
        CREATE TABLE "covered_{0}" AS (
            SELECT record_id,
                   string_agg(CAST(block_id AS TEXT), ',' ORDER BY block_id)
                       AS sorted_ids
            FROM "plural_block_{0}"
            GROUP BY record_id
        )
        '''.format(session_id)
    with engine.begin() as c:
        c.execute('DROP TABLE IF EXISTS "covered_{0}"'.format(session_id))
    with engine.begin() as c:
        c.execute(cov_bks_stmt)
    with engine.begin() as c:
        c.execute('''
            CREATE UNIQUE INDEX "cov_bks_id_idx_{0}" ON "covered_{0}" (record_id)
            '''.format(session_id))
    with engine.begin() as c:
        c.execute('DROP TABLE IF EXISTS "small_cov_{0}"'.format(session_id))
    small_cov = '''
        CREATE TABLE "small_cov_{0}" AS (
            SELECT record_id,
                   block_id,
                   TRIM(',' FROM split_part(sorted_ids,
                                            CAST(block_id AS TEXT),
                                            1)) AS smaller_ids
            FROM "plural_block_{0}"
            INNER JOIN "covered_{0}"
            USING (record_id)
        )
        '''.format(session_id)
    with engine.begin() as c:
        c.execute(small_cov)
    with engine.begin() as c:
        c.execute('''
            CREATE INDEX "sc_idx_{0}"
            ON "small_cov_{0}" (record_id)'''.format(session_id))
    with engine.begin() as c:
        c.execute('''
            CREATE INDEX "sc_bk_idx_{0}"
            ON "small_cov_{0}" (block_id)'''.format(session_id))
def updateEntityMap(clustered_dupes, session_id, raw_table=None, entity_table=None):
    """
    Add to entity map table after training
    """
    fname = '/tmp/clusters_{0}.csv'.format(session_id)
    with open(fname, 'wb') as f:
        writer = UnicodeCSVWriter(f)
        for ids, scores in clustered_dupes:
            new_ent = unicode(uuid4())
            writer.writerow([
                new_ent,
                ids[0],
                scores[0],
                None,
            ])
            for id, score in zip(ids[1:], scores[1:]):
                writer.writerow([
                    new_ent,
                    id,
                    score,
                    ids[0],
                ])
    engine = worker_session.bind
    metadata = MetaData()
    if not entity_table:
        entity_table = 'entity_{0}'.format(session_id)
    entity = Table(entity_table, metadata,
                   autoload=True, autoload_with=engine, keep_existing=True)
    record_id_type = entity.c.record_id.type
    temp_table = Table('temp_{0}'.format(session_id), metadata,
                       Column('entity_id', String),
                       Column('record_id', record_id_type),
                       Column('target_record_id', record_id_type),
                       Column('confidence', Float))
    temp_table.drop(bind=engine, checkfirst=True)
    temp_table.create(bind=engine)
    with open(fname, 'rb') as f:
        conn = engine.raw_connection()
        cur = conn.cursor()
        cur.copy_expert('''
            COPY "temp_{0}" (
                entity_id,
                record_id,
                confidence,
                target_record_id
            )
            FROM STDIN CSV'''.format(session_id), f)
        conn.commit()
    upd = text('''
        UPDATE "{0}"
        SET entity_id = temp.entity_id,
            confidence = temp.confidence,
            clustered = FALSE,
            checked_out = FALSE,
            last_update = :last_update,
            target_record_id = temp.target_record_id
        FROM "temp_{1}" temp
        WHERE "{0}".record_id = temp.record_id
        '''.format(entity_table, session_id))
    ins = text('''
        INSERT INTO "{0}"
            (record_id, entity_id, confidence, clustered, checked_out, target_record_id)
        SELECT
            record_id,
            entity_id,
            confidence,
            FALSE AS clustered,
            FALSE AS checked_out,
            target_record_id
        FROM "temp_{1}" temp
        LEFT JOIN (
            SELECT record_id
            FROM "{0}"
            WHERE last_update = :last_update
        ) AS s USING(record_id)
        WHERE s.record_id IS NULL
        RETURNING record_id
        '''.format(entity_table, session_id))
    last_update = datetime.now().replace(tzinfo=TIME_ZONE)
    with engine.begin() as c:
        c.execute(upd, last_update=last_update)
        c.execute(ins, last_update=last_update)
    temp_table.drop(bind=engine)
    os.remove(fname)
def initializeEntityMap(session_id, fields):
    engine = worker_session.bind
    metadata = MetaData()
    create = '''
        CREATE TABLE "exact_match_{0}" AS (
          SELECT
            s.record_id,
            UNNEST(s.members) as match
          FROM (
            SELECT
              MIN(record_id) AS record_id,
              (array_agg(record_id ORDER BY record_id))
                [2:array_upper(array_agg(record_id), 1)] AS members
            FROM "processed_{0}"
            GROUP BY {1}
            HAVING (array_length(array_agg(record_id), 1) > 1)
          ) AS s
        )
        '''.format(session_id, ', '.join(fields))
    with engine.begin() as conn:
        conn.execute('DROP TABLE IF EXISTS "exact_match_{0}"'.format(session_id))
        conn.execute(create)
    exact_table = Table('exact_match_{0}'.format(session_id), metadata,
                        autoload=True, autoload_with=engine, keep_existing=True)
    rows = worker_session.query(exact_table)
    entity_table = entity_map('entity_%s' % session_id, metadata)
    entity_table.drop(engine, checkfirst=True)
    entity_table.create(engine)
    s = StringIO()
    writer = UnicodeCSVWriter(s)
    now = datetime.now().replace(tzinfo=TIME_ZONE).isoformat()
    rows = sorted(rows, key=itemgetter(0))
    grouped = {}
    for k, g in groupby(rows, key=itemgetter(0)):
        rs = [r[1] for r in g]
        grouped[k] = rs
    for king, serfs in grouped.items():
        entity_id = unicode(uuid4())
        writer.writerow([
            king,
            None,
            entity_id,
            1.0,
            'raw_{0}'.format(session_id),
            'TRUE',
            'FALSE',
            'exact',
            now,
        ])
        for serf in serfs:
            writer.writerow([
                serf,
                king,
                entity_id,
                1.0,
                'raw_{0}'.format(session_id),
                'TRUE',
                'FALSE',
                'exact',
                now,
            ])
    s.seek(0)
    conn = engine.raw_connection()
    cur = conn.cursor()
    cur.copy_expert('''
        COPY "entity_{0}" (
            record_id,
            target_record_id,
            entity_id,
            confidence,
            source,
            clustered,
            checked_out,
            match_type,
            last_update
        )
        FROM STDIN CSV'''.format(session_id), s)
    conn.commit()
    if a_number != b_number:
        return cmp(a_number, b_number)
    return cmp(a_subtype, b_subtype)


if __name__ == '__main__':
    if len(sys.argv) < 2:
        sys.exit('You must provide the filename for the CSV output as an argument to this script.')
    FILENAME = sys.argv[1]
    with open(FILENAME, "w") as f:
        connection = Connection()
        db = connection[config.LABELS_DB]
        collection = db[config.LABELS_COLLECTION]
        labelset = collection.find_one({'dataset': 'SF1'})
        w = UnicodeCSVWriter(f)
        w.writerow(['table_code', 'table_desc', 'table_universe', 'table_size',
                    'col_code', 'col_desc', 'indent', 'parent', 'has_children',
                    'col_code_2000'])
        for table_code in sorted(labelset['tables'], cmp=compare_table_codes):
            t = labelset['tables'][table_code]
            row_base = [table_code, t['name'], t['universe'], t['size']]
            for label_code in sorted(t['labels']):
                l = t['labels'][label_code]
                row = row_base[:]
                if l['parent'] is None:
                    parent = ''
                else:
                    parent = l['parent']
                if l['key_2000'] is None:
                    key_2000 = ''
                else:
                    key_2000 = l['key_2000']
                row.extend([l['key'], l['text'], l['indent'], parent,
                            l['has_children'], key_2000])
                w.writerow(row)
#!/usr/bin/env python
import urllib2
from datetime import date, timedelta
from BeautifulSoup import BeautifulSoup
from csvkit.unicsv import UnicodeCSVWriter

# This creates the csv file using the csvkit module and writes to it, creating the header rows
outfile = open("nicar14sched.csv", "w")
w = UnicodeCSVWriter(outfile, delimiter=",", encoding="Cp1252")
w.writerow(['Subject', 'Start Date', 'Start Time', 'End Date', 'End Time',
            'All Day Event', 'Description', 'Location', 'Private'])
private = False
all_day = False

# update the URL when you reuse the script next year
url = "http://www.ire.org/conferences/nicar-2014/schedule/"

# use urllib2 to send a request to the URL and gather the html response
response = urllib2.urlopen(url)
html = response.read()
# read the html and parse it using Beautiful soup
soup = BeautifulSoup(html)

# update the date of the conference
year = 2014
month = 2
adate = 26
the_date = date(year, month, adate)
d = timedelta(days=1)
def writeCSV(fpath, output):
    with open(fpath, 'wb') as f:
        writer = UnicodeCSVWriter(f)
        writer.writerows(output)
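# Usage sketch (assumption): writeCSV() expects an iterable of rows (lists of unicode
# values); the output path and sample rows below are hypothetical.
rows = [
    [u'wban_code', u'station_name'],
    [u'94846', u'CHICAGO OHARE INTERNATIONAL'],
]
writeCSV('/tmp/stations.csv', rows)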
        insert = sql_table.insert()
        headers = t.headers()
        rows = [dict(zip(headers, row)) for row in t.to_rows()]
        for row in rows:
            c.execute(str(insert), row)
        conn.commit()
    else:
        print 'Already saved report %s' % report_data['detail_url']

c.execute('select date_filed from reports order by date_filed limit 1')
oldest_year = parser.parse(c.fetchone()[0]).year
c.execute('select date_filed from reports order by date_filed desc limit 1')
newest_year = parser.parse(c.fetchone()[0]).year
c.execute('select * from reports limit 1')
header = list(map(lambda x: x[0], c.description))
for year in range(oldest_year, newest_year + 1):
    oldest_date = '%s-01-01' % year
    newest_date = '%s-12-31' % year
    c.execute('select * from reports where date_filed >= ? and date_filed <= ?',
              (oldest_date, newest_date))
    rows = c.fetchall()
    outp = StringIO()
    writer = UnicodeCSVWriter(outp)
    writer.writerow(header)
    writer.writerows(rows)
    outp.seek(0)
    k.key = 'Reports/%s.csv' % year
    k.set_contents_from_file(outp)
    k.make_public()