def msa_iter():
    # Parse Metro/Micropolitan Statistical Area records out of the
    # fixed-width Census List1.txt delineation file.
    msa_re = re.compile(r"^(\d{5})\s{19}(.*) (Metro|Micro)politan Statistical Area$")
    txt_path = settings.dataset_path('locations', filename='List1.txt')
    for line in (l.strip() for l in open(txt_path)):
        m = msa_re.match(line)
        if m is not None:
            yield dict(zip(HEADERS, m.groups()[:2]))
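# Usage sketch (names assumed for illustration -- HEADERS is defined
# elsewhere in this module and presumably names the two captured groups,
# the 5-digit CBSA code and the area title):
#
#     HEADERS = ('code', 'name')   # assumed definition, not shown here
#     for msa in msa_iter():
#         print msa['code'], msa['name']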
def __init__(self):
    super(CensusTestFilter, self).__init__()
    self._msas = []
    csv_path = settings.dataset_path('default', filename='census.csv')
    for record in csv.DictReader(open(csv_path)):
        self._msas.append(record['msa'])
    self._msas = set(self._msas)
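# Hedged sketch of the companion method: assuming the saucebrush Filter
# convention, where process_record() returns the record to keep it or
# None to drop it, and an 'msa' field on incoming records (both
# assumptions -- the real implementation is not shown here):
#
#     def process_record(self, record):
#         if record['msa'] in self._msas:
#             return record
#         return None  # drop records outside the test MSA set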
def load_zipcodes():
    headers = (
        'country_code', 'postal_code', 'name',
        'state_name', 'state_code',
        'county_name', 'county_code',
        'community_name', 'community_code',
        'latitude', 'longitude', 'accuracy',
    )

    state_histogram = Histogram('state_code')
    state_histogram.label_length = 2

    csv_path = settings.dataset_path('default', filename='zipcodes.txt')

    run_recipe(
        CSVSource(open(csv_path), delimiter="\t", fieldnames=headers),
        FieldKeeper(('postal_code', 'name', 'state_code', 'latitude', 'longitude')),
        FieldModifier(('latitude', 'longitude'), float),
        FieldMerger({'latlng': ('latitude', 'longitude')}, lambda lat, lng: (lat, lng)),
        #MongoZipEmitter(),
        #DebugEmitter(),
        state_histogram,
    )

    return str(state_histogram)
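# Worked example (illustrative record): FieldMerger consumes the named
# source fields and stores the merge function's result, so
#     {'postal_code': '99501', 'latitude': 61.21, 'longitude': -149.9, ...}
# is emitted as
#     {'postal_code': '99501', 'latlng': (61.21, -149.9), ...}
# and the returned str(state_histogram) is a printable per-state count.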
def load_locations():
    conn = MySQLdb.connect(
        user=settings.MYSQL_USER,
        passwd=settings.MYSQL_PASS,
        db=settings.MYSQL_DATABASE,
        host=settings.MYSQL_HOST,
        port=settings.MYSQL_PORT,
    )

    # clear existing occupation data before reloading
    cursor = conn.cursor()
    cursor.execute("""DELETE FROM occupation_category""")
    cursor.execute("""DELETE FROM occupation""")
    cursor.close()

    path = settings.dataset_path(None, filename='occupations.csv')

    run_recipe(
        sources.CSVSource(open(path)),
        ValidOccupationFilter(),
        CategoryIDFilter(),
        CategoryEmitter(conn),
        OccupationEmitter(conn),
        #emitters.DebugEmitter(),
        error_stream=emitters.DebugEmitter(),
    )

    conn.commit()
    conn.close()
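# Hedged sketch: ValidOccupationFilter is defined elsewhere; a minimal
# stand-in, assuming the saucebrush Filter API and a 'title' column in
# occupations.csv (both assumptions, not confirmed by this file):
#
#     class ValidOccupationFilter(filters.Filter):
#         def process_record(self, record):
#             if record.get('title'):
#                 return record  # keep rows with a usable title
#             return None        # drop blank rows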
def load_naccrra():
    csv_path = os.path.join(settings.dataset_path('default'), 'childcarecosts.csv')

    run_recipe(
        sources.CSVSource(open(csv_path)),
        filters.FieldRenamer({
            'state': 'State',
            'family_infant': 'Family-Infant',
            'family_4': 'Family-4-Year-Old',
            'family_school': 'Family-School-Age',
            'center_infant': 'Center-Infant',
            'center_4': 'Center-4-Year-Old',
            'center_school': 'Center-School-Age',
        }),
        MongoNACCRRAEmitter(),
        emitters.CountEmitter(),
        #emitters.DebugEmitter(),
        error_stream=emitters.DebugEmitter(),
    )
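# Worked example (illustrative value; per this script's mapping, which
# reads as output field on the left, source CSV header on the right):
# a source row containing 'Center-Infant': '11666' comes out of the
# renamer carrying 'center_infant': '11666', ready for the Mongo emitter.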
class LocationUpdateEmitter(Emitter):  # class header and base reconstructed; base assumed to be the recipe Emitter

    def __init__(self, db_path):
        super(LocationUpdateEmitter, self).__init__()
        self._conn = sqlite3.connect(db_path)

    def emit_record(self, record):
        stmt = """UPDATE locations
                  SET latitude = ?, longitude = ?
                  WHERE city = ? AND state = ? AND zipcode = ?"""
        params = (
            record['latitude'],
            record['longitude'],
            record['city'],
            record['state'],
            record['zipcode'],
        )
        self._conn.execute(stmt, params)
        self._conn.commit()

    def done(self):
        self._conn.close()


db_path = settings.dataset_path('default', filename='load_nces.db')
csv_path = settings.dataset_path('nces', filename='school_locations.csv')

run_recipe(
    SqliteSource(db_path, """SELECT * FROM locations
                             WHERE latitude = '' and longitude = ''"""),
    FieldMerger({'address': ('city', 'state', 'zipcode')},
                lambda c, s, z: "%s, %s %s" % (c, s, z),
                keep_fields=True),
    GeocoderFilter('Kv3.btLV34EuebZGMzi1KaqI_BOPhPjx7FtbvED.umr8DGUq0NysoGN0XIIIDRU-',
                   'address'),
    FieldRemover('address'),
    CSVEmitter(open(csv_path, 'w'),
               fieldnames=('city', 'state', 'zipcode', 'latitude', 'longitude')),
    DebugEmitter(),
)
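# Worked example (illustrative row): a source record
#     {'city': 'Austin', 'state': 'TX', 'zipcode': '78701', ...}
# gains address "Austin, TX 78701" (keep_fields=True preserves the three
# parts), GeocoderFilter fills latitude/longitude from that string, and
# FieldRemover drops the temporary address before the CSV row is written.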
def load_rpp():
    local = {}
    state = {}

    # per-MSA regional price parities, keyed by CBSA code
    with open(settings.dataset_path('rpp', 'A5.coded.csv')) as infile:
        reader = csv.DictReader(infile)
        for record in reader:
            if record['code']:
                local[record['code']] = {
                    'overall': record['overall'],
                    'goods': record['goods'],
                    'services': record['services'],
                }

    # statewide figures, keyed by state abbreviation
    with open(settings.dataset_path('rpp', 'A7.csv')) as infile:
        reader = csv.DictReader(infile)
        for record in reader:
            if record['state_abbreviation']:
                state[record['state_abbreviation']] = record.copy()
                del state[record['state_abbreviation']]['state']
                del state[record['state_abbreviation']]['state_abbreviation']

    locations = pymongo.Connection()['k2']['locations']

    db = MySQLdb.connect(
        user=settings.MYSQL_USER,
        passwd=settings.MYSQL_PASS,
        db=settings.MYSQL_DATABASE,
        host=settings.MYSQL_HOST,
    )
    cursor = db.cursor()

    for loc in locations.find({'good_to_go': True}):
        record = {}
        if loc['code'] in local:
            record.update(local[loc['code']])
        if loc['primary_state'] in state:
            record.update(state[loc['primary_state']])
        for key in record.iterkeys():
            record[key] = float(record[key])

        fields = ['code']
        values = [loc['code']]
        for key, value in record.iteritems():
            fields.append(key)
            values.append(value)

        stmt = "INSERT INTO rpp (%s) VALUES (%s)" % (
            ",".join(fields), ",".join('%s' for v in values))

        print loc['code']
        cursor.execute(stmt, values)

    cursor.close()
    db.commit()
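# Worked example (illustrative values; field order follows dict
# iteration): for loc['code'] = '10180' with a merged record of
# {'overall': 98.1, 'goods': 97.0}, the generated statement is
#     INSERT INTO rpp (code,overall,goods) VALUES (%s,%s,%s)
# and MySQLdb binds ['10180', 98.1, 97.0] at execute() time.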
paths = (
    "cu.data.3.AsizeNorthEast",
    "cu.data.4.AsizeNorthCentral",
    "cu.data.5.AsizeSouth",
    "cu.data.6.AsizeWest",
    "cu.data.7.OtherNorthEast",
    "cu.data.8.OtherNorthCentral",
    "cu.data.9.OtherSouth",
    "cu.data.10.OtherWest",
)

items:

for filename in paths:
    path = settings.dataset_path('cpi', filename)
    with open(path) as infile:
        for row in csv.DictReader(infile, delimiter='\t'):
            row = {k: v.strip() for (k, v) in row.iteritems()}
            sid = row['series_id']
            if not sid.startswith('CUU'):
                continue
            # unpack the codes embedded in the BLS series id
            row['periodicity'] = sid[3]
            row['area_code'] = sid[4:8]
            row['item_code'] = sid[8:]
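# Worked example: BLS series ids pack several codes into one string.
# For sid = 'CUUR0100SA0' the slices above yield periodicity 'R',
# area_code '0100', and item_code 'SA0'; ids not starting with 'CUU'
# belong to other surveys or adjustments and are skipped.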
import csv
import sys

from pymongo import Connection

import settings

PATH = settings.dataset_path('aff', 'DEC_10_DP_DPDP1_with_ann.csv')


def list_columns():
    with open(PATH) as infile:
        reader = csv.reader(infile)
        rows = [reader.next() for i in xrange(1, 10)]

    columns = zip(*rows)

    writer = csv.writer(sys.stdout)
    writer.writerow(('column', 'S', 'category', 'title', 'HD', 'type'))
    for i, col in enumerate(columns):
        writer.writerow((
            i + 1,
            col[0],
            col[1],
            " ".join(col[2:7]),
            col[7],
            col[8],
        ))


def load_data():