def _put_pending_rows(data_url):
    """PUT the rows queued in ``put_data`` to ``data_url`` and empty the queue.

    Returns:
        The number of rows that were sent.
    """
    sent = len(put_data['objects'])
    print('Updating %i rows' % sent)
    panda_put(data_url, json.dumps(put_data))
    put_data['objects'] = []
    return sent


def export_county(countyfile):
    """Initialize one county's PANDA dataset and upload its rows in batches.

    The first three characters of ``countyfile`` are looked up in the
    module-level ``to_harvest`` mapping to get the county name.  The dataset
    definition is sent with ``panda_put`` to its URL under
    ``PANDA_DATASET_BASE``; then every row of ``loadbase/countyfile`` is
    queued in the module-level ``put_data`` payload and sent to the
    dataset's ``data/`` URL 1000 rows at a time, with any remainder sent
    after the file has been read.

    Args:
        countyfile: File name inside ``loadbase``.  Its first three
            characters must be a key of ``to_harvest``.

    Returns:
        The number of rows sent to PANDA.  This is 0 when the county is not
        in ``to_harvest``; nothing is uploaded in that case.
    """
    putstart = datetime.datetime.now()
    putcount = 0
    slug = countyfile[:3]
    if slug not in to_harvest:
        print("county isn't in the to_harvest list")
        return putcount

    county = to_harvest[slug]
    name = "%s voter registration %s" % (county, year)
    dataset_slug = slugify(name)
    dataset_url = '%s/%s/' % (PANDA_DATASET_BASE, dataset_slug)
    data_url = '%sdata/' % dataset_url

    # Initialize the new dataset; the column list travels as a query parameter.
    dataset = {
        'name': name,
        'description': 'Data from %s %s; to search statewide, visit %s' % (
            data_month, year, newsweb),
        'categories': [
            u'/api/1.0/category/all-dob/',
            u'/api/1.0/category/voters/',
        ],
    }
    panda_put(dataset_url, json.dumps(dataset), params={
        'columns': ','.join(VOTER_COLUMNS),
    })

    # NOTE(review): put_data is shared module state that is not visible here;
    # this assumes its 'objects' list is empty when the function is entered.
    with open("%s/%s" % (loadbase, countyfile), 'r') as cf:
        for row in cdr(cf):
            put_data['objects'].append({
                'external_id': unicode(row['voter_ID']),
                'data': [row[key] for key in VOTER_COLUMNS],
            })
            pending = len(put_data['objects'])
            if pending % 500 == 0:
                print("500 processed")
            if pending == 1000:
                putcount += _put_pending_rows(data_url)
                # Report only when the running total has just changed, so
                # the message appears once per 10,000 uploaded rows.
                if putcount % 10000 == 0:
                    print("loaded so far: %s" % putcount)

    # Send whatever is left over after the last full batch.
    if put_data['objects']:
        putcount += _put_pending_rows(data_url)

    print("pushed %s rows to panda dataset %s; process took %s" % (
        putcount, name, datetime.datetime.now() - putstart))
    return putcount
def prep(filename):
    """Build the cleaned, sorted load file for one raw county file.

    Steps, all keyed on the first three characters of ``filename``:

    1. Copy ``rawheader`` to ``<temp>/<slug>_temp.csv`` and append the raw
       file ``rawbase/filename`` to it.
    2. Run ``csvcut -t -c <columns>`` over that file, writing
       ``<prepbase>/<slug>_prep.csv``.
    3. Read the prep file, strip whitespace from the text fields, translate
       the race and party codes through ``raced`` / ``partyd`` (codes with
       no entry become ``''``), and sort the rows.
    4. Write the header and sorted rows to ``<loadbase>/<slug>.csv``.

    Args:
        filename: Name of the raw file inside ``rawbase``.

    Returns:
        None.  A timing message is printed when the load file is written.
    """
    prepstart = datetime.datetime.now()
    slug = filename[:3]
    rawfile = "%s/%s" % (rawbase, filename)
    temp_path = "%s/%s_temp.csv" % (temp, slug)
    prepfile = "%s/%s_prep.csv" % (prepbase, slug)
    loadfile = "%s/%s.csv" % (loadbase, slug)

    # NOTE(review): paths are interpolated into shell command lines unquoted,
    # so names containing spaces or shell metacharacters would break these.
    call("cp %s %s" % (rawheader, temp_path), shell=True)
    call('cat %s >> %s' % (rawfile, temp_path), shell=True)
    call("csvcut -t -c %s %s > %s" % (columns, temp_path, prepfile), shell=True)

    rows = []
    with open(prepfile, 'r') as f:
        reader = cdr(f)
        # NOTE(review): the header written below is the prep file's own
        # column order, while each row is assembled in the fixed order that
        # follows; this assumes ``columns`` lists the fields in that same
        # order -- confirm.
        header = reader.fieldnames
        for row in reader:
            rows.append([
                row['lname'].strip(),
                row['fname'].strip(),
                row['mname'].strip(),
                row['suffix'].strip(),
                ' '.join(row['addr1'].split()),  # collapses interior whitespace runs
                row['addr2'].strip(),
                row['city'].strip(),
                row['zip'].strip(),
                row['gender'].strip(),
                raced.get(row['race'], ''),
                row['birthdate'].strip(),
                partyd.get(row['party'], ''),
                row['areacode'].strip(),
                row['phone'].strip(),
                row['email'].strip(),
                row['voter_ID'].strip(),
            ])

    # Rows compare element by element: last name first, then the following
    # fields as tie-breakers.
    rows.sort()

    with open(loadfile, 'w') as lf:
        writer = ckw(lf)
        writer.writerow(header)
        for entry in rows:
            writer.writerow(entry)

    print("%s ready for loading; prepping took %s" % (
        loadfile, datetime.datetime.now() - prepstart))
def load_tickets():
    """Upload every row of ``infile`` to the PANDA dataset at ``data_url``.

    Rows are read from the module-level ``infile`` path, reshaped into the
    dataset's column order, queued in the module-level ``put_data`` payload
    and sent with ``panda_put`` 1000 rows at a time; any remainder is sent
    after the file has been read.  ``runv.processed`` is incremented for
    every row read and ``runv.created`` for every row sent.

    If ``infile`` does not exist, a message is printed and nothing is
    uploaded.

    Returns:
        None.
    """
    if not os.path.isfile(infile):
        print("couldn't find %s" % infile)
        return

    # Address parts, in output order; empty parts are left out of the join.
    address_keys = ['Address Line 1', 'Address Line 2', 'City', 'State', 'Zip Code']

    with open(infile, 'r') as f:
        for row in cdr(f):
            runv.processed += 1
            PK = row['ID']
            ADDR = ", ".join([row[ky] for ky in address_keys if row[ky]])
            DL = "%s (%s)" % (row['Driver License Number'], row['Driver License State'])
            # TAG = "%s (%s)" % (row['Tag Number'], row['Tag State'])
            put_data['objects'].append({
                'external_id': unicode(PK),
                'data': [
                    row['Last Name'],
                    row['First Name'],
                    row['Middle Name'],
                    row['Suffix'],
                    row['Date Of Birth'],
                    row['Race'].replace('White', 'Wh').replace('Black', 'Bl'),
                    row['Gender'],
                    DL,
                    ADDR,
                    row['Offense Date'],
                    row['Statute Description'],
                    row['Law Enf Agency Name'],
                    row['Law Enf Officer Name'],
                ],
            })
            if len(put_data['objects']) == 1000:
                # Count and report the batch only after the upload call has
                # returned, matching how the final partial batch is counted.
                panda_put(data_url, json.dumps(put_data))
                runv.created += 1000
                print("shipped %s rows" % runv.created)
                put_data['objects'] = []

    # Send whatever is left over after the last full batch.
    if put_data['objects']:
        print('shipping final %i rows' % len(put_data['objects']))
        panda_put(data_url, json.dumps(put_data))
        runv.created += len(put_data['objects'])
        put_data['objects'] = []

    print("pushed %s rows to panda dataset %s; process took %s" % (
        runv.created, dataset_name, datetime.datetime.now() - runv.starter))