예제 #1
0
def export_county(countyfile):
    """Push one county's load file into its PANDA dataset.

    The county is identified by the first three characters of
    ``countyfile``, which must be a key of the module-level ``to_harvest``
    mapping. The dataset definition is sent to ``dataset_url`` via
    ``panda_put`` and the rows of ``<loadbase>/<countyfile>`` are then sent
    to the dataset's ``data/`` URL in batches of 1000.

    Relies on module-level names defined outside this function:
    ``to_harvest``, ``year``, ``data_month``, ``newsweb``,
    ``PANDA_DATASET_BASE``, ``VOTER_COLUMNS``, ``loadbase``, ``put_data``,
    ``panda_put``, ``slugify`` and ``cdr``.
    NOTE(review): ``put_data`` is shared state; this function assumes its
    ``'objects'`` list is empty on entry -- confirm against callers.

    :param countyfile: file name (inside ``loadbase``) whose first three
        characters are the county slug.
    :returns: number of rows pushed; 0 when the slug is not in
        ``to_harvest``.
    """
    putstart = datetime.datetime.now()
    putcount = 0
    slug = countyfile[:3]
    if slug not in to_harvest:
        print("county isn't in the to_harvest list")
        return putcount

    county = to_harvest[slug]
    name = "%s voter registration %s" % (county, year)
    dataset_slug = slugify(name)
    dataset_url = '%s/%s/' % (PANDA_DATASET_BASE, dataset_slug)
    data_url = '%sdata/' % dataset_url

    # Send the dataset definition before any rows are uploaded.
    dataset = {
        'name': name,
        'description': 'Data from %s %s; to search statewide, visit %s' % (data_month, year, newsweb),
        'categories': [u'/api/1.0/category/all-dob/', u'/api/1.0/category/voters/']
    }
    panda_put(dataset_url, json.dumps(dataset), params={
        'columns': ','.join(VOTER_COLUMNS),
    })

    with open("%s/%s" % (loadbase, countyfile), 'r') as cf:
        for row in cdr(cf):
            put_data['objects'].append({
                'external_id': unicode(row['voter_ID']),
                'data': [row[key] for key in VOTER_COLUMNS]
            })
            pending = len(put_data['objects'])
            if pending % 500 == 0:
                print("500 processed")
            if pending == 1000:
                putcount += 1000
                print('Updating %i rows' % pending)
                panda_put(data_url, json.dumps(put_data))
                put_data['objects'] = []
                # Report progress only when a flush lands on a 10000-row
                # milestone. Checking this per row would repeat the message
                # for every row read while putcount sits on a multiple of
                # 10000 (including the initial 0).
                if putcount % 10000 == 0:
                    print("loaded so far: %s" % putcount)

    # Ship whatever is left over after the last full batch.
    if put_data['objects']:
        print('Updating %i rows' % len(put_data['objects']))
        panda_put(data_url, json.dumps(put_data))
        putcount += len(put_data['objects'])
        put_data['objects'] = []

    print("pushed %s rows to panda dataset %s; process took %s" % (
        putcount,
        name,
        (datetime.datetime.now() - putstart)
    ))
    return putcount
예제 #2
0
def prep(filename):
    """Build the sorted, cleaned load file for one raw county file.

    Steps, in order:
      1. copy ``rawheader`` to a temp CSV and append the raw file to it;
      2. run ``csvcut -t -c <columns>`` on the temp file to produce the
         prep file;
      3. read the prep file, trim whitespace, translate the ``race`` and
         ``party`` values through ``raced`` / ``partyd`` (blank when the
         value has no entry), and sort the rows;
      4. write the prep file's header followed by the sorted rows to
         ``<loadbase>/<slug>.csv``.

    The slug is the first three characters of ``filename``. Relies on
    module-level names defined outside this function: ``rawbase``,
    ``temp``, ``prepbase``, ``loadbase``, ``rawheader``, ``columns``,
    ``raced``, ``partyd``, ``call``, ``cdr`` and ``ckw``.

    :param filename: name of the raw file inside ``rawbase``.
    """
    started = datetime.datetime.now()
    slug = filename[:3]
    rawfile = "%s/%s" % (rawbase, filename)
    temp_path = "%s/%s_temp.csv" % (temp, slug)
    prepfile = "%s/%s_prep.csv" % (prepbase, slug)
    loadfile = "%s/%s.csv" % (loadbase, slug)

    # Header + raw data -> temp file, then cut the wanted columns out of it.
    shell_steps = (
        "cp %s %s" % (rawheader, temp_path),
        'cat %s >> %s' % (rawfile, temp_path),
        "csvcut -t -c %s %s > %s" % (columns, temp_path, prepfile),
    )
    for command in shell_steps:
        call(command, shell=True)

    with open(prepfile, 'r') as source:
        reader = cdr(source)
        header = reader.fieldnames
        cleaned = []
        for row in reader:
            # Codes without an entry in the lookup tables become blank.
            race_label = raced.get(row['race'], '')
            party_label = partyd.get(row['party'], '')
            record = [
                row['lname'].strip(),
                row['fname'].strip(),
                row['mname'].strip(),
                row['suffix'].strip(),
                # split/join also collapses runs of interior whitespace
                ' '.join(row['addr1'].split()),
                row['addr2'].strip(),
                row['city'].strip(),
                row['zip'].strip(),
                row['gender'].strip(),
                race_label,
                row['birthdate'].strip(),
                party_label,
                row['areacode'].strip(),
                row['phone'].strip(),
                row['email'].strip(),
                row['voter_ID'].strip(),
            ]
            cleaned.append(record)

    # Plain list comparison: ordered by last name first, with later fields
    # breaking ties.
    cleaned.sort()

    with open(loadfile, 'w') as target:
        writer = ckw(target)
        writer.writerow(header)
        for record in cleaned:
            writer.writerow(record)

    elapsed = datetime.datetime.now() - started
    print("%s ready for loading; prepping took %s" % (loadfile, elapsed))
예제 #3
0
def load_tickets():
    """Upload the rows of ``infile`` to the PANDA data URL in batches.

    When ``infile`` does not exist a message is printed and nothing else
    happens. Otherwise every row is converted to an object keyed by its
    ``ID`` column and appended to ``put_data['objects']``; each time 1000
    objects have accumulated they are sent with ``panda_put`` and the list
    is reset. Any remainder is sent after the file has been read, and a
    summary line is printed.

    ``runv.processed`` is incremented per row read and ``runv.created`` per
    row shipped. Relies on module-level names defined outside this
    function: ``infile``, ``runv``, ``put_data``, ``data_url``,
    ``dataset_name``, ``panda_put`` and ``cdr``.
    """
    if not os.path.isfile(infile):
        print("couldn't find %s" % infile)
        return

    address_columns = ['Address Line 1', 'Address Line 2', 'City', 'State', 'Zip Code']

    with open(infile, 'r') as handle:
        for row in cdr(handle):
            runv.processed += 1
            ticket_id = row['ID']
            # Join only the address parts that are non-empty.
            address = ", ".join([row[col] for col in address_columns if row[col]])
            licence = "%s (%s)" % (row['Driver License Number'], row['Driver License State'])
            # The 'Tag Number' / 'Tag State' columns are not included in the payload.
            fields = [
                row['Last Name'],
                row['First Name'],
                row['Middle Name'],
                row['Suffix'],
                row['Date Of Birth'],
                row['Race'].replace('White', 'Wh').replace('Black', 'Bl'),
                row['Gender'],
                licence,
                address,
                row['Offense Date'],
                row['Statute Description'],
                row['Law Enf Agency Name'],
                row['Law Enf Officer Name'],
            ]
            put_data['objects'].append({
                'external_id': unicode(ticket_id),
                'data': fields,
            })
            if len(put_data['objects']) == 1000:
                runv.created += 1000
                print("shipped %s rows" % runv.created)
                panda_put(data_url, json.dumps(put_data))
                put_data['objects'] = []

    # Send the final partial batch, if any.
    leftover = put_data['objects']
    if leftover:
        print('shipping final %i rows' % len(leftover))
        panda_put(data_url, json.dumps(put_data))
        runv.created += len(leftover)
        put_data['objects'] = []

    elapsed = datetime.datetime.now() - runv.starter
    print("pushed %s rows to panda dataset %s; process took %s" % (
        runv.created,
        dataset_name,
        elapsed
    ))