def go(path, max_attempts=5):
    if not os.path.isfile(path):
        exit('Make sure you ran `make nomenklatura` in the data dir.')
    with open(path, 'rb') as f:
        reader = DictReader(f)
        update_attempts = 0
        restarts = 0
        for row in reader:
            is_updated = process_row(row)
            if is_updated:
                if update_attempts:
                    update_attempts = 0  # reset counter
                    restarts += 1  # log how many times we've reset the counter
            else:
                update_attempts += 1
                if update_attempts >= max_attempts:
                    print('skipping the rest after {} failed attempts and {} resets'
                          .format(update_attempts, restarts))
                    # we're just running through old entries at this point
                    # since the datafile is in reverse chronological order
                    break
    # these `Interest`s need updated stats
    for interest in Interest.objects.filter(
            canonical__isnull=True, stats__isnull=True):
        print 'update', interest
        interest.make_stats()

def scrape(path):
    logger.info("Processing %s" % path)
    with open(path, 'rb') as f:
        # count the rows so tqdm can show progress; enumerate starts at zero,
        # which effectively subtracts one for the header row
        for total_rows, row in enumerate(f):
            pass
        f.seek(0)
        reader = DictReader(f)
        prev_pass = None
        first = True
        new_compensations = []
        for row in tqdm(reader, total=total_rows, leave=True,
                        mininterval=1.0, miniters=100):
            if first:
                # wipe all `Compensation` objects for the year to avoid double
                # counting corrected compensations
                year = row['YEAR_APPL']
                Compensation.objects.filter(annum__year=year).delete()
                first = False
            prev_pass = process_row(row, prev_pass=prev_pass)
            if prev_pass.compensation:
                new_compensations.append(prev_pass.compensation)
    logger.debug('{} new compensations'.format(len(new_compensations)))
    Compensation.objects.bulk_create(new_compensations)

def scrape(path, logger=logger):
    logger.info("Processing %s" % path)
    with open(path, 'rb') as f:
        reader = DictReader(f)
        last_pass = None
        for row in reader:
            last_pass = process_row(row, last_pass=last_pass)

def generate_test_row(path, **kwargs):
    import random
    from pprint import pprint

    logger.info("Processing %s" % path)
    with open(path, 'rb') as f:
        reader = DictReader(f, encoding='latin_1')
        for i, row in enumerate(reader):
            # 0.1% chance per row; adjust this to sample deeper into the file
            if random.randint(0, 999) < 1:
                pprint(row)
                break

def generate_test_row(path, **kwargs):
    """Drop-in replacement for `scrape` that prints a sample row for testing."""
    import random
    from pprint import pprint

    with open(path, 'rb') as f:
        reader = DictReader(f)
        for row in reader:
            # 1% chance per row; adjust this to sample deeper into the file
            if random.randint(0, 99) < 1:
                pprint(row)
                break

def go(path):
    if not os.path.isfile(path):
        exit('Make sure you ran `make nomenklatura` in the data dir.')
    with open(path, 'rb') as f:
        reader = DictReader(f)
        for row in reader:
            process_row(row)
    # these `Interest`s need updated stats
    for interest in Interest.objects.filter(canonical__isnull=True,
            stats__isnull=True):
        print 'update', interest
        interest.make_stats()

def handle(self, csv_path, *args, **options):
    from tx_lobbying.models import Address

    with open(csv_path, 'rb') as f:
        reader = DictReader(f)
        for row in reader:
            # look up by the address fields; only the coordinate fields are
            # updated when a matching row already exists
            address, created = Address.objects.update_or_create(
                address1=row['address1'],
                address2=row['address2'],
                city=row['city'],
                state=row['state'],
                zipcode=row['zipcode'],
                defaults=dict(
                    coordinate=row['coordinate'],
                    coordinate_quality=row['coordinate_quality'],
                ))
            print address, created

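# Usage sketch for the management command above. The command name below is
# hypothetical (it comes from the module's filename, which isn't shown here);
# the CSV just needs the columns referenced in `handle`:
#
#   python manage.py load_addresses data/geocoded_addresses.csv
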
def process_csv(path, _inner_func, **kwargs):
    logger.info("Processing %s" % path)
    total = get_record_count(path)
    with open(path, 'rb') as f:
        # the `encoding` kwarg means this is not the stdlib csv.DictReader,
        # presumably unicodecsv's
        reader = DictReader(f, encoding='latin_1')
        # keep the output from the last pass around; the coversheet and
        # lobbyist don't really change from row to row, so reusing them
        # saves some queries
        last_pass = None
        for i, row in enumerate(reader):
            if not i % 1000:
                logger.info(u'{}/{} filed date: {} report date: {}'.format(
                    i, total, row.get('FILED_DATE'), row.get('RPT_DATE')))
            if YEAR_START and int(row['YEAR_APPL']) < YEAR_START:
                continue
            try:
                last_pass = _inner_func(row, last_pass=last_pass, **kwargs)
            except ValueError as e:
                logger.warn('Row missing data: %s, %s' % (row, e))
                continue

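# `get_record_count` is called by `process_csv` above but isn't shown in this
# section. A minimal sketch, assuming it only needs to count data rows for the
# progress log (implementation inferred, not confirmed by the source):
def get_record_count(path):
    with open(path, 'rb') as f:
        # subtract one for the header row
        return sum(1 for line in f) - 1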