def get_filing_list(start_date, end_date, max_fails=10, waittime=10):
    """Return a list of file_numbers for FEC efilings received between the dates.

    Pages through the FEC efile API (100 results per page) and collects the
    file_number of each filing that passes evaluate_filing().

    Args:
        start_date: minimum receipt date (passed straight to the API).
        end_date: maximum receipt date.
        max_fails: give up after this many bad responses (returns None).
        waittime: seconds to sleep between retries of a failed request.

    Returns:
        list of file numbers, or None if the API failed max_fails times.
    """
    #TODO: institute an API key pool or fallback?
    url = "https://api.open.fec.gov/v1/efile/filings/?per_page=100&sort=-receipt_date"
    url += "&api_key={}".format(API_KEY)
    url += "&min_receipt_date={}".format(start_date)
    url += "&max_receipt_date={}".format(end_date)

    filings = []
    page = 1
    fails = 0
    while True:
        #get new filing ids from FEC API
        resp = requests.get(url + "&page={}".format(page))
        try:
            files = resp.json()
            results = files['results']
        except (ValueError, KeyError):
            #either the response was not valid JSON or it lacked 'results';
            #retry the SAME page (original code incremented page before this
            #check, skipping a page on retry, and fell through without a
            #continue, crashing on an undefined 'files')
            fails += 1
            if fails >= max_fails:
                logging.log(
                    title="FEC download failed",
                    text='Failed to download valid JSON from FEC site {} times'
                    .format(max_fails),
                    tags=["nyt-fec", "result:fail"])
                return None
            time.sleep(waittime)
            continue

        #only advance to the next page after a successful fetch
        page += 1

        if len(results) == 0:
            break

        for f in results:
            if evaluate_filing(f):
                filings.append(f['file_number'])

    return filings
def reassign_standardized_donors(filing_id, amended_id):
    """Carry donor assignments forward from an amended filing to its replacement.

    Finds every Schedule A transaction on the superseded filing (amended_id)
    that has a donor attached, locates the matching transaction (by
    transaction_id) on the new filing (filing_id), and moves the donor over.
    Mismatches are logged as warnings and skipped.
    """
    #skeda's on the soon-to-be-deactivated filing that carry a donor
    donor_transactions = ScheduleA.objects.filter(filing_id=amended_id).exclude(donor=None)
    reassigned = 0
    for old_trans in donor_transactions:
        tid = old_trans.transaction_id
        last_name = old_trans.contributor_last_name

        candidates = ScheduleA.objects.filter(transaction_id=tid, filing_id=filing_id)
        match_count = len(candidates)

        #skip (with a warning) unless exactly one match with the same last name
        if match_count == 0:
            logging.log(title="donor reassignment issue", text="filing {} was amended by filing {} and no transaction could be found for donor reassigment for transaction id {}".format(amended_id, filing_id, tid), tags=["nyt-fec", "result:warning"])
            continue
        if match_count > 1:
            logging.log(title="donor reassignment issue", text="filing {} was amended by filing {} and multiple transaction matches were found for {}".format(amended_id, filing_id, tid), tags=["nyt-fec", "result:warning"])
            continue

        replacement = candidates[0]
        if replacement.contributor_last_name != last_name:
            logging.log(title="donor reassignment issue", text="Want to reassign transaction {} from filing {} to filing {} but last names mismatch: {}/{}".format(tid, amended_id, filing_id, last_name, replacement.contributor_last_name), tags=["nyt-fec", "result:warning"])
            continue

        #move the donor onto the new transaction and clear it from the old one
        replacement.donor = old_trans.donor
        replacement.save()
        old_trans.donor = None
        old_trans.save()
        reassigned += 1

    print("reassigned {} transactions from amended filing".format(reassigned))
def load_filings(filing_dir):
    """Load every filing CSV found in filing_dir via load_filing().

    Skips hidden files and files whose name does not start with an integer
    filing id. Logs a summary when the run completes.

    Args:
        filing_dir: directory path; expected to end with a path separator,
            since the filename is appended by simple string concatenation.
    """
    filing_fieldnames = [f.name for f in Filing._meta.get_fields()]

    filing_csvs = sorted(os.listdir(filing_dir))
    filings_loaded = 0
    for filename in filing_csvs:
        #skip dotfiles before trying to parse a filing id out of the name
        if filename[0] == ".":
            continue

        filing_id = filename.split(".")[0]
        try:
            int(filing_id)
        except ValueError:
            logging.log(title="Bad FEC filename",
                text='did not recognize filing {}'.format(filename),
                tags=["nyt-fec", "result:warn"])
            continue

        full_filename = "{}{}".format(filing_dir, filename)
        if not evaluate_filing_file(full_filename, filing_id):
            continue

        sys.stdout.write("-------------------\n{}: Started filing {}\n".format(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'), filing_id))
        if load_filing(filing_id, full_filename, filing_fieldnames):
            logging.log(title="Filing {} loaded".format(filing_id),
                text='filing {} successfully loaded'.format(filing_id),
                tags=["nyt-fec", "result:success"])
            filings_loaded += 1

    #note: the original formatted this title with the loop variable filing_id,
    #which raised NameError when the directory was empty (and was a no-op
    #otherwise, since the title has no placeholder)
    logging.log(title="FEC scrape completed",
        text='{} filings successfully loaded'.format(filings_loaded),
        tags=["nyt-fec", "result:success"])
def load_filing(filing, filename, filing_fieldnames):
    """Parse one electronic filing file and load it into the database.

    Handles amendment supersession, deactivation of filings/transactions
    covered by periodic reports, committee create/update, and itemization
    loading (with rollback of itemizations on failure).

    Args:
        filing: the filing id being loaded.
        filename: path to the downloaded filing file.
        filing_fieldnames: field names of the Filing model (precomputed by
            the caller so this isn't re-derived per filing).

    Returns:
        True if the filing was loaded successfully, False otherwise.
    """
    #returns boolean depending on whether filing was loaded
    #TODO add checking to see if import was successful
    filing_matches = Filing.objects.filter(filing_id=filing)
    if len(filing_matches) == 1:
        if filing_matches[0].status != "FAILED":
            #this means the filing already exists
            sys.stdout.write('filing {} already exists\n'.format(filing))
            return False
        else:
            sys.stdout.write("Reloading {}, it failed perviously\n".format(filing))

    #filing does not exist or it failed previously
    try:
        filing_dict = process_filing.process_electronic_filing(filename)
    except Exception as e:
        logging.log(title="fec2json failed",
            text="fec2json failed {} {}".format(filing, e),
            tags=["nyt-fec", "result:fail"])
        return False

    #do not load filings outside of this cycle (these will likely be amendments of old filings)
    #we check this before we download the filing, but this seems like worth re-checking in case someone manually downloaded a file or somehting
    coverage_end = filing_dict.get('coverage_through_date')
    if not check_coverage_dates(filing_dict, coverage_end):
        create_or_update_filing_status(filing, 'REFUSED')
        return False

    #deal with amended filings
    is_amended = False
    amends_filing = None
    if filing_dict['amendment']:
        is_amended = True

        #oy, one filer really likes semi-colons.
        if filing_dict.get('amends_filing'):
            filing_dict['amends_filing'] = filing_dict['amends_filing'].replace(';','')

        try:
            amends_filing_str = filing_dict['amends_filing']
            amends_filing = int(amends_filing_str)
        except ValueError:
            #should be a warning or possibly critical
            logging.log(title="Filing {} Failed".format(filing),
                text='Invalid amendment number {} for filing {}, creating filing and marking as FAILED\n'.format(filing_dict['amends_filing'],filing),
                tags=["nyt-fec", "result:fail"])
            filing_obj = Filing.objects.create(filing_id=filing, status='FAILED')
            filing_obj.save()
            return False
        else:
            try:
                amended_filing = Filing.objects.filter(filing_id=amends_filing)[0]
            except IndexError:
                #if it's an F24 or F5, which don't always have coverage dates,
                #it is probably an amendment of an out-of-cycle filing
                #so do not load it
                if filing_dict['form'] in ['F24', 'F5']:
                    sys.stdout.write('Filing {} is an amended {} with no base. Probably from an old cycle. Not loading\n'.format(filing, filing_dict['form']))
                    create_or_update_filing_status(filing, 'REFUSED')
                    return False
                sys.stdout.write("could not find filing {}, which was amended by {}, so not deactivating any transactions\n".format(amends_filing, filing))
            else:
                #if there are filings that were amended by the amended filing
                #they also have to be deactivated, so look for them.
                other_amended_filings = Filing.objects.filter(amends_filing=amended_filing.filing_id)
                amended_filings = [f for f in other_amended_filings] + [amended_filing]
                for amended_filing in amended_filings:
                    amended_filing.active = False
                    amended_filing.status = 'SUPERSEDED'
                    amended_filing.save()
                #NOTE(review): only transactions on amends_filing itself are
                #deactivated here, not those on the other_amended_filings —
                #confirm whether that is intentional
                ScheduleA.objects.filter(filing_id=amends_filing).update(active=False, status='SUPERSEDED')
                ScheduleB.objects.filter(filing_id=amends_filing).update(active=False, status='SUPERSEDED')
                ScheduleE.objects.filter(filing_id=amends_filing).update(active=False, status='SUPERSEDED')

    if filing_dict['form'] in ['F3','F3X','F3P','F5']:
        #could be a periodic, so see if there are covered forms that need to be deactivated
        coverage_start_date = filing_dict['coverage_from_date']
        coverage_end_date = filing_dict['coverage_through_date']
        if coverage_start_date and coverage_end_date:
            #we're going to start by looking for whole forms to deactivate
            covered_filings = Filing.objects.filter(date_signed__gte=coverage_start_date,
                date_signed__lte=coverage_end_date,
                form__in=['F24','F5'],
                filer_id=filing_dict['filer_committee_id_number']).exclude(filing_id=filing)
            #this exclude prevents the current filing from being deactivated if it's already been saved somehow
            covered_filings.update(active=False, status='COVERED')
            covered_transactions = ScheduleE.objects.filter(filing_id__in=[f.filing_id for f in covered_filings])
            covered_transactions.update(active=False, status='COVERED')

            #there might be some additional transactions close to the edge of the filing period
            #that we should deactivate based on inconsistent dates inside filings
            individual_covered_transactions = ScheduleE.objects.filter(filer_committee_id_number=filing_dict['filer_committee_id_number'],
                active=True).exclude(filing_id=filing)
            by_expend_date = individual_covered_transactions.filter(expenditure_date__gte=coverage_start_date,
                expenditure_date__lte=coverage_end_date)
            by_expend_date.update(active=False, status='COVERED')
            by_dissemination_date = individual_covered_transactions.filter(dissemination_date__gte=coverage_start_date,
                dissemination_date__lte=coverage_end_date)
            by_dissemination_date.update(active=False, status='COVERED')

    #create or update the filing row itself
    clean_filing_dict = clean_filing_fields(filing_dict, filing_fieldnames)
    clean_filing_dict['filing_id'] = filing
    clean_filing_dict['filer_id'] = filing_dict['filer_committee_id_number']

    if len(filing_matches) == 1:
        filing_matches.update(**clean_filing_dict)
        filing_obj = filing_matches[0]
    else:
        filing_obj = Filing.objects.create(**clean_filing_dict)
        filing_obj.save()

    #create or update committee
    if filing_dict.get('committee_name') is None:
        filing_obj.committee_name = get_filer_name(filing_dict['filer_committee_id_number'])
        filing_obj.save()

    try:
        comm = Committee.objects.create(fec_id=filing_dict['filer_committee_id_number'])
        comm.save()
    except:
        #committee already exists
        pass

    try:
        committee_fieldnames = [f.name for f in Committee._meta.get_fields()]
        committee = {}
        committee['zipcode'] = filing_dict['zip']
        #copy over any committee model fields present in the parsed filing
        for fn in committee_fieldnames:
            try:
                field = filing_dict[fn]
            except:
                continue
            committee[fn] = field
        comm = Committee.objects.filter(fec_id=filing_dict['filer_committee_id_number']).update(**committee)
    except:
        #best-effort update; the filing still loads if this fails
        sys.stdout.write('failed to update committee\n')

    #add itemizations - eventually we're going to need to bulk insert here
    #skedA's
    try:
        scha_count = 0
        schb_count = 0
        sche_count = 0
        if 'itemizations' in filing_dict:
            if 'SchA' in filing_dict['itemizations']:
                scha_count = load_itemizations(ScheduleA, filing_dict['itemizations']['SchA'])
            if 'SchB' in filing_dict['itemizations']:
                schb_count = load_itemizations(ScheduleB, filing_dict['itemizations']['SchB'])
            if 'SchE' in filing_dict['itemizations']:
                sche_count = load_itemizations(ScheduleE, filing_dict['itemizations']['SchE'])
            if 'F57' in filing_dict['itemizations']:
                #F57 lines are loaded into the ScheduleE table alongside SchE
                sche_count += load_itemizations(ScheduleE, filing_dict['itemizations']['F57'])
        sys.stdout.write("inserted {} schedule A's\n".format(scha_count))
        sys.stdout.write("inserted {} schedule B's\n".format(schb_count))
        sys.stdout.write("inserted {} schedule E's\n".format(sche_count))
    except:
        #something failed in the transaction loading, keep the filing as failed
        #but remove the itemizations
        filing_obj.status='FAILED'
        filing_obj.save()
        create_or_update_filing_status(filing, 'FAILED')
        ScheduleA.objects.filter(filing_id=filing).delete()
        ScheduleB.objects.filter(filing_id=filing).delete()
        ScheduleE.objects.filter(filing_id=filing).delete()
        logging.log(title="Itemization load failed",
            text='Something failed in itemizations, marking {} as FAILED'.format(filing),
            tags=["nyt-fec", "result:fail"])
        return False

    #carry donor standardization forward from the superseded filing
    if is_amended and amends_filing:
        reassign_standardized_donors(filing, amends_filing)

    #add IE total to f24s
    if filing_obj.form == 'F24':
        ies = ScheduleE.objects.filter(filing_id=filing, active=True)
        filing_obj.computed_ie_total_for_f24 = sum([ie.expenditure_amount for ie in ies])

    sys.stdout.write('Marking {} as ACTIVE\n'.format(filing))
    filing_obj.status='ACTIVE'
    filing_obj.save()
    create_or_update_filing_status(filing, 'SUCCESS')
    return True