def handle(self, *args, **options): filing_headers = Filing_Header.objects.filter(form='F3') for fh in filing_headers: print "Processing filing %s" % (fh.filing_number) f1 = filing(fh.filing_number) form = f1.get_form_type() version = f1.get_version() header = f1.get_first_row() header_line = fp.parse_form_line(header, version) fh.header_data = header_line fh.save() try: this_filing = new_filing.objects.get( filing_number=fh.filing_number) parsed_data = process_f3_header(header_line) #print "got data %s" % (parsed_data) this_filing.coh_end = parsed_data['coh_end'] if parsed_data[ 'coh_end'] else None this_filing.tot_raised = parsed_data[ 'tot_raised'] if parsed_data['tot_raised'] else None this_filing.tot_spent = parsed_data[ 'tot_spent'] if parsed_data['tot_spent'] else None this_filing.new_loans = parsed_data[ 'new_loans'] if parsed_data['new_loans'] else None this_filing.save() except new_filing.DoesNotExist: print "new_filing MISSING FOR %s" % (new_filing.filing_number) continue
def handle(self, *args, **options): filing_headers = Filing_Header.objects.filter(form='F3') for fh in filing_headers: print "Processing filing %s" % (fh.filing_number) f1 = filing(fh.filing_number) form = f1.get_form_type() version = f1.get_version() header = f1.get_first_row() header_line = fp.parse_form_line(header, version) fh.header_data=header_line fh.save() try: this_filing = new_filing.objects.get(filing_number = fh.filing_number) parsed_data = process_f3_header(header_line) #print "got data %s" % (parsed_data) this_filing.coh_end = parsed_data['coh_end'] if parsed_data['coh_end'] else None this_filing.tot_raised = parsed_data['tot_raised'] if parsed_data['tot_raised'] else None this_filing.tot_spent = parsed_data['tot_spent'] if parsed_data['tot_spent'] else None this_filing.new_loans = parsed_data['new_loans'] if parsed_data['new_loans'] else None this_filing.save() except new_filing.DoesNotExist: print "new_filing MISSING FOR %s" % (new_filing.filing_number) continue
def handle(self, *args, **options): # just get the ids--otherwise django will load every column into memory # filter(form='F13') all_headers = Filing_Header.objects.all().order_by('filing_number').values('pk')[:2000] for header_pk in all_headers: pk = header_pk['pk'] header = Filing_Header.objects.get(pk=pk) filingnum = header.filing_number f1 = filing(filingnum, read_from_cache=True, write_to_cache=True) f1.download() form = header.form version = header.version print "processing filingnum %s, form %s version %s" % (filingnum, form, version) line_dict = {} content_rows = f1.get_body_rows() total_lines = 0 for row in content_rows: # instead of parsing the line, just assume form type is the first arg. r_type = row[0].upper().strip() # sometimes there are blank lines within files--see 707076.fec if not r_type: continue total_lines += 1 # what type of line parser would be used here? lp = fp.get_line_parser(r_type) if lp: form = lp.form r_type = form else: print "Missing parser from %s" % (r_type) try: num = line_dict[r_type] line_dict[r_type] = num + 1 except KeyError: line_dict[r_type] = 1 print "Found total lines = %s with dict=%s" % (total_lines, line_dict) #header.lines_present = line_dict #header.save()
def handle(self, *args, **options): # just get the ids--otherwise django will load every column into memory # filter(form='F13') all_headers = Filing_Header.objects.all().order_by( 'filing_number').values('pk')[:2000] for header_pk in all_headers: pk = header_pk['pk'] header = Filing_Header.objects.get(pk=pk) filingnum = header.filing_number f1 = filing(filingnum, read_from_cache=True, write_to_cache=True) f1.download() form = header.form version = header.version print "processing filingnum %s, form %s version %s" % ( filingnum, form, version) line_dict = {} content_rows = f1.get_body_rows() total_lines = 0 for row in content_rows: # instead of parsing the line, just assume form type is the first arg. r_type = row[0].upper().strip() # sometimes there are blank lines within files--see 707076.fec if not r_type: continue total_lines += 1 # what type of line parser would be used here? lp = fp.get_line_parser(r_type) if lp: form = lp.form r_type = form else: print "Missing parser from %s" % (r_type) try: num = line_dict[r_type] line_dict[r_type] = num + 1 except KeyError: line_dict[r_type] = 1 print "Found total lines = %s with dict=%s" % (total_lines, line_dict)
def handle(self, *args, **options): all_filings = Filing_Header.objects.all() for this_filing in all_filings: filing_number = this_filing.filing_number print "Processing %s" % filing_number f1 = filing(filing_number) form = f1.get_form_type() version = f1.get_version() # only parse forms that we're set up to read if not fp.is_allowed_form(form): if verbose: print "Not a parseable form: %s - %s" % (form, filingnum) continue header = f1.get_first_row() header_line = fp.parse_form_line(header, version) this_filing.header_data=header_line this_filing.save()
def handle(self, *args, **options): all_filings = Filing_Header.objects.all() for this_filing in all_filings: filing_number = this_filing.filing_number print "Processing %s" % filing_number f1 = filing(filing_number) form = f1.get_form_type() version = f1.get_version() # only parse forms that we're set up to read if not fp.is_allowed_form(form): if verbose: print "Not a parseable form: %s - %s" % (form, filingnum) continue header = f1.get_first_row() header_line = fp.parse_form_line(header, version) this_filing.header_data = header_line this_filing.save()
def process_file(filingnum): #print "Processing filing %s" % (filingnum) f1 = filing(filingnum, read_from_cache=True, write_to_cache=True) f1.download() form = f1.get_form_type() version = f1.get_version() # only parse forms that we're set up to read if not fp.is_allowed_form(form): #if verbose: # print "Not a parseable form: %s - %s" % (form, filingnum) try: count = unprocessable_form_hash[form] unprocessable_form_hash[form] = count + 1 except KeyError: unprocessable_form_hash[form] = 1 return #if verbose: # print "Found parseable form: %s - %s" % (form, filingnum) header = f1.get_first_row() header_line = fp.parse_form_line(header, version) amended_filing = None if f1.is_amendment: amended_filing = f1.headers['filing_amended'] # enter it if we don't have it already: try: already_entered = Filing_Header.objects.get(filing_number=filingnum) print "Already entered! %s" % (filingnum) return 0 except Filing_Header.DoesNotExist: from_date = None through_date = None try: # dateparse('') will give today, oddly if header_line['coverage_from_date']: from_date = dateparse(header_line['coverage_from_date']) if header_line['coverage_through_date']: through_date = dateparse(header_line['coverage_through_date']) except KeyError: pass new_header_id = Filing_Header.objects.create( raw_filer_id=f1.headers['fec_id'], form=form, filing_number=filingnum, version=f1.version, coverage_from_date=from_date, coverage_through_date=through_date, is_amendment=f1.is_amendment, amends_filing=amended_filing, amendment_number=f1.headers['report_number'] or None, header_data=header_line) #print "Added header with id %s" % new_header_id """ body_rows = f1.get_body_rows() for row in body_rows: # the last line is empty, so don't try to parse it if len(row)>1: # Don't double check, just enter the data. parsed_line = fp.parse_form_line(row, version) parsed_line['filing_number'] = int(filingnum) #if verbose: # print parsed_line new_line_id = filing_lines.insert(parsed_line) """ return 1
def handle(self, *args, **options): # just get the ids--otherwise django will load every column into memory # filter(form='F13') all_headers = Filing_Header.objects.all().order_by('filing_number').values('pk') line_count = 0 for header_pk in all_headers: line_count += 1 if line_count % 1000 == 0: print "Processined %s lines" % line_count pk = header_pk['pk'] header = Filing_Header.objects.get(pk=pk) filingnum = header.filing_number f1 = filing(filingnum, read_from_cache=True, write_to_cache=True) f1.download() form = header.form version = header.version #print "processing filingnum %s, form %s version %s" % (filingnum, form, version) line_dict = {} content_rows = f1.get_body_rows() total_lines = 0 for row in content_rows: # instead of parsing the line, just assume form type is the first arg. r_type = row[0].upper().strip() # sometimes there are blank lines within files--see 707076.fec if not r_type: continue total_lines += 1 # what type of line parser would be used here? lp = fp.get_line_parser(r_type) if lp: form = lp.form r_type = form #print "line parser: %s from %s" % (form, r_type) linedict = fp.parse_form_line(row, version) if form=='SchA': skeda_from_skedadict(linedict, filingnum, header) elif form=='SchB': skedb_from_skedbdict(linedict, filingnum, header) elif form=='SchE': skede_from_skededict(linedict, filingnum, header) # Treat 48-hour contribution notices like sked A. # Requires special handling for amendment, since these are superceded # by regular F3 forms. elif form=='F65': skeda_from_f65(linedict, filingnum, header) # disclosed donor to non-commmittee. Sorta rare, but.. elif form=='F56': skeda_from_f56(linedict, filingnum, header) # disclosed electioneering donor elif form=='F92': skeda_from_f92(linedict, filingnum, header) # inaugural donors elif form=='F132': skeda_from_f132(linedict, filingnum, header) #inaugural refunds elif form=='F133': skeda_from_f133(linedict, filingnum, header) # IE's disclosed by non-committees. 
Note that they use this for * both * quarterly and 24- hour notices. There's not much consistency with this--be careful with superceding stuff. elif form=='F57': skede_from_f57(linedict, filingnum, header) # Its another kind of line. Just dump it in Other lines. else: otherline_from_line(linedict, filingnum, header, formname=form) else: print "Missing parser from %s" % (r_type) try: num = line_dict[r_type] line_dict[r_type] = num + 1 except KeyError: line_dict[r_type] = 1 #print "Found total lines = %s with dict=%s" % (total_lines, line_dict) header.lines_present = line_dict header.save()
def process_new_filing(thisnewfiling, fp=None, filing_time=None, filing_time_is_exact=False): """ Enter the file header if needed. """ if not fp: fp = form_parser() #print "Processing filing %s" % (filingnum) f1 = filing(thisnewfiling.filing_number) if f1.get_error(): return False form = f1.get_form_type() version = f1.get_version() ## leave the form if it's already been entered-- that's where it says if it is terminated. if not thisnewfiling.form_type: thisnewfiling.form_type = form # check if it's an amendment based on form types -- if so, mark it. Otherwise the F1's will look like they haven't been amended. try: if thisnewfiling.form_type[-1].upper() == 'A': thisnewfiling.is_amendment = True except IndexError: pass # only parse forms that we're set up to read if not fp.is_allowed_form(form): if verbose: print "Not a parseable form: %s - %s" % (form, thisnewfiling.filing_number) if thisnewfiling.is_amendment: thisnewfiling.save() return True header = f1.get_first_row() header_line = fp.parse_form_line(header, version) amended_filing=None if f1.is_amendment: amended_filing = f1.headers['filing_amended'] from_date = None through_date = None #print "header line is: %s " % header_line try: # dateparse('') will give today, oddly if header_line['coverage_from_date']: from_date = dateparse(header_line['coverage_from_date']) if from_date: thisnewfiling.cycle = get_cycle_from_date(from_date) except KeyError: print "problem with coverage_from_date" pass try: if header_line['coverage_through_date']: through_date = dateparse(header_line['coverage_through_date']) if through_date: thisnewfiling.cycle = get_cycle_from_date(through_date) except KeyError: print "coverage_through_date" pass # Create the filing -- but don't mark it as being complete. 
thisnewfiling.fec_id = f1.headers['fec_id'] thisnewfiling.coverage_from_date = from_date thisnewfiling.coverage_to_date = through_date thisnewfiling.is_amendment = f1.is_amendment thisnewfiling.amends_filing = amended_filing thisnewfiling.amendment_number = f1.headers['report_number'] or None thisnewfiling.header_data = header_line print thisnewfiling.__dict__ thisnewfiling.save() return True
def process_file(filingnum): #print "Processing filing %s" % (filingnum) f1 = filing(filingnum, read_from_cache=True, write_to_cache=True) f1.download() form = f1.get_form_type() version = f1.get_version() # only parse forms that we're set up to read if not fp.is_allowed_form(form): #if verbose: # print "Not a parseable form: %s - %s" % (form, filingnum) try: count = unprocessable_form_hash[form] unprocessable_form_hash[form] = count + 1 except KeyError: unprocessable_form_hash[form] = 1 return #if verbose: # print "Found parseable form: %s - %s" % (form, filingnum) header = f1.get_first_row() header_line = fp.parse_form_line(header, version) amended_filing=None if f1.is_amendment: amended_filing = f1.headers['filing_amended'] # enter it if we don't have it already: try: already_entered = Filing_Header.objects.get(filing_number=filingnum) print "Already entered! %s" % (filingnum) return 0 except Filing_Header.DoesNotExist: from_date = None through_date = None try: # dateparse('') will give today, oddly if header_line['coverage_from_date']: from_date = dateparse(header_line['coverage_from_date']) if header_line['coverage_through_date']: through_date = dateparse(header_line['coverage_through_date']) except KeyError: pass new_header_id = Filing_Header.objects.create( raw_filer_id=f1.headers['fec_id'], form=form, filing_number=filingnum, version=f1.version, coverage_from_date=from_date, coverage_through_date = through_date, is_amendment=f1.is_amendment, amends_filing=amended_filing, amendment_number = f1.headers['report_number'] or None, header_data=header_line) #print "Added header with id %s" % new_header_id """ body_rows = f1.get_body_rows() for row in body_rows: # the last line is empty, so don't try to parse it if len(row)>1: # Don't double check, just enter the data. parsed_line = fp.parse_form_line(row, version) parsed_line['filing_number'] = int(filingnum) #if verbose: # print parsed_line new_line_id = filing_lines.insert(parsed_line) """ return 1
import re from formdata.utils.form_mappers import * from parsing.form_parser import form_parser, ParserMissingError from parsing.filing import filing from parsing.read_FEC_settings import FILECACHE_DIRECTORY from formdata.models import Filing_Header # load up a form parser fp = form_parser() filing_num = 708753 f1 = filing(filingnum, read_from_cache=True, write_to_cache=True) a = re.compile(r'SA*', re.I) rows = f1.get_rows(a) # parse a row parsed_row = fp.parse_form_line(rows[0], version) print parsed_row # parsed_row = {'conduit_zip': '', 'back_reference_sched_name': '', 'donor_candidate_prefix': '', 'contribution_aggregate': '250.00', 'donor_committee_name': '', 'contributor_street_2': '', 'donor_candidate_suffix': '', 'contributor_organization_name': '', 'contributor_suffix': '', 'contributor_state': 'TX', 'donor_committee_fec_id': '', 'entity_type': 'IND', 'donor_candidate_state': '', 'donor_candidate_district': '', 'contributor_prefix': '', 'contributor_last_name': 'Acton', 'donor_candidate_middle_name': '', 'transaction_id': 'SA11AI.30102', 'contribution_date': '20101021', 'contributor_occupation': '', 'filer_committee_id_number': 'C00460808', 'donor_candidate_last_name': '', 'conduit_street2': '', 'conduit_street1': '', 'contributor_city': 'Dallas', 'donor_candidate_first_name': '', 'contribution_purpose_descrip': '', 'election_code': 'G2010', 'donor_candidate_office': '', 'memo_text_description': '', 'donor_candidate_fec_id': '', 'form_type': 'SA11AI', 'contributor_first_name': 'Robert', 'contribution_purpose_code': '', 'election_other_description': '', 'conduit_name': '', 'contribution_amount': '150.00', 'conduit_city': '', 'contributor_employer': '', 'back_reference_tran_id_number': '', 'contributor_street_1': '6407 Meadow Road', 'conduit_state': '', 'reference_code': '', 'memo_code': '', 'contributor_zip': '752305142', 'contributor_middle_name': ''} # can we save it? from formdata.utils.form_mappers import * from formdata.models import Filing_Header
def fix_dissemination_date(this_filing, fp): ## we gotta parse the rows again. print "handling %s line_count=%s" % (this_filing.filing_number, this_filing.lines_present) f1 = filing(this_filing.filing_number) form = f1.get_form_type() version = f1.get_version() filer_id = f1.get_filer_id() # This is being written when the current version is 8.1--the only version to include dissemination date. if not version == '8.1': return None linenum = 0 # run through all the lines: while True: linenum += 1 row = f1.get_body_row() if not row: break linedict = None try: linedict = fp.parse_form_line(row, version) except ParserMissingError: msg = 'process_filing_body: Unknown line type in filing %s line %s: type=%s Skipping.' % ( this_filing.filing_number, linenum, row[0]) # ignore everything but sked E's -- note that sked F57 *does not* have this issue. if linedict['form_parser'] == 'SchE': dissemination_date = linedict['dissemination_date'] expenditure_date = linedict['expenditure_date'] transaction_id = linedict['transaction_id'] print "filingnum=%s dissemination_date=%s expenditure_date=%s transaction_id=%s" % ( this_filing.filing_number, dissemination_date, expenditure_date, transaction_id) # then fix the original date in the db. 
try: original_line = SkedE.objects.get( filing_number=this_filing.filing_number, transaction_id=transaction_id) if dissemination_date: original_line.dissemination_date = dissemination_date try: original_line.dissemination_date_formatted = dateparse( dissemination_date) original_line.effective_date = original_line.dissemination_date_formatted except ValueError: pass else: original_line.dissemination_date_formatted = None if expenditure_date: original_line.expenditure_date = expenditure_date try: original_line.expenditure_date_formatted = dateparse( expenditure_date) if not original_line.dissemination_date: original_line.effective_date = original_line.expenditure_date_formatted except ValueError: pass else: original_line.expenditure_date_formatted = None if not expenditure_date and not dissemination_date: original_line.effective_date = None if alter_db: original_line.save() except SkedE.DoesNotExist: print "Couldn't find filing%s transaction %s" % ( this_filing.filing_number, transaction_id)
import re from formdata.utils.form_mappers import * from parsing.form_parser import form_parser, ParserMissingError from parsing.filing import filing from parsing.read_FEC_settings import FILECACHE_DIRECTORY from formdata.models import Filing_Header # load up a form parser fp = form_parser() filing_num = 708753 f1 = filing(filingnum, read_from_cache=True, write_to_cache=True) a = re.compile(r'SA*', re.I) rows = f1.get_rows(a) # parse a row parsed_row = fp.parse_form_line(rows[0], version) print parsed_row # parsed_row = {'conduit_zip': '', 'back_reference_sched_name': '', 'donor_candidate_prefix': '', 'contribution_aggregate': '250.00', 'donor_committee_name': '', 'contributor_street_2': '', 'donor_candidate_suffix': '', 'contributor_organization_name': '', 'contributor_suffix': '', 'contributor_state': 'TX', 'donor_committee_fec_id': '', 'entity_type': 'IND', 'donor_candidate_state': '', 'donor_candidate_district': '', 'contributor_prefix': '', 'contributor_last_name': 'Acton', 'donor_candidate_middle_name': '', 'transaction_id': 'SA11AI.30102', 'contribution_date': '20101021', 'contributor_occupation': '', 'filer_committee_id_number': 'C00460808', 'donor_candidate_last_name': '', 'conduit_street2': '', 'conduit_street1': '', 'contributor_city': 'Dallas', 'donor_candidate_first_name': '', 'contribution_purpose_descrip': '', 'election_code': 'G2010', 'donor_candidate_office': '', 'memo_text_description': '', 'donor_candidate_fec_id': '', 'form_type': 'SA11AI', 'contributor_first_name': 'Robert', 'contribution_purpose_code': '', 'election_other_description': '', 'conduit_name': '', 'contribution_amount': '150.00', 'conduit_city': '', 'contributor_employer': '', 'back_reference_tran_id_number': '', 'contributor_street_1': '6407 Meadow Road', 'conduit_state': '', 'reference_code': '', 'memo_code': '', 'contributor_zip': '752305142', 'contributor_middle_name': ''} # can we save it? 
from formdata.utils.form_mappers import * from formdata.models import Filing_Header header = Filing_Header.objects.get(filing_number=708753)
def fix_dissemination_date(this_filing, fp): ## we gotta parse the rows again. print "handling %s line_count=%s" % (this_filing.filing_number, this_filing.lines_present) f1 = filing(this_filing.filing_number) form = f1.get_form_type() version = f1.get_version() filer_id = f1.get_filer_id() # This is being written when the current version is 8.1--the only version to include dissemination date. if not version == '8.1': return None linenum = 0 # run through all the lines: while True: linenum += 1 row = f1.get_body_row() if not row: break linedict = None try: linedict = fp.parse_form_line(row, version) except ParserMissingError: msg = 'process_filing_body: Unknown line type in filing %s line %s: type=%s Skipping.' % (this_filing.filing_number, linenum, row[0]) # ignore everything but sked E's -- note that sked F57 *does not* have this issue. if linedict['form_parser'] == 'SchE': dissemination_date = linedict['dissemination_date'] expenditure_date = linedict['expenditure_date'] transaction_id = linedict['transaction_id'] print "filingnum=%s dissemination_date=%s expenditure_date=%s transaction_id=%s" % (this_filing.filing_number, dissemination_date, expenditure_date, transaction_id) # then fix the original date in the db. 
try: original_line = SkedE.objects.get(filing_number=this_filing.filing_number, transaction_id=transaction_id) if dissemination_date: original_line.dissemination_date = dissemination_date try: original_line.dissemination_date_formatted = dateparse(dissemination_date) original_line.effective_date = original_line.dissemination_date_formatted except ValueError: pass else: original_line.dissemination_date_formatted = None if expenditure_date: original_line.expenditure_date = expenditure_date try: original_line.expenditure_date_formatted = dateparse(expenditure_date) if not original_line.dissemination_date: original_line.effective_date = original_line.expenditure_date_formatted except ValueError: pass else: original_line.expenditure_date_formatted = None if not expenditure_date and not dissemination_date: original_line.effective_date = None if alter_db: original_line.save() except SkedE.DoesNotExist: print "Couldn't find filing%s transaction %s" % (this_filing.filing_number, transaction_id)
def process_filing_body(filingnum, fp=None, logger=None): #It's useful to pass the form parser in when running in bulk so we don't have to keep creating new ones. if not fp: fp = form_parser() if not logger: logger=fec_logger() msg = "process_filing_body: Starting # %s" % (filingnum) #print msg logger.info(msg) connection = get_connection() cursor = connection.cursor() cmd = "select fec_id, superseded_by_amendment, data_is_processed from efilings_filing where filing_number=%s" % (filingnum) cursor.execute(cmd) cd = CSV_dumper(connection) result = cursor.fetchone() if not result: msg = 'process_filing_body: Couldn\'t find a new_filing for filing %s' % (filingnum) logger.error(msg) raise FilingHeaderDoesNotExist(msg) # will throw a TypeError if it's missing. line_sequence = 1 is_amended = result[1] is_already_processed = result[2] if is_already_processed == "1": msg = 'process_filing_body: This filing has already been entered.' print msg logger.error(msg) raise FilingHeaderAlreadyProcessed(msg) #print "Processing filing %s" % (filingnum) f1 = filing(filingnum) form = f1.get_form_type() version = f1.get_version() filer_id = f1.get_filer_id() # only parse forms that we're set up to read if not fp.is_allowed_form(form): if verbose: msg = "process_filing_body: Not a parseable form: %s - %s" % (form, filingnum) print msg logger.info(msg) return None linenum = 0 while True: linenum += 1 row = f1.get_body_row() if not row: break #print "row is %s" % (row) #print "\n\n\nForm is %s" % form try: linedict = fp.parse_form_line(row, version) #print "\n\n\nform is %s" % form process_body_row(linedict, filingnum, line_sequence, is_amended, cd, filer_id) except ParserMissingError: msg = 'process_filing_body: Unknown line type in filing %s line %s: type=%s Skipping.' 
% (filingnum, linenum, row[0]) print msg logger.warn(msg) continue # commit all the leftovers cd.commit_all() cd.close() counter = cd.get_counter() total_rows = 0 for i in counter: total_rows += counter[i] msg = "process_filing_body: Filing # %s Total rows: %s Tally is: %s" % (filingnum, total_rows, counter) print msg logger.info(msg) ######## DIRECT DB UPDATES. PROBABLY A BETTER APPROACH, BUT... header_data = dict_to_hstore(counter) cmd = "update efilings_filing set lines_present='%s'::hstore where filing_number=%s" % (header_data, filingnum) cursor.execute(cmd) # mark file as having been entered. cmd = "update efilings_filing set data_is_processed='1' where filing_number=%s" % (filingnum) cursor.execute(cmd) # flag this filer as one who has changed. cmd = "update efilings_committee set is_dirty=True where fec_id='%s' and cycle='%s'" % (filer_id, CURRENT_CYCLE) cursor.execute(cmd) # should also update the candidate is dirty flag too by joining w/ ccl table. # these tables aren't indexed, so do as two separate queries. cmd = "select cand_id from ftpdata_candcomlink where cmte_id = '%s' and cmte_dsgn in ('A', 'P')" % (filer_id) cursor.execute(cmd) result = cursor.fetchone() if result: cand_id = result[0] cmd = "update efilings_candidate set is_dirty=True where fec_id = '%s' and cycle='%s'" % (cand_id, CURRENT_CYCLE) cursor.execute(cmd) connection.close()
def handle(self, *args, **options): # just get the ids--otherwise django will load every column into memory # filter(form='F13') all_headers = Filing_Header.objects.all().order_by( 'filing_number').values('pk') line_count = 0 for header_pk in all_headers: line_count += 1 if line_count % 1000 == 0: print "Processined %s lines" % line_count pk = header_pk['pk'] header = Filing_Header.objects.get(pk=pk) filingnum = header.filing_number f1 = filing(filingnum, read_from_cache=True, write_to_cache=True) f1.download() form = header.form version = header.version #print "processing filingnum %s, form %s version %s" % (filingnum, form, version) line_dict = {} content_rows = f1.get_body_rows() total_lines = 0 for row in content_rows: # instead of parsing the line, just assume form type is the first arg. r_type = row[0].upper().strip() # sometimes there are blank lines within files--see 707076.fec if not r_type: continue total_lines += 1 # what type of line parser would be used here? lp = fp.get_line_parser(r_type) if lp: form = lp.form r_type = form #print "line parser: %s from %s" % (form, r_type) linedict = fp.parse_form_line(row, version) if form == 'SchA': skeda_from_skedadict(linedict, filingnum, header) elif form == 'SchB': skedb_from_skedbdict(linedict, filingnum, header) elif form == 'SchE': skede_from_skededict(linedict, filingnum, header) # Treat 48-hour contribution notices like sked A. # Requires special handling for amendment, since these are superceded # by regular F3 forms. elif form == 'F65': skeda_from_f65(linedict, filingnum, header) # disclosed donor to non-commmittee. Sorta rare, but.. elif form == 'F56': skeda_from_f56(linedict, filingnum, header) # disclosed electioneering donor elif form == 'F92': skeda_from_f92(linedict, filingnum, header) # inaugural donors elif form == 'F132': skeda_from_f132(linedict, filingnum, header) #inaugural refunds elif form == 'F133': skeda_from_f133(linedict, filingnum, header) # IE's disclosed by non-committees. 
Note that they use this for * both * quarterly and 24- hour notices. There's not much consistency with this--be careful with superceding stuff. elif form == 'F57': skede_from_f57(linedict, filingnum, header) # Its another kind of line. Just dump it in Other lines. else: otherline_from_line(linedict, filingnum, header, formname=form) else: print "Missing parser from %s" % (r_type) try: num = line_dict[r_type] line_dict[r_type] = num + 1 except KeyError: line_dict[r_type] = 1 #print "Found total lines = %s with dict=%s" % (total_lines, line_dict) header.lines_present = line_dict header.save()
connection = get_connection() cursor = connection.cursor() logger = fec_logger() # Process all .fec files in the FILECACHE_DIRECTORY for d, _, files in os.walk(FILECACHE_DIRECTORY): for this_file in files: # Ignore it if it isn't a numeric fec file, e.g. \d+\.fec if not fec_format_file.match(this_file): continue filingnum = this_file.replace(".fec", "") cd = CSV_dumper(connection) f1 = filing(filingnum) formtype = f1.get_form_type() version = f1.version filer_id = f1.get_filer_id() print "Processing form number %s - type=%s version=%s is_amended: %s" % ( f1.filing_number, formtype, version, f1.is_amendment) print "Headers are: %s" % f1.headers if f1.is_amendment: print "Original filing is: %s" % (f1.headers['filing_amended']) if not fp.is_allowed_form(formtype): print "skipping form %s - %s isn't parseable" % (f1.filing_number, formtype) continue
def process_filing_body(filingnum, fp=None, logger=None): # It's useful to pass the form parser in when running in bulk so we don't have to keep creating new ones. if not fp: fp = form_parser() if not logger: logger = fec_logger() msg = "process_filing_body: Starting # %s" % (filingnum) # print msg logger.info(msg) connection = get_connection() cursor = connection.cursor() cmd = "select fec_id, superseded_by_amendment, data_is_processed from efilings_filing where filing_number=%s" % ( filingnum ) cursor.execute(cmd) cd = CSV_dumper(connection) result = cursor.fetchone() if not result: msg = "process_filing_body: Couldn't find a new_filing for filing %s" % (filingnum) logger.error(msg) raise FilingHeaderDoesNotExist(msg) # will throw a TypeError if it's missing. line_sequence = 1 is_amended = result[1] is_already_processed = result[2] if is_already_processed == "1": msg = "process_filing_body: This filing has already been entered." print msg logger.error(msg) raise FilingHeaderAlreadyProcessed(msg) # print "Processing filing %s" % (filingnum) f1 = filing(filingnum) form = f1.get_form_type() version = f1.get_version() filer_id = f1.get_filer_id() # only parse forms that we're set up to read if not fp.is_allowed_form(form): if verbose: msg = "process_filing_body: Not a parseable form: %s - %s" % (form, filingnum) print msg logger.info(msg) return None linenum = 0 while True: linenum += 1 row = f1.get_body_row() if not row: break # print "row is %s" % (row) # print "\n\n\nForm is %s" % form try: linedict = fp.parse_form_line(row, version) # print "\n\n\nform is %s" % form process_body_row(linedict, filingnum, line_sequence, is_amended, cd, filer_id) except ParserMissingError: msg = "process_filing_body: Unknown line type in filing %s line %s: type=%s Skipping." 
% ( filingnum, linenum, row[0], ) print msg logger.warn(msg) continue # commit all the leftovers cd.commit_all() cd.close() counter = cd.get_counter() total_rows = 0 for i in counter: total_rows += counter[i] msg = "process_filing_body: Filing # %s Total rows: %s Tally is: %s" % (filingnum, total_rows, counter) print msg logger.info(msg) ######## DIRECT DB UPDATES. PROBABLY A BETTER APPROACH, BUT... header_data = dict_to_hstore(counter) cmd = "update efilings_filing set lines_present='%s'::hstore where filing_number=%s" % (header_data, filingnum) cursor.execute(cmd) # mark file as having been entered. cmd = "update efilings_filing set data_is_processed='1' where filing_number=%s" % (filingnum) cursor.execute(cmd) # flag this filer as one who has changed. cmd = "update efilings_committee set is_dirty=True where fec_id='%s' and cycle='%s'" % (filer_id, CURRENT_CYCLE) cursor.execute(cmd) # should also update the candidate is dirty flag too by joining w/ ccl table. # these tables aren't indexed, so do as two separate queries. cmd = "select cand_id from ftpdata_candcomlink where cmte_id = '%s' and cmte_dsgn in ('A', 'P')" % (filer_id) cursor.execute(cmd) result = cursor.fetchone() if result: cand_id = result[0] cmd = "update efilings_candidate set is_dirty=True where fec_id = '%s' and cycle='%s'" % ( cand_id, CURRENT_CYCLE, ) cursor.execute(cmd) connection.close()
cursor = connection.cursor() logger = fec_logger() # Process all .fec files in the FILECACHE_DIRECTORY for d, _, files in os.walk(FILECACHE_DIRECTORY): for this_file in files: # Ignore it if it isn't a numeric fec file, e.g. \d+\.fec if not fec_format_file.match(this_file): continue filingnum = this_file.replace(".fec", "") cd = CSV_dumper(connection) f1 = filing(filingnum) formtype = f1.get_form_type() version = f1.version filer_id = f1.get_filer_id() print "Processing form number %s - type=%s version=%s is_amended: %s" % ( f1.filing_number, formtype, version, f1.is_amendment, ) print "Headers are: %s" % f1.headers if f1.is_amendment: print "Original filing is: %s" % (f1.headers["filing_amended"])
def process_filing_body(filingnum, fp=None, logger=None): #It's useful to pass the form parser in when running in bulk so we don't have to keep creating new ones. if not fp: fp = form_parser() if not logger: logger=fec_logger() msg = "process_filing_body: Starting # %s" % (filingnum) #print msg logger.info(msg) connection = get_connection() cursor = connection.cursor() cmd = "select fec_id, is_superceded, data_is_processed from fec_alerts_new_filing where filing_number=%s" % (filingnum) cursor.execute(cmd) cd = CSV_dumper(connection) result = cursor.fetchone() if not result: msg = 'process_filing_body: Couldn\'t find a new_filing for filing %s' % (filingnum) logger.error(msg) raise FilingHeaderDoesNotExist(msg) # will throw a TypeError if it's missing. header_id = 1 is_amended = result[1] is_already_processed = result[2] if is_already_processed: msg = 'process_filing_body: This filing has already been entered.' logger.error(msg) raise FilingHeaderAlreadyProcessed(msg) #print "Processing filing %s" % (filingnum) try: f1 = filing(filingnum) except: print "*** couldn't handle filing %s" % (filingnum) return False form = f1.get_form_type() version = f1.get_version() filer_id = f1.get_filer_id() # only parse forms that we're set up to read if not fp.is_allowed_form(form): if verbose: msg = "process_filing_body: Not a parseable form: %s - %s" % (form, filingnum) # print msg logger.error(msg) return None linenum = 0 while True: linenum += 1 row = f1.get_body_row() if not row: break #print "row is %s" % (row) #print "\n\n\nForm is %s" % form try: linedict = fp.parse_form_line(row, version) if linedict['form_type'].upper().startswith('SE'): print "\n\n\nfiling %s form is %s transaction_id is: %s" % (filingnum, linedict['form_type'], linedict['transaction_id']) # make sure the transaction isn't already there before entering. 
try: SkedE.objects.get(filing_number=filingnum, transaction_id=linedict['transaction_id']) except SkedE.DoesNotExist: process_body_row(linedict, filingnum, header_id, is_amended, cd, filer_id) elif linedict['form_type'].upper().startswith('SA'): print "\n\n\nfiling %s form is %s transaction_id is: %s" % (filingnum, linedict['form_type'], linedict['transaction_id']) # make sure the transaction isn't already there before entering. try: SkedA.objects.get(filing_number=filingnum, transaction_id=linedict['transaction_id']) print "Already present! %s form is %s transaction_id is: %s" % (filingnum, linedict['form_type'], linedict['transaction_id']) except SkedA.DoesNotExist: process_body_row(linedict, filingnum, header_id, is_amended, cd, filer_id) elif linedict['form_type'].upper().startswith('SB'): print "\n\n\nfiling %s form is %s transaction_id is: %s" % (filingnum, linedict['form_type'], linedict['transaction_id']) # make sure the transaction isn't already there before entering. try: SkedB.objects.get(filing_number=filingnum, transaction_id=linedict['transaction_id']) print "Already present! %s form is %s transaction_id is: %s" % (filingnum, linedict['form_type'], linedict['transaction_id']) except SkedB.DoesNotExist: process_body_row(linedict, filingnum, header_id, is_amended, cd, filer_id) except ParserMissingError: msg = 'process_filing_body: Unknown line type in filing %s line %s: type=%s Skipping.' % (filingnum, linenum, row[0]) logger.warn(msg) continue except KeyError: "missing form type? in filing %s" % (filingnum) # commit all the leftovers cd.commit_all() cd.close() counter = cd.get_counter() total_rows = 0 for i in counter: total_rows += counter[i] msg = "process_filing_body: Filing # %s Total rows: %s Tally is: %s" % (filingnum, total_rows, counter) # print msg logger.info(msg) # don't commit during testing of fix # this data has been moved here. At some point we should pick a single location for this data. 
header_data = dict_to_hstore(counter) cmd = "update fec_alerts_new_filing set lines_present='%s'::hstore where filing_number=%s" % (header_data, filingnum) cursor.execute(cmd) # mark file as having been entered. cmd = "update fec_alerts_new_filing set data_is_processed = True where filing_number=%s" % (filingnum) cursor.execute(cmd) # flag this filer as one who has changed. cmd = "update summary_data_committee_overlay set is_dirty=True where fec_id='%s'" % (filer_id) cursor.execute(cmd)
def process_filing_body(filingnum, fp=None, logger=None):
    """Parse the body rows of electronic filing `filingnum` and load them.

    Looks up the filing's status row in fec_alerts_new_filing, streams every
    body line through the form parser, enters each parsed row with
    process_body_row (no per-transaction dedupe in this variant), then writes
    the per-line-type tally and the processed / dirty flags to the database.

    Args:
        filingnum: FEC filing number (numeric id of the .fec file).
        fp: optional form_parser. It's useful to pass the form parser in when
            running in bulk so we don't have to keep creating new ones.
        logger: optional fec_logger; one is created if not supplied.

    Returns:
        None.

    Raises:
        FilingHeaderDoesNotExist: no fec_alerts_new_filing row was found.
        FilingHeaderAlreadyProcessed: the filing was already entered.
    """
    if not fp:
        fp = form_parser()
    if not logger:
        logger = fec_logger()
    msg = "process_filing_body: Starting # %s" % (filingnum)
    logger.info(msg)

    connection = get_connection()
    cursor = connection.cursor()
    # Use DB-API parameter passing rather than splicing the filing number
    # into the SQL text -- it originates outside this process.
    cmd = "select fec_id, is_superceded, data_is_processed from fec_alerts_new_filing where filing_number=%s"
    cursor.execute(cmd, (filingnum,))
    cd = CSV_dumper(connection)

    result = cursor.fetchone()
    if not result:
        msg = 'process_filing_body: Couldn\'t find a new_filing for filing %s' % (filingnum)
        logger.error(msg)
        raise FilingHeaderDoesNotExist(msg)

    # header_id is hardcoded; downstream process_body_row still receives it.
    header_id = 1
    is_amended = result[1]
    is_already_processed = result[2]
    if is_already_processed:
        msg = 'process_filing_body: This filing has already been entered.'
        logger.error(msg)
        raise FilingHeaderAlreadyProcessed(msg)

    f1 = filing(filingnum)
    form = f1.get_form_type()
    version = f1.get_version()
    filer_id = f1.get_filer_id()

    # only parse forms that we're set up to read
    if not fp.is_allowed_form(form):
        # NOTE(review): `verbose` is not defined in this function -- presumably
        # a module-level flag; confirm it exists at import time.
        if verbose:
            msg = "process_filing_body: Not a parseable form: %s - %s" % (form, filingnum)
            logger.error(msg)
        return None

    linenum = 0
    while True:
        linenum += 1
        row = f1.get_body_row()
        if not row:
            break

        try:
            linedict = fp.parse_form_line(row, version)
            process_body_row(linedict, filingnum, header_id, is_amended, cd, filer_id)
        except ParserMissingError:
            # Unknown line types are logged and skipped, not fatal.
            msg = 'process_filing_body: Unknown line type in filing %s line %s: type=%s Skipping.' % (filingnum, linenum, row[0])
            logger.warn(msg)
            continue

    # commit all the leftovers
    cd.commit_all()
    cd.close()
    counter = cd.get_counter()
    total_rows = 0
    for i in counter:
        total_rows += counter[i]

    msg = "process_filing_body: Filing # %s Total rows: %s Tally is: %s" % (filingnum, total_rows, counter)
    logger.info(msg)

    # this data has been moved here. At some point we should pick a single location for this data.
    header_data = dict_to_hstore(counter)
    cursor.execute("update fec_alerts_new_filing set lines_present=%s::hstore where filing_number=%s", (header_data, filingnum))

    # mark file as having been entered.
    cursor.execute("update fec_alerts_new_filing set data_is_processed = True where filing_number=%s", (filingnum,))

    # flag this filer as one who has changed.
    cursor.execute("update summary_data_committee_overlay set is_dirty=True where fec_id=%s", (filer_id,))