def __init__(self, queue, id, starttime):
    super(Worker, self).__init__()
    self.queue = queue
    self.id = id
    self.starttime = starttime
    # give each worker its own fp -- does it need it?
    self.fp = form_parser()
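A minimal sketch of how such a worker might drain its queue, assuming Worker subclasses multiprocessing.Process and the queue carries filing numbers; the run() method and the None sentinel below are assumptions, not part of the original source.

def run(self):
    while True:
        filingnum = self.queue.get()
        if filingnum is None:
            # hypothetical sentinel meaning "no more work"
            break
        # reuse this worker's own form parser rather than building one per filing
        process_filing_body(filingnum, fp=self.fp)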
def handle(self, *args, **options):
    # get all the fec filings with IEs in them:
    all_ie_filings = new_filing.objects.filter(
        tot_ies__gt=0,
        filed_date__lt=date(2014, 1, 14),
    )
    fp = form_parser()
    for f in all_ie_filings:
        fix_dissemination_date(f, fp)
def handle(self, *args, **options):
    fp = form_parser()
    filings_to_queue = Filing.objects.filter(
        filing_is_downloaded="1",
        header_is_processed="1",
        previous_amendments_processed="1",
        data_is_processed="0",
    ).order_by('filing_number').exclude(form_type__in=excluded_filings_list)

    for filing in filings_to_queue:
        msg = "send_body_row_jobs: Adding filing %s to entry queue" % (filing.filing_number)
        print msg
        logger.info(msg)
        ######### don't actually do this:
        # process_filing_body_celery.apply_async([filing.filing_number], queue='slow', routing_key="slow")
        # Passing in the fp means we don't have to create a new one each time.
        # Giving it the logger will record output in django logs.
        process_filing_body(filing.filing_number, fp, logger)
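If the commented-out celery path were re-enabled, the task wrapper might look like this sketch; the @shared_task definition below is an assumption (only the apply_async call with queue/routing_key appears in the original), and the parser is rebuilt inside the task because it can't be shared across worker processes.

from celery import shared_task

@shared_task
def process_filing_body_celery(filing_number):
    # hypothetical task body: build a fresh parser in the worker process
    fp = form_parser()
    process_filing_body(filing_number, fp, logger)

Dispatch would then be the commented line above: process_filing_body_celery.apply_async([filing.filing_number], queue='slow', routing_key='slow').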
def process_filing_body(filingnum, fp=None, logger=None):
    # It's useful to pass the form parser in when running in bulk so we
    # don't have to keep creating new ones.
    if not fp:
        fp = form_parser()
    if not logger:
        logger = fec_logger()
    msg = "process_filing_body: Starting # %s" % (filingnum)
    logger.info(msg)

    connection = get_connection()
    cursor = connection.cursor()
    cmd = "select fec_id, is_superceded, data_is_processed from fec_alerts_new_filing where filing_number=%s" % (filingnum)
    cursor.execute(cmd)

    cd = CSV_dumper(connection)

    result = cursor.fetchone()
    if not result:
        msg = "process_filing_body: Couldn't find a new_filing for filing %s" % (filingnum)
        logger.error(msg)
        raise FilingHeaderDoesNotExist(msg)

    # will throw a TypeError if it's missing.
    header_id = 1
    is_amended = result[1]
    is_already_processed = result[2]
    if is_already_processed:
        msg = "process_filing_body: This filing has already been entered."
        logger.error(msg)
        raise FilingHeaderAlreadyProcessed(msg)

    try:
        f1 = filing(filingnum)
    except:
        print "*** couldn't handle filing %s" % (filingnum)
        return False

    form = f1.get_form_type()
    version = f1.get_version()
    filer_id = f1.get_filer_id()

    # only parse forms that we're set up to read
    if not fp.is_allowed_form(form):
        if verbose:
            msg = "process_filing_body: Not a parseable form: %s - %s" % (form, filingnum)
            logger.error(msg)
        return None

    linenum = 0
    while True:
        linenum += 1
        row = f1.get_body_row()
        if not row:
            break

        try:
            linedict = fp.parse_form_line(row, version)
            if linedict['form_type'].upper().startswith('SE'):
                print "\n\n\nfiling %s form is %s transaction_id is: %s" % (filingnum, linedict['form_type'], linedict['transaction_id'])
                # make sure the transaction isn't already there before entering.
                try:
                    SkedE.objects.get(filing_number=filingnum, transaction_id=linedict['transaction_id'])
                except SkedE.DoesNotExist:
                    process_body_row(linedict, filingnum, header_id, is_amended, cd, filer_id)
            elif linedict['form_type'].upper().startswith('SA'):
                print "\n\n\nfiling %s form is %s transaction_id is: %s" % (filingnum, linedict['form_type'], linedict['transaction_id'])
                # make sure the transaction isn't already there before entering.
                try:
                    SkedA.objects.get(filing_number=filingnum, transaction_id=linedict['transaction_id'])
                    print "Already present! %s form is %s transaction_id is: %s" % (filingnum, linedict['form_type'], linedict['transaction_id'])
                except SkedA.DoesNotExist:
                    process_body_row(linedict, filingnum, header_id, is_amended, cd, filer_id)
            elif linedict['form_type'].upper().startswith('SB'):
                print "\n\n\nfiling %s form is %s transaction_id is: %s" % (filingnum, linedict['form_type'], linedict['transaction_id'])
                # make sure the transaction isn't already there before entering.
                try:
                    SkedB.objects.get(filing_number=filingnum, transaction_id=linedict['transaction_id'])
                    print "Already present! %s form is %s transaction_id is: %s" % (filingnum, linedict['form_type'], linedict['transaction_id'])
                except SkedB.DoesNotExist:
                    process_body_row(linedict, filingnum, header_id, is_amended, cd, filer_id)
        except ParserMissingError:
            msg = "process_filing_body: Unknown line type in filing %s line %s: type=%s Skipping." % (filingnum, linenum, row[0])
            logger.warn(msg)
            continue
        except KeyError:
            msg = "process_filing_body: Missing form type in filing %s" % (filingnum)
            logger.warn(msg)
            continue

    # commit all the leftovers
    cd.commit_all()
    cd.close()
    counter = cd.get_counter()
    total_rows = 0
    for i in counter:
        total_rows += counter[i]

    msg = "process_filing_body: Filing # %s Total rows: %s Tally is: %s" % (filingnum, total_rows, counter)
    logger.info(msg)

    # don't commit during testing of fix
    # this data has been moved here. At some point we should pick a single location for this data.
    header_data = dict_to_hstore(counter)
    cmd = "update fec_alerts_new_filing set lines_present='%s'::hstore where filing_number=%s" % (header_data, filingnum)
    cursor.execute(cmd)

    # mark file as having been entered.
    cmd = "update fec_alerts_new_filing set data_is_processed = True where filing_number=%s" % (filingnum)
    cursor.execute(cmd)

    # flag this filer as one who has changed.
    cmd = "update summary_data_committee_overlay set is_dirty=True where fec_id='%s'" % (filer_id)
    cursor.execute(cmd)
if __name__ == '__main__':
    # filings = new_filing.objects.filter(filing_number__gt=1007393, data_is_processed=False, filing_is_downloaded=True, header_is_processed=True)
    fp = form_parser()
    filings = [1010304, ]
    for this_filing in filings:
        process_filing_body(this_filing, fp=fp)

    """
    t0 = time.time()
    process_filing_body(864353)
    # 869853, 869866
    # for fn in [869888]:
    #     process_filing_body(fn, fp)
    t1 = time.time()
    print "total time = " + str(t1 - t0)
    """
from django.core.management.base import BaseCommand, CommandError
from parsing.form_parser import form_parser, ParserMissingError
from parsing.filing import filing
from parsing.read_FEC_settings import FILECACHE_DIRECTORY
from fec_alerts.models import new_filing
# from formdata.models import Filing_Header
from fec_alerts.utils.filing_processors import process_new_filing

# load up a form parser
fp = form_parser()


class Command(BaseCommand):
    help = "Enter file headers; don't mark them as either amended or not."
    requires_model_validation = False

    def handle(self, *args, **options):
        downloaded_filings = new_filing.objects.filter(
            filing_is_downloaded=True,
            header_is_processed=False,
        ).order_by('filing_number')

        for filing in downloaded_filings:
            print "Entering filing %s, entry_time %s" % (filing.filing_number, filing.process_time)
            result_header = None
            try:
                result_header = process_new_filing(filing, fp=fp, filing_time=filing.process_time, filing_time_is_exact=True)
            # The FEC now seems to return a page-not-found response instead of a
            # 500 error, so this except no longer appears to fire.
            except IOError:
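Because the FEC now appears to answer missing filings with a not-found page rather than something that raises IOError, a downloader could check the HTTP status up front instead. A sketch using urllib2; the URL pattern below is an assumption about where these .fec files live, not taken from the original source.

import urllib2

def filing_page_exists(filing_number):
    # hypothetical URL pattern for electronically filed reports
    url = "http://docquery.fec.gov/dcdev/posted/%s.fec" % (filing_number)
    try:
        return urllib2.urlopen(url).getcode() == 200
    except urllib2.HTTPError:
        return False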
def process_filing_body(filingnum, fp=None, logger=None):
    # It's useful to pass the form parser in when running in bulk so we
    # don't have to keep creating new ones.
    if not fp:
        fp = form_parser()
    if not logger:
        logger = fec_logger()
    msg = "process_filing_body: Starting # %s" % (filingnum)
    logger.info(msg)

    connection = get_connection()
    cursor = connection.cursor()
    cmd = "select fec_id, superseded_by_amendment, data_is_processed from efilings_filing where filing_number=%s" % (filingnum)
    cursor.execute(cmd)

    cd = CSV_dumper(connection)

    result = cursor.fetchone()
    if not result:
        msg = "process_filing_body: Couldn't find a new_filing for filing %s" % (filingnum)
        logger.error(msg)
        raise FilingHeaderDoesNotExist(msg)

    # will throw a TypeError if it's missing.
    line_sequence = 1
    is_amended = result[1]
    is_already_processed = result[2]
    if is_already_processed == "1":
        msg = "process_filing_body: This filing has already been entered."
        print msg
        logger.error(msg)
        raise FilingHeaderAlreadyProcessed(msg)

    f1 = filing(filingnum)
    form = f1.get_form_type()
    version = f1.get_version()
    filer_id = f1.get_filer_id()

    # only parse forms that we're set up to read
    if not fp.is_allowed_form(form):
        if verbose:
            msg = "process_filing_body: Not a parseable form: %s - %s" % (form, filingnum)
            print msg
            logger.info(msg)
        return None

    linenum = 0
    while True:
        linenum += 1
        row = f1.get_body_row()
        if not row:
            break

        try:
            linedict = fp.parse_form_line(row, version)
            process_body_row(linedict, filingnum, line_sequence, is_amended, cd, filer_id)
        except ParserMissingError:
            msg = "process_filing_body: Unknown line type in filing %s line %s: type=%s Skipping." % (filingnum, linenum, row[0])
            print msg
            logger.warn(msg)
            continue

    # commit all the leftovers
    cd.commit_all()
    cd.close()
    counter = cd.get_counter()
    total_rows = 0
    for i in counter:
        total_rows += counter[i]

    msg = "process_filing_body: Filing # %s Total rows: %s Tally is: %s" % (filingnum, total_rows, counter)
    print msg
    logger.info(msg)

    ######## DIRECT DB UPDATES. PROBABLY A BETTER APPROACH, BUT...
    header_data = dict_to_hstore(counter)
    cmd = "update efilings_filing set lines_present='%s'::hstore where filing_number=%s" % (header_data, filingnum)
    cursor.execute(cmd)

    # mark file as having been entered.
    cmd = "update efilings_filing set data_is_processed='1' where filing_number=%s" % (filingnum)
    cursor.execute(cmd)

    # flag this filer as one who has changed.
    cmd = "update efilings_committee set is_dirty=True where fec_id='%s' and cycle='%s'" % (filer_id, CURRENT_CYCLE)
    cursor.execute(cmd)

    # should also update the candidate's is_dirty flag by joining with the ccl table.
    # these tables aren't indexed, so do it as two separate queries.
    cmd = "select cand_id from ftpdata_candcomlink where cmte_id = '%s' and cmte_dsgn in ('A', 'P')" % (filer_id)
    cursor.execute(cmd)
    result = cursor.fetchone()
    if result:
        cand_id = result[0]
        cmd = "update efilings_candidate set is_dirty=True where fec_id = '%s' and cycle='%s'" % (cand_id, CURRENT_CYCLE)
        cursor.execute(cmd)

    connection.close()
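The interpolated SQL above is injection-prone and will break on quotes inside hstore values. A sketch of two of the same updates using DB-API parameter binding, assuming a driver such as psycopg2 that accepts %s placeholders with a parameter tuple:

cursor.execute(
    "update efilings_filing set data_is_processed='1' where filing_number=%s",
    (filingnum,),
)
cursor.execute(
    "update efilings_committee set is_dirty=True where fec_id=%s and cycle=%s",
    (filer_id, CURRENT_CYCLE),
)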
def process_new_filing(thisnewfiling, fp=None, filing_time=None, filing_time_is_exact=False):
    """ Enter the file header if needed. """
    if not fp:
        fp = form_parser()

    f1 = filing(thisnewfiling.filing_number)
    if f1.get_error():
        return False

    form = f1.get_form_type()
    version = f1.get_version()

    # leave the form type alone if it's already been entered -- that's where it
    # says if it is terminated.
    if not thisnewfiling.form_type:
        thisnewfiling.form_type = form

    # check if it's an amendment based on the form type -- if so, mark it.
    # Otherwise the F1's will look like they haven't been amended.
    try:
        if thisnewfiling.form_type[-1].upper() == 'A':
            thisnewfiling.is_amendment = True
    except IndexError:
        pass

    # only parse forms that we're set up to read
    if not fp.is_allowed_form(form):
        if verbose:
            print "Not a parseable form: %s - %s" % (form, thisnewfiling.filing_number)
        if thisnewfiling.is_amendment:
            thisnewfiling.save()
        return True

    header = f1.get_first_row()
    header_line = fp.parse_form_line(header, version)

    amended_filing = None
    if f1.is_amendment:
        amended_filing = f1.headers['filing_amended']

    from_date = None
    through_date = None
    try:
        # dateparse('') will give today, oddly
        if header_line['coverage_from_date']:
            from_date = dateparse(header_line['coverage_from_date'])
            if from_date:
                thisnewfiling.cycle = get_cycle_from_date(from_date)
    except KeyError:
        print "problem with coverage_from_date"

    try:
        if header_line['coverage_through_date']:
            through_date = dateparse(header_line['coverage_through_date'])
            if through_date:
                thisnewfiling.cycle = get_cycle_from_date(through_date)
    except KeyError:
        print "problem with coverage_through_date"

    # Populate the filing -- but don't mark it as being complete.
    thisnewfiling.fec_id = f1.headers['fec_id']
    thisnewfiling.coverage_from_date = from_date
    thisnewfiling.coverage_to_date = through_date
    thisnewfiling.is_amendment = f1.is_amendment
    thisnewfiling.amends_filing = amended_filing
    thisnewfiling.amendment_number = f1.headers['report_number'] or None
    thisnewfiling.header_data = header_line

    print thisnewfiling.__dict__

    thisnewfiling.save()
    return True
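get_cycle_from_date is imported from elsewhere; a plausible sketch, assuming a "cycle" here means the two-year FEC election cycle, which is named for the even year that closes it:

def get_cycle_from_date(date_obj):
    # odd years belong to the cycle that ends the following (even) year
    year = date_obj.year
    return year if year % 2 == 0 else year + 1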
def process_filing_body(filingnum, fp=None, logger=None):
    # It's useful to pass the form parser in when running in bulk so we
    # don't have to keep creating new ones.
    if not fp:
        fp = form_parser()
    if not logger:
        logger = fec_logger()
    msg = "process_filing_body: Starting # %s" % (filingnum)
    logger.info(msg)

    connection = get_connection()
    cursor = connection.cursor()
    cmd = "select fec_id, is_superceded, data_is_processed from fec_alerts_new_filing where filing_number=%s" % (filingnum)
    cursor.execute(cmd)

    cd = CSV_dumper(connection)

    result = cursor.fetchone()
    if not result:
        msg = "process_filing_body: Couldn't find a new_filing for filing %s" % (filingnum)
        logger.error(msg)
        raise FilingHeaderDoesNotExist(msg)

    # will throw a TypeError if it's missing.
    header_id = 1
    is_amended = result[1]
    is_already_processed = result[2]
    if is_already_processed:
        msg = "process_filing_body: This filing has already been entered."
        logger.error(msg)
        raise FilingHeaderAlreadyProcessed(msg)

    f1 = filing(filingnum)
    form = f1.get_form_type()
    version = f1.get_version()
    filer_id = f1.get_filer_id()

    # only parse forms that we're set up to read
    if not fp.is_allowed_form(form):
        if verbose:
            msg = "process_filing_body: Not a parseable form: %s - %s" % (form, filingnum)
            logger.error(msg)
        return None

    linenum = 0
    while True:
        linenum += 1
        row = f1.get_body_row()
        if not row:
            break

        try:
            linedict = fp.parse_form_line(row, version)
            process_body_row(linedict, filingnum, header_id, is_amended, cd, filer_id)
        except ParserMissingError:
            msg = "process_filing_body: Unknown line type in filing %s line %s: type=%s Skipping." % (filingnum, linenum, row[0])
            logger.warn(msg)
            continue

    # commit all the leftovers
    cd.commit_all()
    cd.close()
    counter = cd.get_counter()
    total_rows = 0
    for i in counter:
        total_rows += counter[i]

    msg = "process_filing_body: Filing # %s Total rows: %s Tally is: %s" % (filingnum, total_rows, counter)
    logger.info(msg)

    # this data has been moved here. At some point we should pick a single location for this data.
    header_data = dict_to_hstore(counter)
    cmd = "update fec_alerts_new_filing set lines_present='%s'::hstore where filing_number=%s" % (header_data, filingnum)
    cursor.execute(cmd)

    # mark file as having been entered.
    cmd = "update fec_alerts_new_filing set data_is_processed = True where filing_number=%s" % (filingnum)
    cursor.execute(cmd)

    # flag this filer as one who has changed.
    cmd = "update summary_data_committee_overlay set is_dirty=True where fec_id='%s'" % (filer_id)
    cursor.execute(cmd)