Пример #1
0
 def __init__(self, queue, id, starttime):
     """Keep the queue, id and start time on the worker and build its parser.

     Every worker instance creates its own ``form_parser`` rather than
     receiving a shared one.
     """
     super(Worker, self).__init__()
     self.queue, self.id, self.starttime = queue, id, starttime
     # TODO(review): confirm whether a per-worker parser is actually needed.
     self.fp = form_parser()
Пример #2
0
 def __init__(self, queue, id, starttime):
     """Store the queue, id and start time, then create this worker's parser.

     The parser is instantiated per worker instead of being passed in.
     """
     super(Worker, self).__init__()
     self.queue, self.id, self.starttime = queue, id, starttime
     # TODO(review): does each worker really need its own form parser?
     self.fp = form_parser()
Пример #3
0
 def handle(self, *args, **options):
     """Run ``fix_dissemination_date`` on each filing with IEs filed before 2014-01-14.

     A single form parser is created up front and passed to every call.
     """
     cutoff = date(2014, 1, 14)
     ie_filings = new_filing.objects.filter(tot_ies__gt=0,
                                            filed_date__lt=cutoff)
     parser = form_parser()
     for ie_filing in ie_filings:
         fix_dissemination_date(ie_filing, parser)
Пример #4
0
    def handle(self, *args, **options):
        """Process the body of each filing that is ready for data entry.

        Selects filings that are downloaded, header-processed and
        amendment-processed but whose data is not yet processed, skips the
        form types in ``excluded_filings_list``, and handles them in
        filing-number order.
        """
        # One parser is shared by every filing so it is not rebuilt per call.
        fp = form_parser()

        ready_flags = {
            'filing_is_downloaded': "1",
            'header_is_processed': "1",
            'previous_amendments_processed': "1",
            'data_is_processed': "0",
        }
        pending = Filing.objects.filter(**ready_flags)
        pending = pending.order_by('filing_number')
        pending = pending.exclude(form_type__in=excluded_filings_list)

        for pending_filing in pending:
            number = pending_filing.filing_number
            msg = "send_body_row_jobs: Adding filing %s to entry queue" % number
            print(msg)
            logger.info(msg)

            # The Celery dispatch is disabled; filings are processed inline.
            # Passing the logger routes the output into the Django logs.
            process_filing_body(number, fp, logger)
Пример #5
0
    def handle(self, *args, **options):
        """Enter body rows for every filing whose data is still unprocessed.

        Only filings flagged as downloaded, header-processed and
        amendment-processed are picked up; form types listed in
        ``excluded_filings_list`` are left out. Filings are handled in
        ascending ``filing_number`` order.
        """
        # Build the parser once and hand it to each call below.
        fp = form_parser()

        criteria = dict(
            filing_is_downloaded="1",
            header_is_processed="1",
            previous_amendments_processed="1",
            data_is_processed="0",
        )
        queryset = Filing.objects.filter(**criteria).order_by("filing_number")
        filings_to_queue = queryset.exclude(form_type__in=excluded_filings_list)

        for entry in filings_to_queue:
            filing_number = entry.filing_number
            msg = "send_body_row_jobs: Adding filing %s to entry queue" % filing_number
            print(msg)
            logger.info(msg)

            # Nothing is actually queued: the Celery call is switched off and
            # the filing is processed synchronously, logging via ``logger``.
            process_filing_body(filing_number, fp, logger)
def process_filing_body(filingnum, fp=None, logger=None):
    """Enter the Schedule A, B and E rows of one filing that are not stored yet.

    Looks the filing up in ``fec_alerts_new_filing`` and parses each body row.
    Rows whose form type starts with ``SE``, ``SA`` or ``SB`` are passed to
    ``process_body_row`` unless a ``SkedE``/``SkedA``/``SkedB`` record with the
    same filing number and transaction id already exists. Rows of any other
    type are ignored. Afterwards the per-line-type tally is written to
    ``lines_present``, the filing is marked as processed and the filer's row
    in ``summary_data_committee_overlay`` is flagged dirty.

    Args:
        filingnum: Filing number to process.
        fp: Optional form parser to reuse. Passing one in avoids building a
            new parser for every filing when running in bulk.
        logger: Optional logger; ``fec_logger()`` is used when omitted.

    Returns:
        ``False`` when the filing could not be opened, otherwise ``None``
        (including when the form type is not one the parser allows).

    Raises:
        FilingHeaderDoesNotExist: no row exists for ``filingnum``.
        FilingHeaderAlreadyProcessed: the row is already flagged as processed.
    """
    if not fp:
        fp = form_parser()
    if not logger:
        logger = fec_logger()

    msg = "process_filing_body: Starting # %s" % (filingnum)
    logger.info(msg)

    connection = get_connection()
    cursor = connection.cursor()
    cmd = "select fec_id, is_superceded, data_is_processed from fec_alerts_new_filing where filing_number=%s" % (filingnum)
    cursor.execute(cmd)

    cd = CSV_dumper(connection)

    result = cursor.fetchone()
    if not result:
        msg = "process_filing_body: Couldn't find a new_filing for filing %s" % (filingnum)
        logger.error(msg)
        raise FilingHeaderDoesNotExist(msg)

    # Every body row is entered with this fixed header id.
    header_id = 1
    is_amended = result[1]
    is_already_processed = result[2]
    if is_already_processed:
        msg = 'process_filing_body: This filing has already been entered.'
        logger.error(msg)
        raise FilingHeaderAlreadyProcessed(msg)

    try:
        f1 = filing(filingnum)
    except Exception:
        # Best effort: an unreadable filing is reported and skipped, but
        # KeyboardInterrupt / SystemExit are allowed to propagate.
        msg = "*** couldn't handle filing %s" % (filingnum)
        print(msg)
        logger.error(msg)
        return False
    form = f1.get_form_type()
    version = f1.get_version()
    filer_id = f1.get_filer_id()

    # Only parse forms that we're set up to read.
    if not fp.is_allowed_form(form):
        if verbose:
            msg = "process_filing_body: Not a parseable form: %s - %s" % (form, filingnum)
            logger.error(msg)
        return None

    # Form-type prefix -> model used to detect rows that were already entered.
    schedule_models = (('SE', SkedE), ('SA', SkedA), ('SB', SkedB))

    linenum = 0
    while True:
        linenum += 1
        row = f1.get_body_row()
        if not row:
            break

        try:
            linedict = fp.parse_form_line(row, version)
            form_type = linedict['form_type']
            upper_type = form_type.upper()
            model = next(
                (candidate for prefix, candidate in schedule_models
                 if upper_type.startswith(prefix)),
                None)
            if model is None:
                # Only schedule A, B and E rows are entered here.
                continue

            transaction_id = linedict['transaction_id']
            print("\n\n\nfiling %s form is %s transaction_id is: %s" % (filingnum, form_type, transaction_id))
            # Make sure the transaction isn't already there before entering.
            try:
                model.objects.get(filing_number=filingnum, transaction_id=transaction_id)
                print("Already present! %s form is %s transaction_id is: %s" % (filingnum, form_type, transaction_id))
            except model.DoesNotExist:
                process_body_row(linedict, filingnum, header_id, is_amended, cd, filer_id)

        except ParserMissingError:
            msg = 'process_filing_body: Unknown line type in filing %s line %s: type=%s Skipping.' % (filingnum, linenum, row[0])
            logger.warn(msg)
            continue
        except KeyError:
            # A row without the expected keys is skipped, but the skip is logged.
            msg = 'process_filing_body: missing form type? in filing %s line %s' % (filingnum, linenum)
            logger.warn(msg)
            continue

    # Commit all the leftovers.
    cd.commit_all()
    cd.close()
    counter = cd.get_counter()
    total_rows = sum(counter[line_type] for line_type in counter)

    msg = "process_filing_body: Filing # %s Total rows: %s Tally is: %s" % (filingnum, total_rows, counter)
    logger.info(msg)

    # This data has been moved here. At some point we should pick a single
    # location for this data.
    header_data = dict_to_hstore(counter)
    cmd = "update fec_alerts_new_filing set lines_present='%s'::hstore where filing_number=%s" % (header_data, filingnum)
    cursor.execute(cmd)

    # Mark the file as having been entered.
    cmd = "update fec_alerts_new_filing set data_is_processed = True where filing_number=%s" % (filingnum)
    cursor.execute(cmd)

    # Flag this filer as one who has changed.
    cmd = "update summary_data_committee_overlay set is_dirty=True where fec_id='%s'" % (filer_id)
    cursor.execute(cmd)


if __name__ == '__main__':
    # Ad-hoc run: process a hard-coded list of filing numbers, sharing a
    # single form parser across all of them.
    fp = form_parser()
    filings = [1010304]
    for this_filing in filings:
        process_filing_body(this_filing, fp=fp)


"""
t0 = time.time()
process_filing_body(864353)
# 869853, 869866
#for fn in [869888]:
#    process_filing_body(fn, fp)
t1 = time.time()
print "total time = " + str(t1-t0)
Пример #8
0
from django.core.management.base import BaseCommand, CommandError

from parsing.form_parser import form_parser, ParserMissingError
from parsing.filing import filing
from parsing.read_FEC_settings import FILECACHE_DIRECTORY

from fec_alerts.models import new_filing

#from formdata.models import Filing_Header
from fec_alerts.utils.filing_processors import process_new_filing




# Build one form parser at import time; handle() passes it to every
# process_new_filing call so a parser is not created per filing.
fp = form_parser()


class Command(BaseCommand):
    help = "Enter file headers; don't mark them as either amended or not."
    requires_model_validation = False

    def handle(self, *args, **options):
        downloaded_filings = new_filing.objects.filter(filing_is_downloaded=True, header_is_processed=False).order_by('filing_number')
        for filing in downloaded_filings:
            print "Entering filing %s, entry_time %s" % (filing.filing_number, filing.process_time)
            result_header = None
            try: 
                result_header = process_new_filing(filing, fp=fp, filing_time=filing.process_time, filing_time_is_exact=True)
            ## It seems like the FEC's response is now to give a page not found response instead of a 500 error or something. The result is that the except no longer seems to apply. 
            except IOError:
def process_filing_body(filingnum, fp=None, logger=None):
    """Enter every body row of one filing and flag its filer as changed.

    Reads the filing's row from ``efilings_filing``, parses each body line
    with the form parser and passes it to ``process_body_row``. Once the rows
    are committed, the per-line-type tally is stored in ``lines_present`` and
    the filing is marked processed. The filer's ``efilings_committee`` row for
    ``CURRENT_CYCLE`` is then flagged dirty, along with the
    ``efilings_candidate`` row of the first candidate linked to the filer in
    ``ftpdata_candcomlink`` with designation 'A' or 'P', if any.

    The database connection is closed on every exit path, including the
    early return and the raised exceptions.

    Args:
        filingnum: Filing number to process.
        fp: Optional form parser to reuse. Passing one in avoids building a
            new parser for every filing when running in bulk.
        logger: Optional logger; ``fec_logger()`` is used when omitted.

    Returns:
        None, both on success and when the form type is not parseable.

    Raises:
        FilingHeaderDoesNotExist: no row exists for ``filingnum``.
        FilingHeaderAlreadyProcessed: ``data_is_processed`` is already "1".
    """
    if not fp:
        fp = form_parser()
    if not logger:
        logger = fec_logger()

    msg = "process_filing_body: Starting # %s" % (filingnum)
    logger.info(msg)

    connection = get_connection()
    try:
        cursor = connection.cursor()
        cmd = "select fec_id, superseded_by_amendment, data_is_processed from efilings_filing where filing_number=%s" % (
            filingnum
        )
        cursor.execute(cmd)

        cd = CSV_dumper(connection)

        result = cursor.fetchone()
        if not result:
            msg = "process_filing_body: Couldn't find a new_filing for filing %s" % (filingnum)
            logger.error(msg)
            raise FilingHeaderDoesNotExist(msg)

        # NOTE(review): the same value (1) is passed for every row and never
        # incremented -- confirm that is what process_body_row expects.
        line_sequence = 1
        is_amended = result[1]
        is_already_processed = result[2]
        # The flag is compared as the string "1", the same value written by
        # the update further down.
        if is_already_processed == "1":
            msg = "process_filing_body: This filing has already been entered."
            print(msg)
            logger.error(msg)
            raise FilingHeaderAlreadyProcessed(msg)

        f1 = filing(filingnum)
        form = f1.get_form_type()
        version = f1.get_version()
        filer_id = f1.get_filer_id()

        # Only parse forms that we're set up to read.
        if not fp.is_allowed_form(form):
            if verbose:
                msg = "process_filing_body: Not a parseable form: %s - %s" % (form, filingnum)
                print(msg)
                logger.info(msg)
            return None

        linenum = 0
        while True:
            linenum += 1
            row = f1.get_body_row()
            if not row:
                break

            try:
                linedict = fp.parse_form_line(row, version)
                process_body_row(linedict, filingnum, line_sequence, is_amended, cd, filer_id)
            except ParserMissingError:
                msg = "process_filing_body: Unknown line type in filing %s line %s: type=%s Skipping." % (
                    filingnum,
                    linenum,
                    row[0],
                )
                print(msg)
                logger.warn(msg)
                continue

        # Commit all the leftovers.
        cd.commit_all()
        cd.close()
        counter = cd.get_counter()
        total_rows = sum(counter[line_type] for line_type in counter)

        msg = "process_filing_body: Filing # %s Total rows: %s Tally is: %s" % (filingnum, total_rows, counter)
        print(msg)
        logger.info(msg)

        # Direct DB updates. There is probably a better approach, but...
        header_data = dict_to_hstore(counter)
        cmd = "update efilings_filing set lines_present='%s'::hstore where filing_number=%s" % (header_data, filingnum)
        cursor.execute(cmd)

        # Mark the file as having been entered.
        cmd = "update efilings_filing set data_is_processed='1' where filing_number=%s" % (filingnum)
        cursor.execute(cmd)

        # Flag this filer as one who has changed.
        cmd = "update efilings_committee set is_dirty=True where fec_id='%s' and cycle='%s'" % (filer_id, CURRENT_CYCLE)
        cursor.execute(cmd)

        # Also set the candidate's dirty flag by going through the ccl table.
        # These tables aren't indexed, so do it as two separate queries.
        cmd = "select cand_id from ftpdata_candcomlink where cmte_id = '%s' and cmte_dsgn in ('A', 'P')" % (filer_id)
        cursor.execute(cmd)
        result = cursor.fetchone()
        if result:
            cand_id = result[0]
            cmd = "update efilings_candidate set is_dirty=True where fec_id = '%s' and cycle='%s'" % (
                cand_id,
                CURRENT_CYCLE,
            )
            cursor.execute(cmd)
    finally:
        # Release the connection even when bailing out early or raising.
        connection.close()
Пример #10
0
 def handle(self, *args, **options):
     """Apply ``fix_dissemination_date`` to all filings with IEs filed before 2014-01-14.

     The form parser is built once and reused for every filing.
     """
     before = date(2014, 1, 14)
     parser = None
     targets = new_filing.objects.filter(tot_ies__gt=0, filed_date__lt=before)
     parser = form_parser()
     for target in targets:
         fix_dissemination_date(target, parser)
Пример #11
0
def process_new_filing(thisnewfiling, fp=None, filing_time=None, filing_time_is_exact=False):
    """Enter the file header if needed.

    Opens the filing, fills in the header-derived fields on ``thisnewfiling``
    (form type, amendment info, coverage dates, cycle, FEC id, parsed header
    line) and saves it. The filing is not marked as complete here.

    Args:
        thisnewfiling: Filing record to update; its ``filing_number`` selects
            the filing to open.
        fp: Optional form parser to reuse; one is created when omitted.
        filing_time: Not used by this function.
        filing_time_is_exact: Not used by this function.

    Returns:
        ``False`` when the filing reports an error on opening, ``True``
        otherwise (including for form types the parser does not allow).
    """
    if not fp:
        fp = form_parser()

    number = thisnewfiling.filing_number
    f1 = filing(number)
    if f1.get_error():
        return False

    form = f1.get_form_type()
    version = f1.get_version()

    # Leave the form type alone if it's already been entered -- that's where
    # it says whether it is terminated.
    if not thisnewfiling.form_type:
        thisnewfiling.form_type = form

    # Mark amendments based on the form type; otherwise the F1's will look
    # like they haven't been amended. An empty form type is simply skipped.
    try:
        last_letter = thisnewfiling.form_type[-1]
        if last_letter.upper() == 'A':
            thisnewfiling.is_amendment = True
    except IndexError:
        pass

    # Only parse forms that we're set up to read.
    if not fp.is_allowed_form(form):
        if verbose:
            print("Not a parseable form: %s - %s" % (form, number))

        if thisnewfiling.is_amendment:
            thisnewfiling.save()
        return True

    header_line = fp.parse_form_line(f1.get_first_row(), version)

    amended_filing = f1.headers['filing_amended'] if f1.is_amendment else None

    # Parse both coverage dates the same way. The through date is handled
    # last, so when both are present it decides the cycle.
    # An empty value is skipped because dateparse('') would give today.
    coverage = {'coverage_from_date': None, 'coverage_through_date': None}
    date_fields = (
        ('coverage_from_date', "problem with coverage_from_date"),
        ('coverage_through_date', "coverage_through_date"),
    )
    for field_name, complaint in date_fields:
        try:
            if header_line[field_name]:
                parsed = dateparse(header_line[field_name])
                coverage[field_name] = parsed
                if parsed:
                    thisnewfiling.cycle = get_cycle_from_date(parsed)
        except KeyError:
            print(complaint)

    # Fill in the filing -- but don't mark it as being complete.
    thisnewfiling.fec_id = f1.headers['fec_id']
    thisnewfiling.coverage_from_date = coverage['coverage_from_date']
    thisnewfiling.coverage_to_date = coverage['coverage_through_date']
    # NOTE(review): this replaces the form-type based flag set above --
    # confirm that is intended.
    thisnewfiling.is_amendment = f1.is_amendment
    thisnewfiling.amends_filing = amended_filing
    thisnewfiling.amendment_number = f1.headers['report_number'] or None
    thisnewfiling.header_data = header_line

    print(thisnewfiling.__dict__)

    thisnewfiling.save()

    return True
Пример #12
0
def process_filing_body(filingnum, fp=None, logger=None):
    """Enter every body row of one filing and mark the filing as processed.

    Looks the filing up in ``fec_alerts_new_filing``, parses each body line
    and hands it to ``process_body_row``. Afterwards the per-line-type tally
    is stored in ``lines_present``, the filing is flagged as processed and the
    filer's ``summary_data_committee_overlay`` row is flagged dirty.

    Args:
        filingnum: Filing number to process.
        fp: Optional form parser to reuse. Passing one in avoids building a
            new parser for every filing when running in bulk.
        logger: Optional logger; ``fec_logger()`` is used when omitted.

    Returns:
        None, both on success and when the form type is not parseable.

    Raises:
        FilingHeaderDoesNotExist: no row exists for ``filingnum``.
        FilingHeaderAlreadyProcessed: the row is already flagged as processed.
    """
    if not fp:
        fp = form_parser()
    if not logger:
        logger = fec_logger()

    logger.info("process_filing_body: Starting # %s" % (filingnum))

    connection = get_connection()
    cursor = connection.cursor()
    lookup_sql = "select fec_id, is_superceded, data_is_processed from fec_alerts_new_filing where filing_number=%s" % (
        filingnum)
    cursor.execute(lookup_sql)

    cd = CSV_dumper(connection)

    filing_row = cursor.fetchone()
    if not filing_row:
        not_found = "process_filing_body: Couldn't find a new_filing for filing %s" % (
            filingnum)
        logger.error(not_found)
        raise FilingHeaderDoesNotExist(not_found)

    # Every body row is entered with this fixed header id.
    header_id = 1
    is_amended = filing_row[1]
    if filing_row[2]:
        already_done = 'process_filing_body: This filing has already been entered.'
        logger.error(already_done)
        raise FilingHeaderAlreadyProcessed(already_done)

    f1 = filing(filingnum)
    form = f1.get_form_type()
    version = f1.get_version()
    filer_id = f1.get_filer_id()

    # Only parse forms that we're set up to read.
    if not fp.is_allowed_form(form):
        if verbose:
            logger.error("process_filing_body: Not a parseable form: %s - %s" % (
                form, filingnum))
        return None

    # Read rows until the filing reports no more. linenum is the 1-based
    # position of the current row, used only in the warning message.
    linenum = 0
    row = f1.get_body_row()
    while row:
        linenum += 1
        try:
            parsed_line = fp.parse_form_line(row, version)
            process_body_row(parsed_line, filingnum, header_id, is_amended, cd,
                             filer_id)
        except ParserMissingError:
            logger.warn(
                'process_filing_body: Unknown line type in filing %s line %s: type=%s Skipping.' % (
                    filingnum, linenum, row[0]))
        row = f1.get_body_row()

    # Commit all the leftovers.
    cd.commit_all()
    cd.close()
    counter = cd.get_counter()
    total_rows = sum(counter[line_type] for line_type in counter)

    logger.info("process_filing_body: Filing # %s Total rows: %s Tally is: %s" % (
        filingnum, total_rows, counter))

    # This data has been moved here. At some point we should pick a single
    # location for this data.
    header_data = dict_to_hstore(counter)
    closing_statements = (
        "update fec_alerts_new_filing set lines_present='%s'::hstore where filing_number=%s" % (
            header_data, filingnum),
        # Mark the file as having been entered.
        "update fec_alerts_new_filing set data_is_processed = True where filing_number=%s" % (
            filingnum),
        # Flag this filer as one who has changed.
        "update summary_data_committee_overlay set is_dirty=True where fec_id='%s'" % (
            filer_id),
    )
    for statement in closing_statements:
        cursor.execute(statement)
Пример #13
0
def process_filing_body(filingnum, fp=None, logger=None):
    """Enter every body row of one filing and flag its filer as changed.

    Reads the filing's row from ``efilings_filing``, parses each body line
    with the form parser and passes it to ``process_body_row``. Once the rows
    are committed, the per-line-type tally is stored in ``lines_present`` and
    the filing is marked processed. The filer's ``efilings_committee`` row for
    ``CURRENT_CYCLE`` is then flagged dirty, along with the
    ``efilings_candidate`` row of the first candidate linked to the filer in
    ``ftpdata_candcomlink`` with designation 'A' or 'P', if any.

    The database connection is closed on every exit path, including the
    early return and the raised exceptions.

    Args:
        filingnum: Filing number to process.
        fp: Optional form parser to reuse. Passing one in avoids building a
            new parser for every filing when running in bulk.
        logger: Optional logger; ``fec_logger()`` is used when omitted.

    Returns:
        None, both on success and when the form type is not parseable.

    Raises:
        FilingHeaderDoesNotExist: no row exists for ``filingnum``.
        FilingHeaderAlreadyProcessed: ``data_is_processed`` is already "1".
    """
    if not fp:
        fp = form_parser()
    if not logger:
        logger = fec_logger()

    msg = "process_filing_body: Starting # %s" % (filingnum)
    logger.info(msg)

    connection = get_connection()
    try:
        cursor = connection.cursor()
        cmd = "select fec_id, superseded_by_amendment, data_is_processed from efilings_filing where filing_number=%s" % (filingnum)
        cursor.execute(cmd)

        cd = CSV_dumper(connection)

        result = cursor.fetchone()
        if not result:
            msg = "process_filing_body: Couldn't find a new_filing for filing %s" % (filingnum)
            logger.error(msg)
            raise FilingHeaderDoesNotExist(msg)

        # NOTE(review): the same value (1) is passed for every row and never
        # incremented -- confirm that is what process_body_row expects.
        line_sequence = 1
        is_amended = result[1]
        is_already_processed = result[2]
        # The flag is compared as the string "1", the same value written by
        # the update further down.
        if is_already_processed == "1":
            msg = 'process_filing_body: This filing has already been entered.'
            print(msg)
            logger.error(msg)
            raise FilingHeaderAlreadyProcessed(msg)

        f1 = filing(filingnum)
        form = f1.get_form_type()
        version = f1.get_version()
        filer_id = f1.get_filer_id()

        # Only parse forms that we're set up to read.
        if not fp.is_allowed_form(form):
            if verbose:
                msg = "process_filing_body: Not a parseable form: %s - %s" % (form, filingnum)
                print(msg)
                logger.info(msg)
            return None

        linenum = 0
        while True:
            linenum += 1
            row = f1.get_body_row()
            if not row:
                break

            try:
                linedict = fp.parse_form_line(row, version)
                process_body_row(linedict, filingnum, line_sequence, is_amended, cd, filer_id)
            except ParserMissingError:
                msg = 'process_filing_body: Unknown line type in filing %s line %s: type=%s Skipping.' % (filingnum, linenum, row[0])
                print(msg)
                logger.warn(msg)
                continue

        # Commit all the leftovers.
        cd.commit_all()
        cd.close()
        counter = cd.get_counter()
        total_rows = sum(counter[line_type] for line_type in counter)

        msg = "process_filing_body: Filing # %s Total rows: %s Tally is: %s" % (filingnum, total_rows, counter)
        print(msg)
        logger.info(msg)

        # Direct DB updates. There is probably a better approach, but...
        header_data = dict_to_hstore(counter)
        cmd = "update efilings_filing set lines_present='%s'::hstore where filing_number=%s" % (header_data, filingnum)
        cursor.execute(cmd)

        # Mark the file as having been entered.
        cmd = "update efilings_filing set data_is_processed='1' where filing_number=%s" % (filingnum)
        cursor.execute(cmd)

        # Flag this filer as one who has changed.
        cmd = "update efilings_committee set is_dirty=True where fec_id='%s' and cycle='%s'" % (filer_id, CURRENT_CYCLE)
        cursor.execute(cmd)

        # Also set the candidate's dirty flag by going through the ccl table.
        # These tables aren't indexed, so do it as two separate queries.
        cmd = "select cand_id from ftpdata_candcomlink where cmte_id = '%s' and cmte_dsgn in ('A', 'P')" % (filer_id)
        cursor.execute(cmd)
        result = cursor.fetchone()
        if result:
            cand_id = result[0]
            cmd = "update efilings_candidate set is_dirty=True where fec_id = '%s' and cycle='%s'" % (cand_id, CURRENT_CYCLE)
            cursor.execute(cmd)
    finally:
        # Release the connection even when bailing out early or raising.
        connection.close()