Example #1
File: main.py Project: FD253/data
def process(request):
    if request.headers.get('x-api-key', '') != API_KEY:
        return 'Not found', 404

    xml_runner = XMLRunner()
    try:
        filing = xml_runner.run_filing(request.args.get('aws_object_id',''))
    except RuntimeError as e:
        return "Error getting XML: {0}".format(str(e)), 400

    try:
        if 'IRS990PF' in filing.list_schedules():
            org = org_from_990pf(filing)
            grants_to_create = grants_from_990pf(filing)
        elif 'IRS990EZ' in filing.list_schedules():
            org = org_from_990ez(filing)
            grants_to_create = []
        elif 'IRS990' in filing.list_schedules():
            org = org_from_990(filing)
            grants_to_create = grants_from_990(filing)
        else:
            raise RuntimeError('No schedule available to parse.')
    except RuntimeError as e:
        return "Error getting org: {0}".format(str(e)), 500

    if org.get('ein', '') == '':
        return "No EIN found", 500

    client = MongoClient(MONGO_URL)
    db = client.springsteen

    timestamp = timestamp_now()
    org['updatedAt'] = timestamp

    existing_org = db.organizations.find_one({'ein': org['ein']})
    if existing_org is None:
        org['createdAt'] = timestamp
        result = db.organizations.insert_one(org)
        org_mongo_id = result.inserted_id
    else:
        org_mongo_id = existing_org['_id']
        if 'lastFilingAt' not in existing_org or parse(existing_org['lastFilingAt']) < parse(org['lastFilingAt']):
            merged_org = {**existing_org, **org}
            if 'createdAt' not in merged_org or merged_org['createdAt'] == 'yo':
                merged_org['createdAt'] = timestamp
            result = db.organizations.find_one_and_update({'_id': existing_org['_id']}, {'$set': merged_org}, return_document=ReturnDocument.AFTER)

    for grant in grants_to_create:
        grant['funder'] = DBRef('organizations', org_mongo_id)
        grant['createdAt'] = timestamp
        grant['updatedAt'] = timestamp

    if len(grants_to_create) > 0:
        # Grants should not be replaced if they have already been uploaded for that
        # tax period/funder, since they can be modified by other sources after the initial upload
        if db.grants.find_one({'funderEIN': org['ein'], 'fromTaxPeriodEnding': grants_to_create[0]['fromTaxPeriodEnding']}) is None:
            result = db.grants.delete_many({'funderEIN': org['ein'], 'fromTaxPeriodEnding': grants_to_create[0]['fromTaxPeriodEnding']})
            result = db.grants.insert_many(grants_to_create)

    return 'OK'
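
The handler above reads like a Flask-style HTTP function (e.g. a Google Cloud Function), with API_KEY, MONGO_URL, and the org_from_990*/grants_from_990* helpers defined elsewhere in the project. A minimal local-driver sketch under that assumption, using a stand-in request object and a placeholder object id:

# Hedged local driver: the request object and the object id are placeholders,
# and API_KEY is assumed to be the module-level constant the handler checks.
from types import SimpleNamespace

fake_request = SimpleNamespace(
    headers={'x-api-key': API_KEY},                # passes the key check
    args={'aws_object_id': '201642229349300909'},  # hypothetical AWS object id
)
print(process(fake_request))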
Example #2
 def run(self):
     self.xml_runner = XMLRunner()
     self.accumulator = Accumulator()
     while True:
         filing = self.queue.get()
         self.run_filing(filing)
         self.queue.task_done()
     connection.close()
Example #3
 def run(self):
     self.xml_runner = XMLRunner()
     self.accumulator = Accumulator()
     while True:
          filing = self.queue.get()
          try:
              self.run_filing(filing)
          except Exception as ex:
              print(ex)
          finally:
              self.queue.task_done()
     connection.close()
Example #4
    def handle(self, *args, **options):
        self.xml_runner = None
        self.standardizer = Standardizer()
        count = 0

        submissions = XMLSubmission.objects.filter(
            schema_year__gte=2013,
            sub_date__contains='2017').values('taxpayer_name', 'tax_period',
                                              'sub_date', 'object_id')
        for submission in submissions:

            count += 1
            if count % 100 == 0:
                print("Processed %s filings" % count)
                reset_queries()  # not sure this will matter, but...
                self.xml_runner = None  # Erase this to prevent memory leaks

            if not self.xml_runner:
                # will start up faster if we don't have to reread/import csvs
                self.xml_runner = XMLRunner(standardizer=self.standardizer)

            whole_submission = XMLSubmission.objects.get(
                object_id=submission['object_id'])

            if isinstance(whole_submission.as_json, unicodeType):
                submission_json = json.loads(whole_submission.as_json)
            else:
                # Assume it's a dict?
                # We don't have any "working" installations that return json as json
                submission_json = whole_submission.as_json

            filingobj = Filing(submission['object_id'], json=submission_json)

            parsedFiling = self.xml_runner.run_from_filing_obj(
                filingobj,
                verbose=False,
            )
            result = parsedFiling.get_result()
            keyerrors = parsedFiling.get_keyerrors()
            has_keyerrors = len(keyerrors) > 0

            try:
                ProcessedFiling.objects.get(object_id=submission['object_id'])
            except ProcessedFiling.DoesNotExist:
                ProcessedFiling.objects.create(
                    ein=whole_submission.ein,
                    object_id=whole_submission.object_id,
                    processed_json=result,
                    keyerrors=keyerrors,
                    has_keyerrors=has_keyerrors,
                    submission=whole_submission)
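
Django's get_or_create can collapse the existence check and the create into one call; a hedged sketch of the same duplicate guard, assuming the ProcessedFiling fields shown above:

# Sketch only -- equivalent to the try/except ProcessedFiling.DoesNotExist block.
ProcessedFiling.objects.get_or_create(
    object_id=whole_submission.object_id,
    defaults={
        'ein': whole_submission.ein,
        'processed_json': result,
        'keyerrors': keyerrors,
        'has_keyerrors': has_keyerrors,
        'submission': whole_submission,
    },
)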
Example #5
    def handle(self, *args, **options):
        self.xml_runner = XMLRunner()
        self.accumulator = Accumulator()

        count = 0
        while True:
            xml_batch = XMLSubmission.objects.filter(
                year__in=[2014, 2015, 2016, 2017]).exclude(
                    json_set=True)[:BATCHSIZE]
            #xml_batch = XMLSubmission.objects.filter(object_id__in=ids).exclude(json_set=True)[:BATCHSIZE]
            #xml_batch = XMLSubmission.objects.filter(object_id__in=['201540859349100204',])[:BATCHSIZE]
            #xml_batch = XMLSubmission.objects.filter(object_id__in=test2016ids).exclude(json_set=True)[:BATCHSIZE]
            #xml_batch = XMLSubmission.objects.filter(sub_date__regex=r'^8.+2017.*').exclude(json_set=True)[:BATCHSIZE]
            #xml_batch = XMLSubmission.objects.filter(object_id__in=test2016ids).exclude(json_set=True)[:BATCHSIZE]
            #xml_batch = XMLSubmission.objects.filter(object_id__in=['201523179349302022',]).exclude(json_set=True)

            count += 1
            print(count)
            if len(xml_batch) == 0:
                break

            self.process_batch(xml_batch)

            # for testing
            if count > LOOP_MAX:
                break
Example #6
def fetch_yr_ind(oid_srch_lst):

    # Should we use IRSx or manual concordance? Setup IRSx if using it
    # Requires all object IDs in the file to be from the same year
    irsx_flag = int(oid_srch_lst[0][:4]) >= 2015
    xml_runner = XMLRunner() if irsx_flag else None

    yr_ind_new = pd.DataFrame()

    # Iterate through Object IDs and update regularly
    start_time = time.time()
    counter = 0
    for oid in oid_srch_lst:
        yr_ind_new = yr_ind_new.append(
            fetch_ind_row(irsx_flag, xml_runner, oid))
        if counter % upd_intvl == 0:
            elapsed = time.time() - start_time
            logging.info(
                "Read {} forms from current year in {:,.1f} seconds.".format(
                    counter, elapsed))
        counter += 1

    yr_ind_new['990_SRC'] = "AWS FILE DIR"

    return yr_ind_new
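
Note that DataFrame.append, used here and in Example #8, was deprecated in pandas 1.4 and removed in pandas 2.0. A minimal sketch of the same accumulation under current pandas (progress logging omitted; fetch_ind_row is assumed to return a one-row DataFrame):

# Collect the per-object rows first, then concatenate once (pandas >= 2.0 style).
rows = [fetch_ind_row(irsx_flag, xml_runner, oid) for oid in oid_srch_lst]
yr_ind_new = pd.concat(rows, ignore_index=True) if rows else pd.DataFrame()
yr_ind_new['990_SRC'] = "AWS FILE DIR"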
Example #7
    def __init__(self, object_id, obj_tbl_field_map=None):
        self.object_id = object_id
        self.xml_runner = XMLRunner()
        self.obj_tbl_field_map = obj_tbl_field_map

        self.header_dict = self.process_header_fields()
        self.balance_dict = self.process_balance_fields()
        self.people = self.process_compensation_fields()

        self.failures = {
            'comp': self.people is None,
            'balance': self.balance_dict is None,
            'header': self.header_dict is None
        }
Example #8
def fetch_yr_ind(oid_srch_lst):

    # Should we use IRSx or manual concordance? Setup IRSx if using it
    # Requires all object IDs in the file to be from the same year
    irsx_flag = int(oid_srch_lst[0][:4]) >= 2015
    xml_runner = XMLRunner() if irsx_flag else None
        
    yr_ind_new = pd.DataFrame()
    
    # Iterate through Object IDs
    for oid in oid_srch_lst:
        yr_ind_new = yr_ind_new.append(fetch_ind_row(irsx_flag, xml_runner, oid))
    
    yr_ind_new['990_SRC'] = "AWS FILE DIR"
    
    return yr_ind_new
Example #9
    def handle(self, *args, **options):
        self.xml_runner = None
        #self.fix_connection()
        self.standardizer = Standardizer()
        count = 0
        headers = [
            "taxpayer_name", "ein", "tax_period", "sub_date", "object_id",
            "name", "title", "org_comp", "related_comp", "other_cmp", "form",
            "source"
        ]

        outfile = open("dumptest.csv", 'w', newline='')
        dw = csv.DictWriter(outfile, fieldnames=headers, extrasaction='ignore')
        dw.writeheader()

        submissions = XMLSubmission.objects.filter(
            schema_year__gte=2013,
            sub_date__contains='2017').values('taxpayer_name', 'tax_period',
                                              'sub_date', 'object_id')
        #submissions = XMLSubmission.objects.filter(object_id='201513209349102976').values('taxpayer_name', 'tax_period', 'sub_date', 'object_id')
        #submissions = XMLSubmission.objects.filter(return_type='990PF').values('taxpayer_name', 'tax_period', 'sub_date', 'object_id')
        for submission in submissions:

            count += 1
            if count % 100 == 0:
                print("Processed %s filings" % count)
                reset_queries()  # not sure this will matter, but...
                self.xml_runner = None  # Erase this to prevent memory leaks

            if not self.xml_runner:
                self.xml_runner = XMLRunner(
                    standardizer=self.standardizer
                )  # will start up faster if we don't have to reread/import csvs

            whole_submission = XMLSubmission.objects.get(
                object_id=submission['object_id'])
            assert whole_submission.json_set

            # There's a bug that makes json objects get returned as unicode instead of as dicts
            # similar to this one https://code.djangoproject.com/ticket/27675
            # though django-jsonfield isn't used in this object
            # See to register_json, though that doesn't work in this context
            # http://initd.org/psycopg/docs/extras.html

            if isinstance(whole_submission.as_json, unicodeType):
                submission_json = json.loads(whole_submission.as_json)
            else:
                # Assume it's a dict? We haven't seen this yet.
                submission_json = whole_submission.as_json

            filingobj = Filing(submission['object_id'], json=submission_json)
            #print("\n\nObject id %s\n" % submission['object_id'])
            #print submission_json

            processedFiling = self.xml_runner.run_from_filing_obj(
                filingobj,
                verbose=False,
            )

            #print ("\n\nProcessed filing is %s" % processedFiling.get_result())

            filing_info = {
                'taxpayer_name': submission['taxpayer_name'],
                'tax_period': submission['tax_period'],
                'sub_date': submission['sub_date']
            }
            schedule_list = processedFiling.list_schedules()
            result = processedFiling.get_result()
            keyerrors = processedFiling.get_keyerrors()
            if keyerrors:
                print("\n\n\n***keyerrors\n\n%s" % keyerrors)

            sked990_list = processedFiling.get_parsed_sked('IRS990')
            sked990EZ_list = processedFiling.get_parsed_sked('IRS990EZ')
            sked990PF_list = processedFiling.get_parsed_sked('IRS990PF')
            sked990J_list = processedFiling.get_parsed_sked('IRS990ScheduleJ')

            if sked990_list:
                #print("\n\t990")
                sked990 = sked990_list[0]
                assert sked990['schedule_name'] == 'IRS990'
                group_name = "Frm990PrtVIISctnA"
                try:
                    employee_list = sked990['groups'][group_name]
                except KeyError:
                    employee_list = []

                for employee in employee_list:
                    #print "\n\n"
                    #print employee
                    this_employee = {
                        'ein': employee['ein'],
                        'object_id': employee['object_id'],
                        'name': employee.get('PrsnNm'),
                        'title': employee.get('TtlTxt'),
                        'org_comp': employee.get('RprtblCmpFrmOrgAmt', 0),
                        'related_comp': employee.get('RprtblCmpFrmRltdOrgAmt',
                                                     0),
                        'other_cmp': employee.get('OthrCmpnstnAmt', 0),
                        'highest_ind': employee.get('HghstCmpnstdEmplyInd'),
                        'form': 'IRS990',
                        'source': 'Frm990PrtVIISctnA'
                    }
                    this_employee.update(filing_info)
                    #print "\n"
                    #print this_employee
                    dw.writerow(this_employee)

            if sked990EZ_list:
                sked990EZ = sked990EZ_list[0]
                #print("\n\t990EZ %s" % sked990EZ['schedule_name'])
                assert sked990EZ['schedule_name'] == 'IRS990EZ'
                group_name = "EZOffcrDrctrTrstEmpl"

                try:
                    employee_list = sked990EZ['groups'][group_name]
                except KeyError:
                    employee_list = []

                for employee in employee_list:
                    #print employee
                    this_employee = {
                        'ein': employee['ein'],
                        'object_id': employee['object_id'],
                        'name': employee.get('PrsnNm', ''),
                        'title': employee.get('TtlTxt', ''),
                        'org_comp': employee.get('CmpnstnAmt', 0),
                        # 'related_comp': NA
                        #'other_cmp': EmplyBnftsAmt + ExpnsAccntAmt ?
                        'form': 'IRS990EZ',
                        'source': 'EZOffcrDrctrTrstEmpl'
                    }
                    this_employee.update(filing_info)
                    #print this_employee
                    dw.writerow(this_employee)

                ##

                group_name = "EZCmpnstnHghstPdEmpl"  # This is very rare
                try:
                    employee_list = sked990EZ['groups'][group_name]
                except KeyError:
                    employee_list = []

                for employee in employee_list:

                    this_employee = {
                        'ein': employee['ein'],
                        'object_id': employee['object_id'],
                        'name': employee.get('PrsnNm'),
                        'title': employee.get('TtlTxt'),
                        'org_comp': employee.get('CmpnstnAmt'),
                        # 'related_comp': NA
                        #'other_cmp': EmplyBnftsAmt + ExpnsAccntAmt ?
                        'form': 'IRS990EZ',
                        'source': 'EZCmpnstnHghstPdEmpl'
                    }
                    this_employee.update(filing_info)
                    print("\nEZ")
                    print(employee)
                    print(this_employee)
                    dw.writerow(this_employee)

            if sked990PF_list:
                sked990PF = sked990PF_list[0]
                #print("\n\t990PF %s" % sked990PF['schedule_name'])
                assert sked990PF['schedule_name'] == 'IRS990PF'

                group_name = "PFOffcrDrTrstKyEmpl"
                employee_list = []
                try:
                    employee_list = sked990PF['groups'][group_name]
                except KeyError:
                    pass

                for employee in employee_list:
                    #print "\n\n"
                    #print employee
                    this_employee = {
                        'ein': employee['ein'],
                        'object_id': employee['object_id'],
                        'name': employee.get('OffcrDrTrstKyEmpl_PrsnNm'),
                        'title': employee.get('OffcrDrTrstKyEmpl_TtlTxt'),
                        'org_comp':
                        employee.get('OffcrDrTrstKyEmpl_CmpnstnAmt'),
                        # 'related_comp': NA
                        #'other_cmp': OffcrDrTrstKyEmpl_EmplyBnftPrgrmAmt + OffcrDrTrstKyEmpl_ExpnsAccntOthrAllwncAmt ?
                        'form': 'IRS990PF',
                        'source': 'PFOffcrDrTrstKyEmpl'
                    }
                    this_employee.update(filing_info)
                    #print "\n"
                    #print this_employee
                    dw.writerow(this_employee)

                group_name = "PFCmpnstnHghstPdEmpl"  # also rare
                employee_list = []
                try:
                    employee_list = sked990PF['groups'][group_name]
                except KeyError:
                    pass

                for employee in employee_list:
                    #print employee
                    this_employee = {
                        'ein': employee['ein'],
                        'object_id': employee['object_id'],
                        'name': employee.get('CmpnstnHghstPdEmpl_PrsnNm'),
                        'title': employee.get('CmpnstnHghstPdEmpl_TtlTxt'),
                        'org_comp':
                        employee.get('CmpnstnHghstPdEmpl_CmpnstnAmt'),
                        # 'related_comp': NA
                        #'other_cmp': CmpnstnHghstPdEmpl_EmplyBnftsAmt + CmpnstnHghstPdEmpl_ExpnsAccntAmt ?
                        'form': 'IRS990PF',
                        'source': 'PFCmpnstnHghstPdEmpl'
                    }
                    this_employee.update(filing_info)
                    #print "\n"
                    #print this_employee
                    dw.writerow(this_employee)

            if sked990J_list:
                sked990J = sked990J_list[0]
                #print("\n\t990J %s" % sked990J['schedule_name'])
                assert sked990J['schedule_name'] == 'IRS990ScheduleJ'

                group_name = "SkdJRltdOrgOffcrTrstKyEmpl"
                employee_list = []
                try:
                    employee_list = sked990J['groups'][group_name]
                except KeyError:
                    pass

                for employee in employee_list:
                    #print "\n\n sked J"
                    #print employee
                    this_employee = {
                        'ein': employee['ein'],
                        'object_id': employee['object_id'],
                        'name': employee.get('PrsnNm'),
                        'bus_line_1': employee.get('BsnssNmLn1Txt'),
                        'title': employee.get('TtlTxt'),
                        'org_comp': employee.get('TtlCmpnstnFlngOrgAmt'),
                        'related_comp': employee.get('TtlCmpnstnRltdOrgsAmt'),
                        #'other_cmp': OffcrDrTrstKyEmpl_EmplyBnftPrgrmAmt + OffcrDrTrstKyEmpl_ExpnsAccntOthrAllwncAmt ?
                        'form': 'IRS990ScheduleJ',
                        'source': 'SkdJRltdOrgOffcrTrstKyEmpl'
                    }
                    this_employee.update(filing_info)
                    #print "\n"
                    #print this_employee
                    dw.writerow(this_employee)

        print("Total of %s processed" % count)
Example #10
def federal_irs_ingest_get_990s(message, context):

    year = datetime.datetime.today().year

    # settings pulled from a database
    ref = db.collection('federal').document('irs').collection('990s').document(str(year))
    settings = ref.get().to_dict()
    if settings is not None:
        latest_saved_idx = settings['idx']
    else:
        latest_saved_idx = 0

    # prep load
    xml_runner = XMLRunner()
    start_time = time.time()
    bucket = client.get_bucket(gcp_project_id)
    blob = bucket.get_blob('downloads/federal/irs/index_' + str(year) + '.csv')
    blob = blob.download_as_string().decode('utf-8')
    blob = StringIO(blob)

    # load by looping through all the rows in the index
    actions = []
    failed_object_ids = []
    reader = csv.reader(blob, delimiter=',')
    next(reader) # skip header
    for idx, row in enumerate(reader):

        if time.time() - start_time > 520:
            break

        # skip previously indexed objects
        if idx < latest_saved_idx:
            continue

        # process the object id
        object_id = row[8]
        if int(object_id[:4]) < 2014: # can't process these
            continue

        # process the submission date
        sub_date = row[4]
        try:
            sub_date = datetime.datetime.strptime(sub_date, '%m/%d/%Y %I:%M:%S %p')
        except ValueError:
            sub_date = datetime.datetime.strptime(sub_date, '%m/%d/%Y')

        sub_date = pytz.timezone('US/Eastern').localize(sub_date)
        sub_date = sub_date.strftime("%Y-%m-%dT%H:%M:%S%z")

        # grab the filing
        try:
            filing = xml_runner.run_filing(object_id)
            schedules = filing.get_result()
        except (RuntimeError, InvalidXMLException) as e:
            logger.error('%s: %s', object_id, e)
            failed_object_ids.append(object_id)
            continue

        if schedules is not None:

            xml = utilities.get_xml_parts(schedules)
            xml = utilities.clean_xml(xml)

            if 'IRS990EZ' in xml:
                index = '990ez'
            elif 'IRS990PF' in xml:
                index = '990pf'
            else:
                index = '990'

            actions.append({
                '_op_type': 'index',
                '_index': 'federal_irs_' + index,
                '_id': object_id,
                '_source': {
                    'row': {
                        'return_id': str(row[0]),
                        'filing_type': row[1],
                        'ein': str(row[2]),
                        'tax_period': row[3],
                        'sub_date': sub_date,
                        'taxpayer_name': row[5],
                        'return_type': str(row[6]),
                        'dln': str(row[7]),
                        'object_id': object_id
                    },
                    'obj': xml,
                    'context': {
                        'last_indexed': datetime.datetime.now(datetime.timezone.utc)
                    }
                }
            })

        if len(actions) >= 1000:
            helpers.bulk(es, actions)
            logger.info('ELASTICSEARCH UPDATED' + ' - ' + str(len(actions)) + ' docs')
            actions = []

    # index all docs into elasticsearch
    if actions:
        helpers.bulk(es, actions)
        logger.info('ELASTICSEARCH UPDATED' + ' - ' + str(len(actions)) + ' docs')

    # update Firestore
    update = {
        "idx": idx,
        "last_updated": datetime.datetime.now(datetime.timezone.utc)
    }
    if len(failed_object_ids) > 0:
        update['failed_object_ids'] = firestore.ArrayUnion(failed_object_ids)
    ref.set(update, merge=True)

    num_remaining_rows = len(list(reader))
    logger.info('FIRESTORE UPDATED - completed: ' + str(idx) + ', remaining: ' + str(num_remaining_rows))
    return num_remaining_rows
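
Once documents are indexed, they can be spot-checked with an ordinary search against the same indices. A hedged sketch, assuming the same es client and an elasticsearch-py 8-style search call (the query text is only an example):

# Look up a few 990-PF filings by taxpayer name from the index written above.
resp = es.search(index='federal_irs_990pf',
                 query={'match': {'row.taxpayer_name': 'example foundation'}},
                 size=5)
for hit in resp['hits']['hits']:
    print(hit['_id'], hit['_source']['row']['tax_period'])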
Example #11
import csv
import pandas as pd
import requests.exceptions
import time
from irsx.xmlrunner import XMLRunner
from irsx.filing import InvalidXMLException

# Works for schemas 2013v3.0 through part of 2016. Assumes a FY2015 filing could not use a TY2016 schema.

timestr = time.strftime("%Y-%m-%d-%H-%M")
xml_runner = XMLRunner(documentation=True, csv_format=True)
df = pd.read_csv("files/new_ids.csv")
object_list = list(df["object_id"])

fieldnames = [
    "schema",  #May need to join with the BMF to get the foundation type.
    "object_id",
    "/ReturnHeader/ReturnTs",
    "/ReturnHeader/Filer/EIN",
    #"/ReturnHeader/Filer/Name/BusinessNameLine1", This field has had its name changed multiple times. Just use the name in the BMF.
    "/ReturnHeader/TaxPeriodEndDt",
    "/ReturnHeader/ReturnTypeCd",
    "/ReturnHeader/TaxYr",
    "/IRS990PF/AmendedReturnInd",
    "/IRS990PF/FinalReturnInd",
    "/IRS990PF/StatementsRegardingActyGrp/PrivateOperatingFoundationInd",
    "/IRS990PF/FMVAssetsEOYAmt",
    "/IRS990PF/MethodOfAccountingCashInd",
    "/IRS990PF/AnalysisOfRevenueAndExpenses/OthEmplSlrsWgsRevAndExpnssAmt",
    "/IRS990PF/AnalysisOfRevenueAndExpenses/OthEmplSlrsWgsDsbrsChrtblAmt",
    "/IRS990PF/AnalysisOfRevenueAndExpenses/TotOprExpensesRevAndExpnssAmt",
Example #12
 def __init__(self, output_streams, data_capture_dict, year):
     self.year = year
     self.output_streams = output_streams
     self.data_capture_dict = data_capture_dict
     self.xml_runner = XMLRunner()
     self._init_streams()
Example #13
class StreamExtractor(object):
    """Write filings to csv, specified in config.py"""
    def __init__(self, output_streams, data_capture_dict, year):
        self.year = year
        self.output_streams = output_streams
        self.data_capture_dict = data_capture_dict
        self.xml_runner = XMLRunner()
        self._init_streams()

    def _init_streams(self):
        for stream_key in self.output_streams.keys():
            this_stream = self.output_streams[stream_key]
            filename = this_stream['filename'] + str(self.year) + ".csv"
            print("Initializing output stream %s" % filename)
            outfile = open(filename, 'w', newline='')
            dw = csv.DictWriter(outfile,
                                this_stream['headers'],
                                extrasaction='ignore')
            dw.writeheader()
            self.output_streams[stream_key]['writer'] = dw

    def run_parts(self,
                  this_capture_sked,
                  parsed_sked,
                  sked,
                  taxpayer_name="",
                  tax_period=""):
        for part_key in this_capture_sked['parts'].keys():
            stream_key = this_capture_sked['parts'][part_key]['stream_key']
            this_stream = self.output_streams[stream_key]
            part = None
            try:
                part = parsed_sked['schedule_parts'][part_key]
            except KeyError:
                continue

            capture_dict = this_capture_sked['parts'][part_key]

            row_data = {}
            row_data['form'] = sked
            row_data['source'] = part_key
            row_data['year'] = self.year
            row_data['taxpayer_name'] = taxpayer_name
            row_data['tax_period'] = tax_period

            for capture_key in capture_dict.keys():
                if capture_key == 'stream_key':
                    continue
                try:
                    val = part[capture_key]
                    csv_header = capture_dict[capture_key]['header']
                    row_data[csv_header] = val

                except KeyError:
                    try:
                        default = capture_dict[capture_key]['default']
                        csv_header = capture_dict[capture_key]['header']
                        row_data[csv_header] = default
                    except KeyError:
                        pass

            ## Composite keys: Not implemented here.

            ## We've gone through the whole part -- write it to file
            this_stream['writer'].writerow(row_data)

    def run_groups(self,
                   this_capture_sked,
                   parsed_sked,
                   sked,
                   taxpayer_name="",
                   tax_period=""):
        for group_key in this_capture_sked['groups'].keys():
            stream_key = this_capture_sked['groups'][group_key]['stream_key']
            this_stream = self.output_streams[stream_key]
            groups = None
            try:
                groups = parsed_sked['groups'][group_key]
            except KeyError:
                #print("No groups found for %s\n" % group_key)
                continue

            for group in groups:
                capture_dict = this_capture_sked['groups'][group_key]
                row_data = {}
                row_data['form'] = sked
                row_data['source'] = group_key
                row_data['year'] = self.year
                row_data['taxpayer_name'] = taxpayer_name
                row_data['tax_period'] = tax_period

                for capture_key in capture_dict.keys():
                    if capture_key == 'stream_key':
                        continue
                    try:
                        val = group[capture_key]
                        csv_header = capture_dict[capture_key]['header']
                        row_data[csv_header] = val

                    except KeyError:
                        try:
                            default = capture_dict[capture_key]['default']
                            csv_header = capture_dict[capture_key]['header']
                            row_data[csv_header] = default
                        except KeyError:
                            pass

                ## now look for "composite keys"
                composite_groups = None
                try:
                    composite_groups = capture_dict['composite']
                except KeyError:
                    pass

                # composite groups are summed up from existing vars, and need a default
                if composite_groups:
                    for composite_group_key in composite_groups.keys():
                        total = 0
                        for cg_part in composite_groups[
                                composite_group_key].keys():
                            try:
                                val = group[cg_part]
                                total += int(val)
                            except KeyError:
                                total += composite_groups[composite_group_key][
                                    cg_part]['default']
                        row_data[composite_group_key] = total

                ## We've gone through the whole group -- write it to file
                this_stream['writer'].writerow(row_data)

    def run_filing(self, filing, taxpayer_name="", tax_period=""):

        parsed_filing = self.xml_runner.run_filing(filing)
        schedule_list = parsed_filing.list_schedules()

        if (int(parsed_filing.get_version()[:4]) < 2013):
            print("Skipping pre-2013 schemas")
            return None

        for sked in self.data_capture_dict.keys():
            if sked in schedule_list:
                #print ("Running sked %s" % sked)
                parsed_skeds = parsed_filing.get_parsed_sked(sked)
                if parsed_skeds:
                    parsed_sked = parsed_skeds[0]
                else:
                    continue

                this_capture_sked = self.data_capture_dict[sked]

                ### Repeating Groups
                skip_groups = False
                try:
                    this_capture_sked['groups']
                except KeyError:
                    skip_groups = True
                if not skip_groups:
                    self.run_groups(this_capture_sked,
                                    parsed_sked,
                                    sked,
                                    taxpayer_name=taxpayer_name,
                                    tax_period=tax_period)

                ### Nonrepeating schedule parts
                skip_parts = False
                try:
                    this_capture_sked['parts']
                except KeyError:
                    skip_parts = True
                if not skip_parts:
                    self.run_parts(this_capture_sked,
                                   parsed_sked,
                                   sked,
                                   taxpayer_name=taxpayer_name,
                                   tax_period=tax_period)
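
The docstring says the output streams and capture rules live in config.py; their expected shapes can be inferred from how _init_streams(), run_parts(), and run_groups() read them. An illustrative sketch (the stream name and headers are hypothetical; the group and variable names are borrowed from the 990 Part VII group seen in Example #9):

# What _init_streams() expects: a filename stub plus the csv headers for each stream.
OUTPUT_STREAMS = {
    'people': {
        'filename': 'people_',   # _init_streams appends the year and ".csv"
        'headers': ['form', 'source', 'year', 'taxpayer_name', 'tax_period',
                    'name', 'title', 'org_comp'],
    },
}

# What run_groups() expects: per schedule, per repeating group, a stream_key plus
# a mapping of IRSx variable name -> csv header (with an optional default).
DATA_CAPTURE_DICT = {
    'IRS990': {
        'groups': {
            'Frm990PrtVIISctnA': {
                'stream_key': 'people',
                'PrsnNm': {'header': 'name'},
                'TtlTxt': {'header': 'title'},
                'RprtblCmpFrmOrgAmt': {'header': 'org_comp', 'default': 0},
            },
        },
    },
}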
Example #14
def filing_990_historical(message, context):
    latest_saved_year = settings['latest_year_file']
    latest_saved_idx = settings['latest_index_in_file']
    failed_object_ids = settings['failed_object_ids']
    if latest_saved_year == 2010:
        return True
    xml_runner = XMLRunner()
    start_time = time.time()
    exit = False
    files = os.listdir('indexes')
    actions = []
    for _file in files:
        if _file != str(latest_saved_year) + '.csv':
            continue
        with open('indexes/' + _file, newline='\n') as csvfile:
            reader = csv.reader(csvfile, delimiter=',')
            next(reader)  # skip header
            for idx, row in enumerate(reader):
                if time.time() - start_time > 520:
                    exit = True
                    break
                if idx < latest_saved_idx:
                    continue
                object_id = row[-1]
                try:
                    filing = xml_runner.run_filing(object_id)
                except (RuntimeError, InvalidXMLException) as e:
                    failed_object_ids.append(object_id)
                    continue
                try:
                    schedules = filing.list_schedules()
                    if 'IRS990PF' in schedules:
                        org = org_from_990pf(filing)
                        grants_to_create = grants_from_990pf(filing)
                    elif 'IRS990EZ' in schedules:
                        org = org_from_990ez(filing)
                        grants_to_create = []
                    elif 'IRS990' in schedules:
                        org = org_from_990(filing)
                        grants_to_create = grants_from_990(filing)
                    else:
                        raise RuntimeError('No schedule available to parse.')
                except Exception:
                    failed_object_ids.append(object_id)
                    continue
                actions.append({
                    '_op_type': 'index',
                    '_index': 'irs-990-filing',
                    '_id': object_id,
                    '_source': json.dumps({
                        'org': org,
                        'grants': grants_to_create
                    })
                })
            else:
                latest_saved_year -= 1
        if exit:
            break
    if actions:
        helpers.bulk(es, actions)
        actions = []
        logger.info('ELASTICSEARCH UPDATED')
    settings['latest_year_file'] = latest_saved_year
    settings['latest_index_in_file'] = idx
    settings['failed_object_ids'] = failed_object_ids
    ref.set(settings)
    logger.info('FIRESTORE UPDATED')
    return True
Example #15
 def setup(self):
     # get an XMLRunner -- this is what actually does the parsing
     self.xml_runner = XMLRunner()
     self.accumulator = Accumulator()
Example #16
class Command(BaseCommand):
    help = '''
    Enter the filings, one by one.
    Loading is done in bulk, though status on the filings is updated one at a time.
   
    '''

    def add_arguments(self, parser):
        # Positional arguments
        parser.add_argument('year', nargs=1, type=int)

    def setup(self):
        # get an XMLRunner -- this is what actually does the parsing
        self.xml_runner = XMLRunner()
        self.accumulator = Accumulator()

    def process_sked(self, sked):
        """ Enter just one schedule """
        #print("Processing schedule %s" % sked['schedule_name'])
        for part in sked['schedule_parts'].keys():
            partname = part
            partdata = sked['schedule_parts'][part]
            #print("part %s %s" % (partname, partdata))

            self.accumulator.add_model(partname, partdata)

        for groupname in sked['groups'].keys():
            for groupdata in sked['groups'][groupname]:
                #print("group %s %s" % (groupname, groupdata) )
                self.accumulator.add_model(groupname, groupdata)

    def run_filing(self, filing):
        object_id = filing.object_id
        print("run_filing %s" % object_id)

        parsed_filing = self.xml_runner.run_filing(object_id)
        if not parsed_filing:
            print("Skipping filing %s (pre-2013 schema filings are skipped)" % filing)
            return None

        schedule_list = parsed_filing.list_schedules()
        #print("sked list is %s" % schedule_list)

        result = parsed_filing.get_result()

        keyerrors = parsed_filing.get_keyerrors()
        schema_version = parsed_filing.get_version()
        ## This could be disabled if we don't care about the schema version
        ## This is one save per loaded row...
        if filing.schema_version != schema_version:
            filing.schema_version = schema_version
            filing.save()

        if keyerrors:
            # If we find keyerrors--xpaths that are missing from our spec, note it
            has_keyerrors = len(keyerrors) > 0
            print("keyerror: %s" % keyerrors)
            filing.error_details = str(keyerrors)
            filing.key_error_count = len(keyerrors)
            filing.is_error = has_keyerrors
            filing.save()

        if result:
            for sked in result:
                self.process_sked(sked)
        else:
            print("Filing not parsed %s " % object_id)

    def handle(self, *args, **options):

        year = int(options['year'][0])
        if year not in [2014, 2015, 2016, 2017, 2018]:
            raise RuntimeError(
                "Illegal year `%s`. Please enter a year between 2014 and 2018"
                % year)

        print("Running filings during year %s" % year)
        self.setup()

        process_count = 0
        while True:
            filings = Filing.objects.filter(submission_year=year).exclude(
                parse_complete=True)[:100]
            if not filings:
                print("Done")
                break

            object_id_list = [f.object_id for f in filings]

            # record that processing has begun
            Filing.objects.filter(object_id__in=object_id_list).update(
                parse_started=True)

            for filing in filings:
                #print("Handling id %s" % filing.object_id)
                self.run_filing(filing)
                process_count += 1
                if process_count % 1000 == 0:
                    print("Handled %s filings" % process_count)

            # commit anything that's left
            self.accumulator.commit_all()
            # record that all are complete
            Filing.objects.filter(object_id__in=object_id_list).update(
                process_time=datetime.now(), parse_complete=True)
            print("Processed a total of %s filings" % process_count)
Example #17
def filing_990_historical(message, context):
    today = datetime.datetime.today()
    year = today.year
    download_current_year_index(year)
    latest_saved_year = settings['latest_year_file']
    latest_saved_idx = settings['latest_index_in_file']
    failed_object_ids = settings['failed_object_ids']
    xml_runner = XMLRunner()
    start_time = time.time()
    actions = []
    with open('/tmp/' + str(year) + '.csv', newline='\n') as csvfile:
        reader = csv.reader(csvfile, delimiter=',')
        next(reader)  # skip header
        for idx, row in enumerate(reader):
            if time.time() - start_time > 520:
                break
            if idx < latest_saved_idx:
                continue
            object_id = row[-2]
            if object_id in saved_object_ids or object_id in failed_object_ids:
                continue
            try:
                filing = xml_runner.run_filing(object_id)
            except (RuntimeError, InvalidXMLException) as e:
                failed_object_ids.append(object_id)
                continue
            try:
                schedules = filing.list_schedules()
                if 'IRS990PF' in schedules:
                    org = org_from_990pf(filing)
                    grants_to_create = grants_from_990pf(filing)
                elif 'IRS990EZ' in schedules:
                    org = org_from_990ez(filing)
                    grants_to_create = []
                elif 'IRS990' in schedules:
                    org = org_from_990(filing)
                    grants_to_create = grants_from_990(filing)
                else:
                    raise RuntimeError('No schedule available to parse.')
            except Exception:
                failed_object_ids.append(object_id)
                continue
            actions.append({
                '_op_type': 'index',
                '_index': 'irs-990-filing',
                '_id': object_id,
                '_source': json.dumps({
                    'org': org,
                    'grants': grants_to_create
                })
            })
        else:
            if today.date() == datetime.date(day=31, month=12, year=year):
                latest_saved_year += 1
    if actions:
        helpers.bulk(es, actions)
        actions = []
        logger.info('ELASTICSEARCH UPDATED')
    settings['latest_year_file'] = latest_saved_year
    settings['latest_index_in_file'] = idx
    settings['failed_object_ids'] = failed_object_ids
    ref.set(settings)
    logger.info('FIRESTORE UPDATED')
    return True
Example #18
def analyze990(filing_number):
    xml_runner = XMLRunner()
    parsed_filing = xml_runner.run_filing(filing_number)
    result = parsed_filing.get_csv_result()
    print(result)
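
get_csv_result() returns the flattened line-by-line representation; most of the other examples in this section work from get_result() instead, which yields one dict per parsed schedule. A minimal sketch of the structured variant, assuming a valid post-2013 object id:

from irsx.xmlrunner import XMLRunner

def analyze990_structured(filing_number):
    parsed_filing = XMLRunner().run_filing(filing_number)
    for sked in parsed_filing.get_result() or []:
        # each entry carries the schedule name, its non-repeating parts,
        # and its repeating groups
        print(sked['schedule_name'],
              list(sked['schedule_parts'].keys()),
              list(sked['groups'].keys()))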
Example #19
class DownloadWorker(Thread):
    def add_arguments(self, parser):
        # Positional arguments
        parser.add_argument('year', nargs=1, type=int)

    def setup(self):
        # get an XMLRunner -- this is what actually does the parsing
        self.xml_runner = XMLRunner()
        self.accumulator = Accumulator()

    def process_sked(self, sked):
        """ Enter just one schedule """
        #print("Processing schedule %s" % sked['schedule_name'])
        for part in sked['schedule_parts'].keys():
            partname = part
            partdata = sked['schedule_parts'][part]
            #print("part %s %s" % (partname, partdata))

            self.accumulator.add_model(partname, partdata)

        for groupname in sked['groups'].keys():
            for groupdata in sked['groups'][groupname]:
                #print("group %s %s" % (groupname, groupdata) )
                self.accumulator.add_model(groupname, groupdata)

    def run_filing(self, filing):
        #        print (filing)

        object_id = filing.object_id

        parsed_filing = self.xml_runner.run_filing(object_id)
        if not parsed_filing:
            print("Skipping filing %s (pre-2013 schema filings are skipped)" % filing)
            return None

        schedule_list = parsed_filing.list_schedules()
        # print("sked list is %s" % schedule_list)

        result = parsed_filing.get_result()

        keyerrors = parsed_filing.get_keyerrors()

        if keyerrors:
            # If we find keyerrors--xpaths that are missing from our spec, note it
            has_keyerrors = len(keyerrors) > 0
            print("keyerror: %s" % keyerrors)
            filing.error_details = str(keyerrors)
            filing.key_error_count = len(keyerrors)
            filing.is_error = has_keyerrors
            filing.save()

        if result:
            for sked in result:
                self.process_sked(sked)
        else:
            print("Filing not parsed %s " % object_id)

    def __init__(self, queue):
        Thread.__init__(self)
        self.queue = queue

    def run(self):
        self.xml_runner = XMLRunner()
        self.accumulator = Accumulator()
        while True:
            filing = self.queue.get()
            self.run_filing(filing)
            self.queue.task_done()
        connection.close()
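
Examples #2 and #3 show the same run() loop in isolation; the producer side is not shown anywhere. A hedged sketch of driving DownloadWorker threads from a queue, reusing the Filing queryset style of Example #16:

# Hypothetical driver: start a few workers, feed them filings, wait for the queue to drain.
from queue import Queue

queue = Queue()
for _ in range(4):                      # worker count is arbitrary here
    worker = DownloadWorker(queue)
    worker.daemon = True                # let the process exit once work is done
    worker.start()

for filing in Filing.objects.filter(submission_year=2017).exclude(parse_complete=True):
    queue.put(filing)

queue.join()                            # returns once every task_done() has been called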