def process(request):
    if request.headers.get('x-api-key', '') != API_KEY:
        return 'Not found', 404

    xml_runner = XMLRunner()
    try:
        filing = xml_runner.run_filing(request.args.get('aws_object_id', ''))
    except RuntimeError as e:
        return "Error getting XML: {0}".format(str(e)), 400

    try:
        if 'IRS990PF' in filing.list_schedules():
            org = org_from_990pf(filing)
            grants_to_create = grants_from_990pf(filing)
        elif 'IRS990EZ' in filing.list_schedules():
            org = org_from_990ez(filing)
            grants_to_create = []
        elif 'IRS990' in filing.list_schedules():
            org = org_from_990(filing)
            grants_to_create = grants_from_990(filing)
        else:
            raise RuntimeError('No schedule available to parse.')
    except RuntimeError as e:
        return "Error getting org: {0}".format(str(e)), 500

    if org.get('ein', '') == '':
        return "No EIN found", 500

    client = MongoClient(MONGO_URL)
    db = client.springsteen
    timestamp = timestamp_now()
    org['updatedAt'] = timestamp

    existing_org = db.organizations.find_one({'ein': org['ein']})
    if existing_org is None:
        org['createdAt'] = timestamp
        result = db.organizations.insert_one(org)
        org_mongo_id = result.inserted_id
    else:
        org_mongo_id = existing_org['_id']
        if 'lastFilingAt' not in existing_org or parse(existing_org['lastFilingAt']) < parse(org['lastFilingAt']):
            merged_org = {**existing_org, **org}
            if 'createdAt' not in merged_org or merged_org['createdAt'] == 'yo':
                merged_org['createdAt'] = timestamp
            result = db.organizations.find_one_and_update(
                {'_id': existing_org['_id']},
                {'$set': merged_org},
                return_document=ReturnDocument.AFTER)

    for grant in grants_to_create:
        grant['funder'] = DBRef('organizations', org_mongo_id)
        grant['createdAt'] = timestamp
        grant['updatedAt'] = timestamp

    if len(grants_to_create) > 0:
        # Grants should not be replaced if they are already uploaded for that
        # tax period/funder, since they can be modified by other sources after
        # the initial upload.
        if db.grants.find_one({'funderEIN': org['ein'],
                               'fromTaxPeriodEnding': grants_to_create[0]['fromTaxPeriodEnding']}) is None:
            result = db.grants.delete_many({'funderEIN': org['ein'],
                                            'fromTaxPeriodEnding': grants_to_create[0]['fromTaxPeriodEnding']})
            result = db.grants.insert_many(grants_to_create)

    return 'OK'
def run(self):
    self.xml_runner = XMLRunner()
    self.accumulator = Accumulator()
    while True:
        filing = self.queue.get()
        self.run_filing(filing)
        self.queue.task_done()
    connection.close()
def run(self):
    self.xml_runner = XMLRunner()
    self.accumulator = Accumulator()
    while True:
        filing = self.queue.get()
        try:
            self.run_filing(filing)
        except Exception as ex:
            print(ex)
        finally:
            self.queue.task_done()
    connection.close()
def handle(self, *args, **options):
    self.xml_runner = None
    self.standardizer = Standardizer()
    count = 0
    submissions = XMLSubmission.objects.filter(
        schema_year__gte=2013,
        sub_date__contains='2017').values('taxpayer_name', 'tax_period',
                                          'sub_date', 'object_id')
    for submission in submissions:
        count += 1
        if count % 100 == 0:
            print("Processed %s filings" % count)
            reset_queries()  # not sure this will matter, but...
            self.xml_runner = None  # Erase this to prevent memory leaks
        if not self.xml_runner:
            # will start up faster if we don't have to reread/import csvs
            self.xml_runner = XMLRunner(standardizer=self.standardizer)

        whole_submission = XMLSubmission.objects.get(
            object_id=submission['object_id'])
        if type(whole_submission.as_json) == unicodeType:
            submission_json = json.loads(whole_submission.as_json)
        else:
            # Assume it's a dict?
            # We don't have any "working" installations that return json as json
            submission_json = whole_submission.as_json

        filingobj = Filing(submission['object_id'], json=submission_json)
        parsedFiling = self.xml_runner.run_from_filing_obj(
            filingobj,
            verbose=False,
        )
        result = parsedFiling.get_result()
        keyerrors = parsedFiling.get_keyerrors()
        has_keyerrors = len(keyerrors) > 0

        try:
            ProcessedFiling.objects.get(object_id=submission['object_id'])
        except ProcessedFiling.DoesNotExist:
            ProcessedFiling.objects.create(
                ein=whole_submission.ein,
                object_id=whole_submission.object_id,
                processed_json=result,
                keyerrors=keyerrors,
                has_keyerrors=has_keyerrors,
                submission=whole_submission)
def handle(self, *args, **options):
    self.xml_runner = XMLRunner()
    self.accumulator = Accumulator()
    count = 0
    while True:
        xml_batch = XMLSubmission.objects.filter(
            year__in=[2014, 2015, 2016, 2017]).exclude(
            json_set=True)[:BATCHSIZE]
        #xml_batch = XMLSubmission.objects.filter(object_id__in=ids).exclude(json_set=True)[:BATCHSIZE]
        #xml_batch = XMLSubmission.objects.filter(object_id__in=['201540859349100204',])[:BATCHSIZE]
        #xml_batch = XMLSubmission.objects.filter(object_id__in=test2016ids).exclude(json_set=True)[:BATCHSIZE]
        #xml_batch = XMLSubmission.objects.filter(sub_date__regex=r'^8.+2017.*').exclude(json_set=True)[:BATCHSIZE]
        #xml_batch = XMLSubmission.objects.filter(object_id__in=test2016ids).exclude(json_set=True)[:BATCHSIZE]
        #xml_batch = XMLSubmission.objects.filter(object_id__in=['201523179349302022',]).exclude(json_set=True)
        count += 1
        print(count)
        if len(xml_batch) == 0:
            break
        self.process_batch(xml_batch)
        # for testing
        if count > LOOP_MAX:
            break
def fetch_yr_ind(oid_srch_lst):
    # Should we use IRSx or manual concordance? Set up IRSx if using it.
    # Requires all object IDs in the file to be from the same year.
    irsx_flag = True if int(oid_srch_lst[0][:4]) >= 2015 else False
    xml_runner = XMLRunner() if irsx_flag else None
    yr_ind_new = pd.DataFrame()

    # Iterate through Object IDs and update regularly
    start_time = time.time()
    counter = 0
    for oid in oid_srch_lst:
        yr_ind_new = yr_ind_new.append(
            fetch_ind_row(irsx_flag, xml_runner, oid))
        if counter % upd_intvl == 0:
            elapsed = time.time() - start_time
            logging.info(
                "Read {} forms from current year in {:,.1f} seconds.".format(
                    counter, elapsed))
        counter += 1
    yr_ind_new['990_SRC'] = "AWS FILE DIR"
    return yr_ind_new
def __init__(self, object_id, obj_tbl_field_map=None):
    self.object_id = object_id
    self.xml_runner = XMLRunner()
    self.obj_tbl_field_map = obj_tbl_field_map
    self.header_dict = self.process_header_fields()
    self.balance_dict = self.process_balance_fields()
    self.people = self.process_compensation_fields()
    self.failures = {
        'comp': self.people is None,
        'balance': self.balance_dict is None,
        'header': self.header_dict is None
    }
def fetch_yr_ind(oid_srch_lst):
    # Should we use IRSx or manual concordance? Set up IRSx if using it.
    # Requires all object IDs in the file to be from the same year.
    irsx_flag = True if int(oid_srch_lst[0][:4]) >= 2015 else False
    xml_runner = XMLRunner() if irsx_flag else None
    yr_ind_new = pd.DataFrame()

    # Iterate through Object IDs
    for oid in oid_srch_lst:
        yr_ind_new = yr_ind_new.append(
            fetch_ind_row(irsx_flag, xml_runner, oid))
    yr_ind_new['990_SRC'] = "AWS FILE DIR"
    return yr_ind_new
def handle(self, *args, **options):
    self.xml_runner = None
    #self.fix_connection()
    self.standardizer = Standardizer()
    count = 0
    headers = [
        "taxpayer_name", "ein", "tax_period", "sub_date", "object_id",
        "name", "title", "org_comp", "related_comp", "other_cmp", "form",
        "source"
    ]
    outfile = open("dumptest.csv", 'wb')
    dw = csv.DictWriter(outfile, fieldnames=headers, extrasaction='ignore')
    dw.writeheader()

    submissions = XMLSubmission.objects.filter(
        schema_year__gte=2013,
        sub_date__contains='2017').values('taxpayer_name', 'tax_period',
                                          'sub_date', 'object_id')
    #submissions = XMLSubmission.objects.filter(object_id='201513209349102976').values('taxpayer_name', 'tax_period', 'sub_date', 'object_id')
    #submissions = XMLSubmission.objects.filter(return_type='990PF').values('taxpayer_name', 'tax_period', 'sub_date', 'object_id')

    for submission in submissions:
        count += 1
        if count % 100 == 0:
            print("Processed %s filings" % count)
            reset_queries()  # not sure this will matter, but...
            self.xml_runner = None  # Erase this to prevent memory leaks
        if not self.xml_runner:
            self.xml_runner = XMLRunner(
                standardizer=self.standardizer
            )  # will start up faster if we don't have to reread/import csvs

        whole_submission = XMLSubmission.objects.get(
            object_id=submission['object_id'])
        assert whole_submission.json_set
        # There's a bug that makes json objects get returned as unicode instead of as dicts,
        # similar to this one https://code.djangoproject.com/ticket/27675
        # though django-jsonfield isn't used in this object.
        # See register_json, though that doesn't work in this context:
        # http://initd.org/psycopg/docs/extras.html
        if type(whole_submission.as_json) == unicodeType:
            submission_json = json.loads(whole_submission.as_json)
        else:
            # Assume it's a dict? We haven't seen this yet.
            submission_json = whole_submission.as_json

        filingobj = Filing(submission['object_id'], json=submission_json)
        #print("\n\nObject id %s\n" % submission['object_id'])
        #print(submission_json)
        processedFiling = self.xml_runner.run_from_filing_obj(
            filingobj,
            verbose=False,
        )
        #print("\n\nProcessed filing is %s" % processedFiling.get_result())

        filing_info = {
            'taxpayer_name': submission['taxpayer_name'],
            'tax_period': submission['tax_period'],
            'sub_date': submission['sub_date']
        }
        schedule_list = processedFiling.list_schedules()
        result = processedFiling.get_result()
        keyerrors = processedFiling.get_keyerrors()
        if keyerrors:
            print("\n\n\n***keyerrors\n\n%s" % keyerrors)

        sked990_list = processedFiling.get_parsed_sked('IRS990')
        sked990EZ_list = processedFiling.get_parsed_sked('IRS990EZ')
        sked990PF_list = processedFiling.get_parsed_sked('IRS990PF')
        sked990J_list = processedFiling.get_parsed_sked('IRS990ScheduleJ')

        if sked990_list:
            #print("\n\t990")
            sked990 = sked990_list[0]
            assert sked990['schedule_name'] == 'IRS990'
            group_name = "Frm990PrtVIISctnA"
            try:
                employee_list = sked990['groups'][group_name]
            except KeyError:
                employee_list = []
            for employee in employee_list:
                #print(employee)
                this_employee = {
                    'ein': employee['ein'],
                    'object_id': employee['object_id'],
                    'name': employee.get('PrsnNm'),
                    'title': employee.get('TtlTxt'),
                    'org_comp': employee.get('RprtblCmpFrmOrgAmt', 0),
                    'related_comp': employee.get('RprtblCmpFrmRltdOrgAmt', 0),
                    'other_cmp': employee.get('OthrCmpnstnAmt', 0),
                    'highest_ind': employee.get('HghstCmpnstdEmplyInd'),
                    'form': 'IRS990',
                    'source': 'Frm990PrtVIISctnA'
                }
                this_employee.update(filing_info)
                #print(this_employee)
                dw.writerow(this_employee)

        if sked990EZ_list:
            sked990EZ = sked990EZ_list[0]
            #print("\n\t990EZ %s" % sked990EZ['schedule_name'])
            assert sked990EZ['schedule_name'] == 'IRS990EZ'
            group_name = "EZOffcrDrctrTrstEmpl"
            try:
                employee_list = sked990EZ['groups'][group_name]
            except KeyError:
                employee_list = []
            for employee in employee_list:
                #print(employee)
                this_employee = {
                    'ein': employee['ein'],
                    'object_id': employee['object_id'],
                    'name': employee.get('PrsnNm', ''),
                    'title': employee.get('TtlTxt', ''),
                    'org_comp': employee.get('CmpnstnAmt', 0),
                    # 'related_comp': NA
                    #'other_cmp': EmplyBnftsAmt + ExpnsAccntAmt ?
                    'form': 'IRS990EZ',
                    'source': 'EZOffcrDrctrTrstEmpl'
                }
                this_employee.update(filing_info)
                #print(this_employee)
                dw.writerow(this_employee)

            group_name = "EZCmpnstnHghstPdEmpl"  # This is very rare
            try:
                employee_list = sked990EZ['groups'][group_name]
            except KeyError:
                employee_list = []
            for employee in employee_list:
                this_employee = {
                    'ein': employee['ein'],
                    'object_id': employee['object_id'],
                    'name': employee.get('PrsnNm'),
                    'title': employee.get('TtlTxt'),
                    'org_comp': employee.get('CmpnstnAmt'),
                    # 'related_comp': NA
                    #'other_cmp': EmplyBnftsAmt + ExpnsAccntAmt ?
                    'form': 'IRS990EZ',
                    'source': 'EZCmpnstnHghstPdEmpl'
                }
                this_employee.update(filing_info)
                print("\nEZ")
                print(employee)
                print(this_employee)
                dw.writerow(this_employee)

        if sked990PF_list:
            sked990PF = sked990PF_list[0]
            #print("\n\t990PF %s" % sked990PF['schedule_name'])
            assert sked990PF['schedule_name'] == 'IRS990PF'
            group_name = "PFOffcrDrTrstKyEmpl"
            employee_list = []
            try:
                employee_list = sked990PF['groups'][group_name]
            except KeyError:
                pass
            for employee in employee_list:
                #print(employee)
                this_employee = {
                    'ein': employee['ein'],
                    'object_id': employee['object_id'],
                    'name': employee.get('OffcrDrTrstKyEmpl_PrsnNm'),
                    'title': employee.get('OffcrDrTrstKyEmpl_TtlTxt'),
                    'org_comp': employee.get('OffcrDrTrstKyEmpl_CmpnstnAmt'),
                    # 'related_comp': NA
                    #'other_cmp': OffcrDrTrstKyEmpl_EmplyBnftPrgrmAmt + OffcrDrTrstKyEmpl_ExpnsAccntOthrAllwncAmt ?
                    'form': 'IRS990PF',
                    'source': 'PFOffcrDrTrstKyEmpl'
                }
                this_employee.update(filing_info)
                #print(this_employee)
                dw.writerow(this_employee)

            group_name = "PFCmpnstnHghstPdEmpl"  # also rare
            employee_list = []
            try:
                employee_list = sked990PF['groups'][group_name]
            except KeyError:
                pass
            for employee in employee_list:
                #print(employee)
                this_employee = {
                    'ein': employee['ein'],
                    'object_id': employee['object_id'],
                    'name': employee.get('CmpnstnHghstPdEmpl_PrsnNm'),
                    'title': employee.get('CmpnstnHghstPdEmpl_TtlTxt'),
                    'org_comp': employee.get('CmpnstnHghstPdEmpl_CmpnstnAmt'),
                    # 'related_comp': NA
                    #'other_cmp': CmpnstnHghstPdEmpl_EmplyBnftsAmt + CmpnstnHghstPdEmpl_ExpnsAccntAmt ?
                    'form': 'IRS990PF',
                    'source': 'PFCmpnstnHghstPdEmpl'
                }
                this_employee.update(filing_info)
                #print(this_employee)
                dw.writerow(this_employee)

        if sked990J_list:
            sked990J = sked990J_list[0]
            #print("\n\t990J %s" % sked990J['schedule_name'])
            assert sked990J['schedule_name'] == 'IRS990ScheduleJ'
            group_name = "SkdJRltdOrgOffcrTrstKyEmpl"
            employee_list = []
            try:
                employee_list = sked990J['groups'][group_name]
            except KeyError:
                pass
            for employee in employee_list:
                #print("\n\n sked J")
                #print(employee)
                this_employee = {
                    'ein': employee['ein'],
                    'object_id': employee['object_id'],
                    'name': employee.get('PrsnNm'),
                    'bus_line_1': employee.get('BsnssNmLn1Txt'),
                    'title': employee.get('TtlTxt'),
                    'org_comp': employee.get('TtlCmpnstnFlngOrgAmt'),
                    'related_comp': employee.get('TtlCmpnstnRltdOrgsAmt'),
                    #'other_cmp': OffcrDrTrstKyEmpl_EmplyBnftPrgrmAmt + OffcrDrTrstKyEmpl_ExpnsAccntOthrAllwncAmt ?
                    'form': 'IRS990ScheduleJ',
                    'source': 'SkdJRltdOrgOffcrTrstKyEmpl'
                }
                this_employee.update(filing_info)
                #print(this_employee)
                dw.writerow(this_employee)

    print("Total of %s processed" % count)
def federal_irs_ingest_get_990s(message, context):

    year = datetime.datetime.today().year

    # settings pulled from a database
    ref = db.collection('federal').document('irs').collection('990s').document(str(year))
    settings = ref.get().to_dict()
    if settings is not None:
        latest_saved_idx = settings['idx']
    else:
        latest_saved_idx = 0

    # prep load
    xml_runner = XMLRunner()
    start_time = time.time()
    bucket = client.get_bucket(gcp_project_id)
    blob = bucket.get_blob('downloads/federal/irs/index_' + str(year) + '.csv')
    blob = blob.download_as_string().decode('utf-8')
    blob = StringIO(blob)

    # load by looping through all the rows in the index
    actions = []
    failed_object_ids = []
    reader = csv.reader(blob, delimiter=',')
    next(reader)  # skip header
    for idx, row in enumerate(reader):

        if time.time() - start_time > 520:
            break

        # skip previously indexed objects
        if idx < latest_saved_idx:
            continue

        # process the object id
        object_id = row[8]
        if int(object_id[:4]) < 2014:  # can't process these
            continue

        # process the submission date
        sub_date = row[4]
        try:
            sub_date = datetime.datetime.strptime(sub_date, '%m/%d/%Y %I:%M:%S %p')
        except:
            try:
                sub_date = datetime.datetime.strptime(sub_date, '%m/%d/%Y')
            except:
                raise
        sub_date = pytz.timezone('US/Eastern').localize(sub_date)
        sub_date = sub_date.strftime("%Y-%m-%dT%H:%M:%S%z")

        # grab the filing
        try:
            filing = xml_runner.run_filing(object_id)
            schedules = filing.get_result()
        except (RuntimeError, InvalidXMLException) as e:
            logger.error(object_id, str(e))
            failed_object_ids.append(object_id)
            continue

        if schedules is not None:
            xml = utilities.get_xml_parts(schedules)
            xml = utilities.clean_xml(xml)

            if 'IRS990EZ' in xml:
                index = '990ez'
            elif 'IRS990PF' in xml:
                index = '990pf'
            else:
                index = '990'

            actions.append({
                '_op_type': 'index',
                '_index': 'federal_irs_' + index,
                '_id': object_id,
                '_source': {
                    'row': {
                        'return_id': str(row[0]),
                        'filing_type': row[1],
                        'ein': str(row[2]),
                        'tax_period': row[3],
                        'sub_date': sub_date,
                        'taxpayer_name': row[5],
                        'return_type': str(row[6]),
                        'dln': str(row[7]),
                        'object_id': object_id
                    },
                    'obj': xml,
                    'context': {
                        'last_indexed': datetime.datetime.now(datetime.timezone.utc)
                    }
                }
            })

        if len(actions) >= 1000:
            helpers.bulk(es, actions)
            logger.info('ELASTICSEARCH UPDATED' + ' - ' + str(len(actions)) + ' docs')
            actions = []

    # index all remaining docs into elasticsearch
    if actions:
        helpers.bulk(es, actions)
        logger.info('ELASTICSEARCH UPDATED' + ' - ' + str(len(actions)) + ' docs')

    # update Firestore
    update = {
        "idx": idx,
        "last_updated": datetime.datetime.now(datetime.timezone.utc)
    }
    if len(failed_object_ids) > 0:
        update['failed_object_ids'] = firestore.ArrayUnion(failed_object_ids)
    ref.set(update, merge=True)

    num_remaining_rows = len(list(reader))
    logger.info('FIRESTORE UPDATED - completed: ' + str(idx) + ', remaining: ' + str(num_remaining_rows))
    return num_remaining_rows
import csv
import time

import pandas as pd
import requests.exceptions

from irsx.xmlrunner import XMLRunner
from irsx.filing import InvalidXMLException

# Works for schemas 2013v3.0 through part of 2016. Assuming that there's no
# way a FY2015 filing could use a TY2016 schema.

timestr = time.strftime("%Y-%m-%d-%H-%M")
xml_runner = XMLRunner(documentation=True, csv_format=True)
df = pd.read_csv("files/new_ids.csv")
object_list = list(df["object_id"])

fieldnames = [
    "schema",  # May need to join with the BMF to get the foundation type.
    "object_id",
    "/ReturnHeader/ReturnTs",
    "/ReturnHeader/Filer/EIN",
    # "/ReturnHeader/Filer/Name/BusinessNameLine1",  This field has had its name changed multiple times. Just use the name in the BMF.
    "/ReturnHeader/TaxPeriodEndDt",
    "/ReturnHeader/ReturnTypeCd",
    "/ReturnHeader/TaxYr",
    "/IRS990PF/AmendedReturnInd",
    "/IRS990PF/FinalReturnInd",
    "/IRS990PF/StatementsRegardingActyGrp/PrivateOperatingFoundationInd",
    "/IRS990PF/FMVAssetsEOYAmt",
    "/IRS990PF/MethodOfAccountingCashInd",
    "/IRS990PF/AnalysisOfRevenueAndExpenses/OthEmplSlrsWgsRevAndExpnssAmt",
    "/IRS990PF/AnalysisOfRevenueAndExpenses/OthEmplSlrsWgsDsbrsChrtblAmt",
    "/IRS990PF/AnalysisOfRevenueAndExpenses/TotOprExpensesRevAndExpnssAmt",
def __init__(self, output_streams, data_capture_dict, year):
    self.year = year
    self.output_streams = output_streams
    self.data_capture_dict = data_capture_dict
    self.xml_runner = XMLRunner()
    self._init_streams()
class StreamExtractor(object):
    """Write filings to csv, specified in config.py"""

    def __init__(self, output_streams, data_capture_dict, year):
        self.year = year
        self.output_streams = output_streams
        self.data_capture_dict = data_capture_dict
        self.xml_runner = XMLRunner()
        self._init_streams()

    def _init_streams(self):
        for stream_key in self.output_streams.keys():
            this_stream = self.output_streams[stream_key]
            filename = this_stream['filename'] + str(self.year) + ".csv"
            print("Initializing output stream %s" % filename)
            outfile = open(filename, 'wb')
            dw = csv.DictWriter(outfile, this_stream['headers'],
                                extrasaction='ignore')
            dw.writeheader()
            self.output_streams[stream_key]['writer'] = dw

    def run_parts(self, this_capture_sked, parsed_sked, sked,
                  taxpayer_name="", tax_period=""):
        for part_key in this_capture_sked['parts'].keys():
            stream_key = this_capture_sked['parts'][part_key]['stream_key']
            this_stream = self.output_streams[stream_key]
            part = None
            try:
                part = parsed_sked['schedule_parts'][part_key]
            except KeyError:
                continue
            capture_dict = this_capture_sked['parts'][part_key]
            row_data = {}
            row_data['form'] = sked
            row_data['source'] = part_key
            row_data['year'] = self.year
            row_data['taxpayer_name'] = taxpayer_name
            row_data['tax_period'] = tax_period
            for capture_key in capture_dict.keys():
                if capture_key == 'stream_key':
                    continue
                try:
                    val = part[capture_key]
                    csv_header = capture_dict[capture_key]['header']
                    row_data[csv_header] = val
                except KeyError:
                    try:
                        default = capture_dict[capture_key]['default']
                        csv_header = capture_dict[capture_key]['header']
                        row_data[csv_header] = default
                    except KeyError:
                        pass
            ## Composite keys: not implemented here.
            ## We've gone through the whole part -- write it to file
            this_stream['writer'].writerow(row_data)

    def run_groups(self, this_capture_sked, parsed_sked, sked,
                   taxpayer_name="", tax_period=""):
        for group_key in this_capture_sked['groups'].keys():
            stream_key = this_capture_sked['groups'][group_key]['stream_key']
            this_stream = self.output_streams[stream_key]
            groups = None
            try:
                groups = parsed_sked['groups'][group_key]
            except KeyError:
                #print("No groups found for %s\n" % group_key)
                continue
            for group in groups:
                capture_dict = this_capture_sked['groups'][group_key]
                row_data = {}
                row_data['form'] = sked
                row_data['source'] = group_key
                row_data['year'] = self.year
                row_data['taxpayer_name'] = taxpayer_name
                row_data['tax_period'] = tax_period
                for capture_key in capture_dict.keys():
                    if capture_key == 'stream_key':
                        continue
                    try:
                        val = group[capture_key]
                        csv_header = capture_dict[capture_key]['header']
                        row_data[csv_header] = val
                    except KeyError:
                        try:
                            default = capture_dict[capture_key]['default']
                            csv_header = capture_dict[capture_key]['header']
                            row_data[csv_header] = default
                        except KeyError:
                            pass

                ## now look for "composite keys"
                composite_groups = None
                try:
                    composite_groups = capture_dict['composite']
                except KeyError:
                    pass
                # composite groups are summed up from existing vars, and need a default
                if composite_groups:
                    for composite_group_key in composite_groups.keys():
                        total = 0
                        for cg_part in composite_groups[composite_group_key].keys():
                            try:
                                val = group[cg_part]
                                total += int(val)
                            except KeyError:
                                total += composite_groups[composite_group_key][cg_part]['default']
                        row_data[composite_group_key] = total

                ## We've gone through the whole group -- write it to file
                this_stream['writer'].writerow(row_data)

    def run_filing(self, filing, taxpayer_name="", tax_period=""):
        parsed_filing = self.xml_runner.run_filing(filing)
        schedule_list = parsed_filing.list_schedules()
        if int(parsed_filing.get_version()[:4]) < 2013:
            print("Skipping pre-2013 schemas")
            return None
        for sked in self.data_capture_dict.keys():
            if sked in schedule_list:
                #print("Running sked %s" % sked)
                parsed_skeds = parsed_filing.get_parsed_sked(sked)
                if parsed_skeds:
                    parsed_sked = parsed_skeds[0]
                else:
                    continue
                this_capture_sked = self.data_capture_dict[sked]

                ### Repeating groups
                skip_groups = False
                try:
                    this_capture_sked['groups']
                except KeyError:
                    skip_groups = True
                if not skip_groups:
                    self.run_groups(this_capture_sked, parsed_sked, sked,
                                    taxpayer_name=taxpayer_name,
                                    tax_period=tax_period)

                ### Nonrepeating schedule parts
                skip_parts = False
                try:
                    this_capture_sked['parts']
                except KeyError:
                    skip_parts = True
                if not skip_parts:
                    self.run_parts(this_capture_sked, parsed_sked, sked,
                                   taxpayer_name=taxpayer_name,
                                   tax_period=tax_period)
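# A minimal usage sketch for StreamExtractor, not from the original source.
# OUTPUT_STREAMS and DATA_CAPTURE_DICT stand in for whatever names config.py
# actually defines, and the object id is a made-up placeholder.
if __name__ == '__main__':
    from config import OUTPUT_STREAMS, DATA_CAPTURE_DICT  # assumed names

    extractor = StreamExtractor(OUTPUT_STREAMS, DATA_CAPTURE_DICT, year=2016)
    for object_id in ['201642229349300909']:  # placeholder object id
        extractor.run_filing(object_id,
                             taxpayer_name='EXAMPLE ORG',
                             tax_period='201512')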
def filing_990_historical(message, context):

    latest_saved_year = settings['latest_year_file']
    latest_saved_idx = settings['latest_index_in_file']
    failed_object_ids = settings['failed_object_ids']

    if latest_saved_year == 2010:
        return True

    xml_runner = XMLRunner()
    start_time = time.time()
    exit = False

    files = os.listdir('indexes')
    actions = []
    for _file in files:
        if _file != str(latest_saved_year) + '.csv':
            continue
        with open('indexes/' + _file, newline='\n') as csvfile:
            reader = csv.reader(csvfile, delimiter=',')
            next(reader)  # skip header
            for idx, row in enumerate(reader):
                if time.time() - start_time > 520:
                    exit = True
                    break
                if idx < latest_saved_idx:
                    continue
                object_id = row[-1]
                try:
                    filing = xml_runner.run_filing(object_id)
                except (RuntimeError, InvalidXMLException) as e:
                    failed_object_ids.append(object_id)
                    continue
                try:
                    schedules = filing.list_schedules()
                    if 'IRS990PF' in schedules:
                        org = org_from_990pf(filing)
                        grants_to_create = grants_from_990pf(filing)
                    elif 'IRS990EZ' in schedules:
                        org = org_from_990ez(filing)
                        grants_to_create = []
                    elif 'IRS990' in schedules:
                        org = org_from_990(filing)
                        grants_to_create = grants_from_990(filing)
                    else:
                        raise RuntimeError('No schedule available to parse.')
                except (RuntimeError, Exception) as e:
                    failed_object_ids.append(object_id)
                    continue
                actions.append({
                    '_op_type': 'index',
                    '_index': 'irs-990-filing',
                    '_id': object_id,
                    '_source': json.dumps({
                        'org': org,
                        'grants': grants_to_create
                    })
                })
            else:
                latest_saved_year -= 1
        if exit:
            break

    if actions:
        helpers.bulk(es, actions)
        actions = []
        logger.info('ELASTICSEARCH UPDATED')

    settings['latest_year_file'] = latest_saved_year
    settings['latest_index_in_file'] = idx
    settings['failed_object_ids'] = failed_object_ids
    ref.set(settings)
    logger.info('FIRESTORE UPDATED')

    return True
def setup(self):
    # get an XMLRunner -- this is what actually does the parsing
    self.xml_runner = XMLRunner()
    self.accumulator = Accumulator()
class Command(BaseCommand):
    help = '''
    Enter the filings, one by one.
    Loading is done in bulk, though status on the filings is updated one at a time.
    '''

    def add_arguments(self, parser):
        # Positional arguments
        parser.add_argument('year', nargs=1, type=int)

    def setup(self):
        # get an XMLRunner -- this is what actually does the parsing
        self.xml_runner = XMLRunner()
        self.accumulator = Accumulator()

    def process_sked(self, sked):
        """ Enter just one schedule """
        #print("Processing schedule %s" % sked['schedule_name'])
        for part in sked['schedule_parts'].keys():
            partname = part
            partdata = sked['schedule_parts'][part]
            #print("part %s %s" % (partname, partdata))
            self.accumulator.add_model(partname, partdata)
        for groupname in sked['groups'].keys():
            for groupdata in sked['groups'][groupname]:
                #print("group %s %s" % (groupname, groupdata))
                self.accumulator.add_model(groupname, groupdata)

    def run_filing(self, filing):
        object_id = filing.object_id
        print("run_filing %s" % object_id)
        parsed_filing = self.xml_runner.run_filing(object_id)
        if not parsed_filing:
            print(
                "Skipping filing %s (filings with pre-2013 schemas are skipped)\n row details: %s"
                % (filing, metadata_row))
            return None

        schedule_list = parsed_filing.list_schedules()
        #print("sked list is %s" % schedule_list)
        result = parsed_filing.get_result()
        keyerrors = parsed_filing.get_keyerrors()
        schema_version = parsed_filing.get_version()

        ## This could be disabled if we don't care about the schema version
        ## This is one save per loaded row...
        if filing.schema_version != schema_version:
            filing.schema_version = schema_version
            filing.save()

        if keyerrors:
            # If we find keyerrors--xpaths that are missing from our spec--note it
            has_keyerrors = len(keyerrors) > 0
            print("keyerror: %s" % keyerrors)
            filing.error_details = str(keyerrors)
            filing.key_error_count = len(keyerrors)
            filing.is_error = has_keyerrors
            filing.save()

        if result:
            for sked in result:
                self.process_sked(sked)
        else:
            print("Filing not parsed %s " % object_id)

    def handle(self, *args, **options):
        year = int(options['year'][0])
        if year not in [2014, 2015, 2016, 2017, 2018]:
            raise RuntimeError(
                "Illegal year `%s`. Please enter a year between 2014 and 2018"
                % year)
        print("Running filings during year %s" % year)
        self.setup()
        process_count = 0
        while True:
            filings = Filing.objects.filter(submission_year=year).exclude(
                parse_complete=True)[:100]
            if not filings:
                print("Done")
                break
            object_id_list = [f.object_id for f in filings]
            # record that processing has begun
            Filing.objects.filter(object_id__in=object_id_list).update(
                parse_started=True)
            for filing in filings:
                #print("Handling id %s" % filing.object_id)
                self.run_filing(filing)
                process_count += 1
                if process_count % 1000 == 0:
                    print("Handled %s filings" % process_count)
            # commit anything that's left
            self.accumulator.commit_all()
            # record that all are complete
            Filing.objects.filter(object_id__in=object_id_list).update(
                process_time=datetime.now(), parse_complete=True)
        print("Processed a total of %s filings" % process_count)
def filing_990_historical(message, context):

    today = datetime.datetime.today()
    year = today.year
    download_current_year_index(year)

    latest_saved_year = settings['latest_year_file']
    latest_saved_idx = settings['latest_index_in_file']
    failed_object_ids = settings['failed_object_ids']

    xml_runner = XMLRunner()
    start_time = time.time()

    actions = []
    with open('/tmp/' + str(year) + '.csv', newline='\n') as csvfile:
        reader = csv.reader(csvfile, delimiter=',')
        next(reader)  # skip header
        for idx, row in enumerate(reader):
            if time.time() - start_time > 520:
                break
            if idx < latest_saved_idx:
                continue
            object_id = row[-2]
            if object_id in saved_object_ids or object_id in failed_object_ids:
                continue
            try:
                filing = xml_runner.run_filing(object_id)
            except (RuntimeError, InvalidXMLException) as e:
                failed_object_ids.append(object_id)
                continue
            try:
                schedules = filing.list_schedules()
                if 'IRS990PF' in schedules:
                    org = org_from_990pf(filing)
                    grants_to_create = grants_from_990pf(filing)
                elif 'IRS990EZ' in schedules:
                    org = org_from_990ez(filing)
                    grants_to_create = []
                elif 'IRS990' in schedules:
                    org = org_from_990(filing)
                    grants_to_create = grants_from_990(filing)
                else:
                    raise RuntimeError('No schedule available to parse.')
            except (RuntimeError, Exception) as e:
                failed_object_ids.append(object_id)
                continue
            actions.append({
                '_op_type': 'index',
                '_index': 'irs-990-filing',
                '_id': object_id,
                '_source': json.dumps({
                    'org': org,
                    'grants': grants_to_create
                })
            })
        else:
            if today == datetime.date(day=31, month=12, year=year):
                latest_saved_year += 1

    if actions:
        helpers.bulk(es, actions)
        actions = []
        logger.info('ELASTICSEARCH UPDATED')

    settings['latest_year_file'] = latest_saved_year
    settings['latest_index_in_file'] = idx
    settings['failed_object_ids'] = failed_object_ids
    ref.set(settings)
    logger.info('FIRESTORE UPDATED')

    return True
def analyze990(filing_number):
    xml_runner = XMLRunner()
    parsed_filing = xml_runner.run_filing(filing_number)
    result = parsed_filing.get_csv_result()
    print(result)
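# A hedged usage sketch, not from the original source: the object id below is
# a made-up placeholder in the AWS 990 e-file object id format.
if __name__ == '__main__':
    analyze990('201533089349301428')  # hypothetical filing object id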
class DownloadWorker(Thread):

    def add_arguments(self, parser):
        # Positional arguments
        parser.add_argument('year', nargs=1, type=int)

    def setup(self):
        # get an XMLRunner -- this is what actually does the parsing
        self.xml_runner = XMLRunner()
        self.accumulator = Accumulator()

    def process_sked(self, sked):
        """ Enter just one schedule """
        #print("Processing schedule %s" % sked['schedule_name'])
        for part in sked['schedule_parts'].keys():
            partname = part
            partdata = sked['schedule_parts'][part]
            #print("part %s %s" % (partname, partdata))
            self.accumulator.add_model(partname, partdata)
        for groupname in sked['groups'].keys():
            for groupdata in sked['groups'][groupname]:
                #print("group %s %s" % (groupname, groupdata))
                self.accumulator.add_model(groupname, groupdata)

    def run_filing(self, filing):
        # print(filing)
        object_id = filing.object_id
        parsed_filing = self.xml_runner.run_filing(object_id)
        if not parsed_filing:
            print(
                "Skipping filing %s (filings with pre-2013 schemas are skipped)\n row details: %s"
                % (filing, metadata_row))
            return None

        schedule_list = parsed_filing.list_schedules()
        # print("sked list is %s" % schedule_list)
        result = parsed_filing.get_result()
        keyerrors = parsed_filing.get_keyerrors()

        if keyerrors:
            # If we find keyerrors--xpaths that are missing from our spec--note it
            has_keyerrors = len(keyerrors) > 0
            print("keyerror: %s" % keyerrors)
            filing.error_details = str(keyerrors)
            filing.key_error_count = len(keyerrors)
            filing.is_error = has_keyerrors
            filing.save()

        if result:
            for sked in result:
                self.process_sked(sked)
        else:
            print("Filing not parsed %s " % object_id)

    def __init__(self, queue):
        Thread.__init__(self)
        self.queue = queue

    def run(self):
        self.xml_runner = XMLRunner()
        self.accumulator = Accumulator()
        while True:
            filing = self.queue.get()
            self.run_filing(filing)
            self.queue.task_done()
        connection.close()
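# A hedged sketch of how DownloadWorker might be wired up, not from the
# original source: the worker count and the helper name are illustrative,
# and the filings argument is assumed to be an iterable of Filing objects
# like the queryset used in the management command above.
from queue import Queue

def run_workers(filings, num_workers=4):  # hypothetical helper
    queue = Queue()
    for _ in range(num_workers):
        worker = DownloadWorker(queue)
        worker.daemon = True  # let the process exit once the queue drains
        worker.start()
    for filing in filings:
        queue.put(filing)
    queue.join()  # blocks until every task_done() has been called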