Example #1
File: main.py Project: FD253/data
def process(request):
    if request.headers.get('x-api-key', '') != API_KEY:
        return 'Not found', 404

    xml_runner = XMLRunner()
    try:
        filing = xml_runner.run_filing(request.args.get('aws_object_id',''))
    except RuntimeError as e:
        return "Error getting XML: {0}".format(str(e)), 400

    try:
        if 'IRS990PF' in filing.list_schedules():
            org = org_from_990pf(filing)
            grants_to_create = grants_from_990pf(filing)
        elif 'IRS990EZ' in filing.list_schedules():
            org = org_from_990ez(filing)
            grants_to_create = []
        elif 'IRS990' in filing.list_schedules():
            org = org_from_990(filing)
            grants_to_create = grants_from_990(filing)
        else:
            raise RuntimeError('No schedule available to parse.')
    except RuntimeError as e:
        return "Error getting org: {0}".format(str(e)), 500

    if org.get('ein', '') == '':
        return "No EIN found", 500

    client = MongoClient(MONGO_URL)
    db = client.springsteen

    timestamp = timestamp_now()
    org['updatedAt'] = timestamp

    existing_org = db.organizations.find_one({'ein': org['ein']})
    if existing_org is None:
        org['createdAt'] = timestamp
        result = db.organizations.insert_one(org)
        org_mongo_id = result.inserted_id
    else:
        org_mongo_id = existing_org['_id']
        if 'lastFilingAt' not in existing_org or parse(existing_org['lastFilingAt']) < parse(org['lastFilingAt']):
            merged_org = {**existing_org, **org}
            if 'createdAt' not in merged_org or merged_org['createdAt'] == 'yo':
                merged_org['createdAt'] = timestamp
            result = db.organizations.find_one_and_update({'_id': existing_org['_id']}, {'$set': merged_org}, return_document=ReturnDocument.AFTER)

    for grant in grants_to_create:
        grant['funder'] = DBRef('organizations', org_mongo_id)
        grant['createdAt'] = timestamp
        grant['updatedAt'] = timestamp

    if len(grants_to_create) > 0:
        # Grants should not be replaced if they are already uploaded for that tax period/funder since they can be modified by other sources after initial upload
        if db.grants.find_one({'funderEIN': org['ein'], 'fromTaxPeriodEnding': grants_to_create[0]['fromTaxPeriodEnding']}) is None:
            result = db.grants.delete_many({'funderEIN': org['ein'], 'fromTaxPeriodEnding': grants_to_create[0]['fromTaxPeriodEnding']})
            result = db.grants.insert_many(grants_to_create)

    return 'OK'
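
The process() handler above follows the signature of an HTTP Google Cloud Function: it reads the API key from request.headers and the IRS object id from request.args (a Flask-style request). A minimal local smoke-test sketch, assuming API_KEY and MONGO_URL are already configured in the module; the FakeRequest class and the object id are hypothetical stand-ins:

# Hypothetical local smoke test for process(); FakeRequest and the object id
# below are stand-ins, and API_KEY / MONGO_URL must already be set in the module.
class FakeRequest:
    def __init__(self, headers, args):
        self.headers = headers
        self.args = args

if __name__ == '__main__':
    request = FakeRequest(
        headers={'x-api-key': 'test-key'},             # must equal API_KEY to pass the check
        args={'aws_object_id': '201533089349301428'},  # hypothetical IRS object id
    )
    print(process(request))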
Example #2
class StreamExtractor(object):
    """Write filings to csv, specified in config.py"""
    def __init__(self, output_streams, data_capture_dict, year):
        self.year = year
        self.output_streams = output_streams
        self.data_capture_dict = data_capture_dict
        self.xml_runner = XMLRunner()
        self._init_streams()

    def _init_streams(self):
        for stream_key in self.output_streams.keys():
            this_stream = self.output_streams[stream_key]
            filename = this_stream['filename'] + str(self.year) + ".csv"
            print("Initializing output stream %s" % filename)
            outfile = open(filename, 'w', newline='')  # text mode with newline='' for csv writers under Python 3
            dw = csv.DictWriter(outfile,
                                this_stream['headers'],
                                extrasaction='ignore')
            dw.writeheader()
            self.output_streams[stream_key]['writer'] = dw

    def run_parts(self,
                  this_capture_sked,
                  parsed_sked,
                  sked,
                  taxpayer_name="",
                  tax_period=""):
        for part_key in this_capture_sked['parts'].keys():
            stream_key = this_capture_sked['parts'][part_key]['stream_key']
            this_stream = self.output_streams[stream_key]
            part = None
            try:
                part = parsed_sked['schedule_parts'][part_key]
            except KeyError:
                continue

            capture_dict = this_capture_sked['parts'][part_key]

            row_data = {}
            row_data['form'] = sked
            row_data['source'] = part_key
            row_data['year'] = self.year
            row_data['taxpayer_name'] = taxpayer_name
            row_data['tax_period'] = tax_period

            for capture_key in capture_dict.keys():
                if capture_key == 'stream_key':
                    continue
                try:
                    val = part[capture_key]
                    csv_header = capture_dict[capture_key]['header']
                    row_data[csv_header] = val

                except KeyError:
                    try:
                        default = capture_dict[capture_key]['default']
                        csv_header = capture_dict[capture_key]['header']
                        row_data[csv_header] = default
                    except KeyError:
                        pass

            ## Composite keys: Not implemented here.

            ## We've gone through the whole part -- write it to file
            this_stream['writer'].writerow(row_data)

    def run_groups(self,
                   this_capture_sked,
                   parsed_sked,
                   sked,
                   taxpayer_name="",
                   tax_period=""):
        for group_key in this_capture_sked['groups'].keys():
            stream_key = this_capture_sked['groups'][group_key]['stream_key']
            this_stream = self.output_streams[stream_key]
            groups = None
            try:
                groups = parsed_sked['groups'][group_key]
            except KeyError:
                #print("No groups found for %s\n" % group_key)
                continue

            for group in groups:
                capture_dict = this_capture_sked['groups'][group_key]
                row_data = {}
                row_data['form'] = sked
                row_data['source'] = group_key
                row_data['year'] = self.year
                row_data['taxpayer_name'] = taxpayer_name
                row_data['tax_period'] = tax_period

                for capture_key in capture_dict.keys():
                    if capture_key == 'stream_key':
                        continue
                    try:
                        val = group[capture_key]
                        csv_header = capture_dict[capture_key]['header']
                        row_data[csv_header] = val

                    except KeyError:
                        try:
                            default = capture_dict[capture_key]['default']
                            csv_header = capture_dict[capture_key]['header']
                            row_data[csv_header] = default
                        except KeyError:
                            pass

                ## now look for "composite keys"
                composite_groups = None
                try:
                    composite_groups = capture_dict['composite']
                except KeyError:
                    pass

                # composite groups are summed up from existing vars, and need a default
                if composite_groups:
                    for composite_group_key in composite_groups.keys():
                        total = 0
                        for cg_part in composite_groups[
                                composite_group_key].keys():
                            try:
                                val = group[cg_part]
                                total += int(val)
                            except KeyError:
                                total += composite_groups[composite_group_key][
                                    cg_part]['default']
                        row_data[composite_group_key] = total

                ## We've gone through the whole group -- write it to file
                this_stream['writer'].writerow(row_data)

    def run_filing(self, filing, taxpayer_name="", tax_period=""):

        parsed_filing = self.xml_runner.run_filing(filing)
        schedule_list = parsed_filing.list_schedules()

        if (int(parsed_filing.get_version()[:4]) < 2013):
            print("Skipping pre-2013 schemas")
            return None

        for sked in self.data_capture_dict.keys():
            if sked in schedule_list:
                #print ("Running sked %s" % sked)
                parsed_skeds = parsed_filing.get_parsed_sked(sked)
                if parsed_skeds:
                    parsed_sked = parsed_skeds[0]
                else:
                    continue

                this_capture_sked = self.data_capture_dict[sked]

                ### Repeating Groups
                skip_groups = False
                try:
                    this_capture_sked['groups']
                except KeyError:
                    skip_groups = True
                if not skip_groups:
                    self.run_groups(this_capture_sked,
                                    parsed_sked,
                                    sked,
                                    taxpayer_name=taxpayer_name,
                                    tax_period=tax_period)

                ### Nonrepeating schedule parts
                skip_parts = False
                try:
                    this_capture_sked['parts']
                except KeyError:
                    skip_parts = True
                if not skip_parts:
                    self.run_parts(this_capture_sked,
                                   parsed_sked,
                                   sked,
                                   taxpayer_name=taxpayer_name,
                                   tax_period=tax_period)
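
StreamExtractor is driven entirely by the two dicts passed to its constructor (the docstring says they are specified in config.py). Judging from how _init_streams, run_parts, and run_groups index into them, they presumably have roughly the shape sketched below; every stream name, header, and capture key here is an illustrative guess, not the project's actual configuration:

# Illustrative shape of the two config dicts, inferred from how StreamExtractor
# reads them; all names below are hypothetical.
OUTPUT_STREAMS = {
    'contractors': {
        'filename': 'contractors_',   # _init_streams appends the year and ".csv"
        'headers': ['form', 'source', 'year', 'taxpayer_name', 'tax_period',
                    'contractor_name', 'compensation'],
    },
}

DATA_CAPTURE_DICT = {
    'IRS990': {                        # keyed by schedule name
        'groups': {
            'CntrctrCmpnstn': {        # hypothetical repeating-group key
                'stream_key': 'contractors',
                'CntrctrName': {'header': 'contractor_name', 'default': ''},
                'CmpnstnAmt': {'header': 'compensation', 'default': 0},
            },
        },
    },
}

extractor = StreamExtractor(OUTPUT_STREAMS, DATA_CAPTURE_DICT, 2017)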
Example #3
def federal_irs_ingest_get_990s(message, context):

    year = datetime.datetime.today().year

    # settings pulled from a database
    ref = db.collection('federal').document('irs').collection('990s').document(str(year))
    settings = ref.get().to_dict()
    if settings is not None:
        latest_saved_idx = settings['idx']
    else:
        latest_saved_idx = 0

    # prep load
    xml_runner = XMLRunner()
    start_time = time.time()
    bucket = client.get_bucket(gcp_project_id)
    blob = bucket.get_blob('downloads/federal/irs/index_' + str(year) + '.csv')
    blob = blob.download_as_string().decode('utf-8')
    blob = StringIO(blob)

    # load by looping through all the rows in the index
    actions = []
    failed_object_ids = []
    reader = csv.reader(blob, delimiter=',')
    next(reader) # skip header
    for idx, row in enumerate(reader):

        if time.time() - start_time > 520:
            break

        # skip previously indexed objects
        if idx < latest_saved_idx:
            continue

        # process the object id
        object_id = row[8]
        if int(object_id[:4]) < 2014: # can't process these
            continue

        # process the submission date
        sub_date = row[4]
        try:
            sub_date = datetime.datetime.strptime(sub_date, '%m/%d/%Y %I:%M:%S %p')
        except ValueError:
            try:
                sub_date = datetime.datetime.strptime(sub_date, '%m/%d/%Y')
            except ValueError:
                raise

        sub_date = pytz.timezone('US/Eastern').localize(sub_date)
        sub_date = sub_date.strftime("%Y-%m-%dT%H:%M:%S%z")

        # grab the filing
        try:
            filing = xml_runner.run_filing(object_id)
            schedules = filing.get_result()
        except (RuntimeError, InvalidXMLException) as e:
            logger.error('%s %s', object_id, str(e))
            failed_object_ids.append(object_id)
            continue

        if schedules is not None:

            xml = utilities.get_xml_parts(schedules)
            xml = utilities.clean_xml(xml)

            if 'IRS990EZ' in xml:
                index = '990ez'
            elif 'IRS990PF' in xml:
                index = '990pf'
            else:
                index = '990'

            actions.append({
                '_op_type': 'index',
                '_index': 'federal_irs_' + index,
                '_id': object_id,
                '_source': {
                    'row': {
                        'return_id': str(row[0]),
                        'filing_type': row[1],
                        'ein': str(row[2]),
                        'tax_period': row[3],
                        'sub_date': sub_date,
                        'taxpayer_name': row[5],
                        'return_type': str(row[6]),
                        'dln': str(row[7]),
                        'object_id': object_id
                    },
                    'obj': xml,
                    'context': {
                        'last_indexed': datetime.datetime.now(datetime.timezone.utc)
                    }
                }
            })

        if len(actions) >= 1000:
            helpers.bulk(es, actions)
            logger.info('ELASTICSEARCH UPDATED' + ' - ' + str(len(actions)) + ' docs')
            actions = []

    # index all docs into elasticsearch
    if actions:
        helpers.bulk(es, actions)
        logger.info('ELASTICSEARCH UPDATED' + ' - ' + str(len(actions)) + ' docs')

    # update Firestore
    update = {
        "idx": idx,
        "last_updated": datetime.datetime.now(datetime.timezone.utc)
    }
    if len(failed_object_ids) > 0:
        update['failed_object_ids'] = firestore.ArrayUnion(failed_object_ids)
    ref.set(update, merge=True)

    num_remaining_rows = len(list(reader))
    logger.info('FIRESTORE UPDATED - completed: ' + str(idx) + ', remaining: ' + str(num_remaining_rows))
    return num_remaining_rows
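
The row indexes used above (row[8] for the object id, row[4] for the submission date, and so on) imply the column layout of the yearly index CSV; the helper below simply makes that implied layout explicit. The function name is hypothetical and the mapping is inferred only from this example:

# Column layout of the yearly index CSV as implied by the row[...] indexes above.
# Helper name is hypothetical; the mapping is inferred from this example only.
def index_row_to_dict(row):
    return {
        'return_id': row[0],
        'filing_type': row[1],
        'ein': row[2],
        'tax_period': row[3],
        'sub_date': row[4],
        'taxpayer_name': row[5],
        'return_type': row[6],
        'dln': row[7],
        'object_id': row[8],
    }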
Example #4
def filing_990_historical(message, context):
    latest_saved_year = settings['latest_year_file']
    latest_saved_idx = settings['latest_index_in_file']
    failed_object_ids = settings['failed_object_ids']
    if latest_saved_year == 2010:
        return True
    xml_runner = XMLRunner()
    start_time = time.time()
    exit = False
    files = os.listdir('indexes')
    actions = []
    for _file in files:
        if _file != str(latest_saved_year) + '.csv':
            continue
        with open('indexes/' + _file, newline='\n') as csvfile:
            reader = csv.reader(csvfile, delimiter=',')
            next(reader)  # skip header
            for idx, row in enumerate(reader):
                if time.time() - start_time > 520:
                    exit = True
                    break
                if idx < latest_saved_idx:
                    continue
                object_id = row[-1]
                try:
                    filing = xml_runner.run_filing(object_id)
                except (RuntimeError, InvalidXMLException) as e:
                    failed_object_ids.append(object_id)
                    continue
                try:
                    schedules = filing.list_schedules()
                    if 'IRS990PF' in schedules:
                        org = org_from_990pf(filing)
                        grants_to_create = grants_from_990pf(filing)
                    elif 'IRS990EZ' in schedules:
                        org = org_from_990ez(filing)
                        grants_to_create = []
                    elif 'IRS990' in schedules:
                        org = org_from_990(filing)
                        grants_to_create = grants_from_990(filing)
                    else:
                        raise RuntimeError('No schedule available to parse.')
                except Exception as e:  # Exception already covers RuntimeError
                    failed_object_ids.append(object_id)
                    continue
                actions.append({
                    '_op_type': 'index',
                    '_index': 'irs-990-filing',
                    '_id': object_id,
                    '_source': json.dumps({
                        'org': org,
                        'grants': grants_to_create
                    })
                })
            else:
                latest_saved_year -= 1
        if exit:
            break
    if actions:
        helpers.bulk(es, actions)
        actions = []
        logger.info('ELASTICSEARCH UPDATED')
    settings['latest_year_file'] = latest_saved_year
    settings['latest_index_in_file'] = idx
    settings['failed_object_ids'] = failed_object_ids
    ref.set(settings)
    logger.info('FIRESTORE UPDATED')
    return True
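
This function (and example #8 below, a later variant of it) leans on module-level globals that are not shown: settings, ref, failed_object_ids, es, and the logger/helpers imports. A sketch of what that setup might look like; the document path, default values, and client constructors here are assumptions, not the original project's code:

# Hypothetical module-level setup that filing_990_historical relies on; the
# Firestore document path and default values are assumptions.
from google.cloud import firestore
from elasticsearch import Elasticsearch

db = firestore.Client()
es = Elasticsearch()

ref = db.collection('settings').document('filing_990_historical')  # hypothetical path
settings = ref.get().to_dict() or {
    'latest_year_file': 2019,
    'latest_index_in_file': 0,
    'failed_object_ids': [],
}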
Example #5
class Command(BaseCommand):
    help = '''
    Enter the filings, one by one.
    Loading is done in bulk, though status on the filings is updated one at a time.
    '''

    def add_arguments(self, parser):
        # Positional arguments
        parser.add_argument('year', nargs=1, type=int)

    def setup(self):
        # get an XMLRunner -- this is what actually does the parsing
        self.xml_runner = XMLRunner()
        self.accumulator = Accumulator()

    def process_sked(self, sked):
        """ Enter just one schedule """
        #print("Processing schedule %s" % sked['schedule_name'])
        for part in sked['schedule_parts'].keys():
            partname = part
            partdata = sked['schedule_parts'][part]
            #print("part %s %s" % (partname, partdata))

            self.accumulator.add_model(partname, partdata)

        for groupname in sked['groups'].keys():
            for groupdata in sked['groups'][groupname]:
                #print("group %s %s" % (groupname, groupdata) )
                self.accumulator.add_model(groupname, groupdata)

    def run_filing(self, filing):
        object_id = filing.object_id
        print("run_filing %s" % object_id)

        parsed_filing = self.xml_runner.run_filing(object_id)
        if not parsed_filing:
            print("Skipping filing %s (pre-2013 filings are skipped)" % filing)
            return None

        schedule_list = parsed_filing.list_schedules()
        #print("sked list is %s" % schedule_list)

        result = parsed_filing.get_result()

        keyerrors = parsed_filing.get_keyerrors()
        schema_version = parsed_filing.get_version()
        ## This could be disabled if we don't care about the schema version
        ## This is one save per loaded row...
        if filing.schema_version != schema_version:
            filing.schema_version = schema_version
            filing.save()

        if keyerrors:
            # If we find keyerrors--xpaths that are missing from our spec, note it
            print("Key error %s")
            has_keyerrors = len(keyerrors) > 0
            print("keyerror: %s" % keyerrors)
            filing.error_details = str(keyerrors)
            filing.key_error_count = len(keyerrors)
            filing.is_error = has_keyerrors
            filing.save()

        if result:
            for sked in result:
                self.process_sked(sked)
        else:
            print("Filing not parsed %s " % object_id)

    def handle(self, *args, **options):

        year = int(options['year'][0])
        if year not in [2014, 2015, 2016, 2017, 2018]:
            raise RuntimeError(
                "Illegal year `%s`. Please enter a year between 2014 and 2018"
                % year)

        print("Running filings during year %s" % year)
        self.setup()

        process_count = 0
        while True:
            filings = Filing.objects.filter(submission_year=year).exclude(
                parse_complete=True)[:100]
            if not filings:
                print("Done")
                break

            object_id_list = [f.object_id for f in filings]

            # record that processing has begun
            Filing.objects.filter(object_id__in=object_id_list).update(
                parse_started=True)

            for filing in filings:
                #print("Handling id %s" % filing.object_id)
                self.run_filing(filing)
                process_count += 1
                if process_count % 1000 == 0:
                    print("Handled %s filings" % process_count)

            # commit anything that's left
            self.accumulator.commit_all()
            # record that all are complete
            Filing.objects.filter(object_id__in=object_id_list).update(
                process_time=datetime.now(), parse_complete=True)
            print("Processed a total of %s filings" % process_count)
Example #6
def analyze990(filing_number):
    xml_runner = XMLRunner()
    parsed_filing = xml_runner.run_filing(filing_number)
    result = parsed_filing.get_csv_result()
    print(result)
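
Taken together, these examples rely on only a handful of calls on the parsed filing: list_schedules(), get_result(), get_parsed_sked(), get_version(), get_keyerrors(), and get_csv_result(). A minimal end-to-end sketch, assuming XMLRunner is imported from the irsx package (as in the irsx library) and using a hypothetical object id:

# Minimal end-to-end sketch using only calls that appear in the examples above.
# Assumes XMLRunner comes from the irsx package; the object id is hypothetical.
from irsx.xmlrunner import XMLRunner

xml_runner = XMLRunner()
parsed = xml_runner.run_filing('201533089349301428')  # hypothetical object id
print(parsed.get_version())      # schema version; first four chars are the year (see example #2)
print(parsed.list_schedules())   # schedule names present in the filing
for sked in parsed.get_result() or []:
    print(sked['schedule_name'], list(sked['schedule_parts'].keys()))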
Example #7
class DownloadWorker(Thread):
    def add_arguments(self, parser):
        # Positional arguments
        parser.add_argument('year', nargs=1, type=int)

    def setup(self):
        # get an XMLRunner -- this is what actually does the parsing
        self.xml_runner = XMLRunner()
        self.accumulator = Accumulator()

    def process_sked(self, sked):
        """ Enter just one schedule """
        #print("Processing schedule %s" % sked['schedule_name'])
        for part in sked['schedule_parts'].keys():
            partname = part
            partdata = sked['schedule_parts'][part]
            #print("part %s %s" % (partname, partdata))

            self.accumulator.add_model(partname, partdata)

        for groupname in sked['groups'].keys():
            for groupdata in sked['groups'][groupname]:
                #print("group %s %s" % (groupname, groupdata) )
                self.accumulator.add_model(groupname, groupdata)

    def run_filing(self, filing):
        #        print (filing)

        object_id = filing.object_id

        parsed_filing = self.xml_runner.run_filing(object_id)
        if not parsed_filing:
            print("Skipping filing %s (pre-2013 filings are skipped)" % filing)
            return None

        schedule_list = parsed_filing.list_schedules()
        # print("sked list is %s" % schedule_list)

        result = parsed_filing.get_result()

        keyerrors = parsed_filing.get_keyerrors()

        if keyerrors:
            # If we find keyerrors--xpaths that are missing from our spec, note it
            print("Key error %s")
            has_keyerrors = len(keyerrors) > 0
            print("keyerror: %s" % keyerrors)
            filing.error_details = str(keyerrors)
            filing.key_error_count = len(keyerrors)
            filing.is_error = has_keyerrors
            filing.save()

        if result:
            for sked in result:
                self.process_sked(sked)
        else:
            print("Filing not parsed %s " % object_id)

    def __init__(self, queue):
        Thread.__init__(self)
        self.queue = queue

    def run(self):
        self.xml_runner = XMLRunner()
        self.accumulator = Accumulator()
        while True:
            filing = self.queue.get()
            self.run_filing(filing)
            self.queue.task_done()
        connection.close()
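
DownloadWorker pulls filings off a queue in run(), so a driver only needs to start a few workers and feed the queue. A minimal wiring sketch; the worker count and the filings iterable (e.g. a Django queryset) are assumptions:

# Minimal sketch of driving DownloadWorker with a work queue. The worker count
# and the `filings` iterable are assumptions, not part of the original code.
from queue import Queue

queue = Queue()
for _ in range(4):            # arbitrary worker count
    worker = DownloadWorker(queue)
    worker.daemon = True      # run() loops forever, so let the process exit anyway
    worker.start()

for filing in filings:        # placeholder for however filings are selected
    queue.put(filing)

queue.join()                  # block until every queued filing has been processed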
Example #8
def filing_990_historical(message, context):
    today = datetime.datetime.today()
    year = today.year
    download_current_year_index(year)
    latest_saved_year = settings['latest_year_file']
    latest_saved_idx = settings['latest_index_in_file']
    failed_object_ids = settings['failed_object_ids']
    xml_runner = XMLRunner()
    start_time = time.time()
    actions = []
    with open('/tmp/' + str(year) + '.csv', newline='\n') as csvfile:
        reader = csv.reader(csvfile, delimiter=',')
        next(reader)  # skip header
        for idx, row in enumerate(reader):
            if time.time() - start_time > 520:
                break
            if idx < latest_saved_idx:
                continue
            object_id = row[-2]
            if object_id in saved_object_ids or object_id in failed_object_ids:
                continue
            try:
                filing = xml_runner.run_filing(object_id)
            except (RuntimeError, InvalidXMLException) as e:
                failed_object_ids.append(object_id)
                continue
            try:
                schedules = filing.list_schedules()
                if 'IRS990PF' in schedules:
                    org = org_from_990pf(filing)
                    grants_to_create = grants_from_990pf(filing)
                elif 'IRS990EZ' in schedules:
                    org = org_from_990ez(filing)
                    grants_to_create = []
                elif 'IRS990' in schedules:
                    org = org_from_990(filing)
                    grants_to_create = grants_from_990(filing)
                else:
                    raise RuntimeError('No schedule available to parse.')
            except Exception as e:  # Exception already covers RuntimeError
                failed_object_ids.append(object_id)
                continue
            actions.append({
                '_op_type': 'index',
                '_index': 'irs-990-filing',
                '_id': object_id,
                '_source': json.dumps({
                    'org': org,
                    'grants': grants_to_create
                })
            })
        else:
            if today.date() == datetime.date(day=31, month=12, year=year):  # compare date to date; a datetime never equals a date
                latest_saved_year += 1
    if actions:
        helpers.bulk(es, actions)
        actions = []
        logger.info('ELASTICSEARCH UPDATED')
    settings['latest_year_file'] = latest_saved_year
    settings['latest_index_in_file'] = idx
    settings['failed_object_ids'] = failed_object_ids
    ref.set(settings)
    logger.info('FIRESTORE UPDATED')
    return True