def generate_d_file(self):
        """ Write file D1 or D2 to an appropriate CSV. """
        log_data = {'message': 'Starting file {} generation'.format(self.job.file_type.letter_name),
                    'message_type': 'ValidatorInfo', 'job_id': self.job.job_id, 'agency_code': self.agency_code,
                    'file_type': self.job.file_type.letter_name, 'start_date': self.job.start_date,
                    'end_date': self.job.end_date, 'filename': self.job.original_filename}
        if self.job.submission_id:
            log_data['submission_id'] = self.job.submission_id
        logger.info(log_data)

        # Get or create a FileRequest for this generation
        current_date = datetime.now().date()
        file_request_params = {
            "job_id": self.job.job_id, "is_cached_file": True, "start_date": self.job.start_date,
            "end_date": self.job.end_date, "agency_code": self.agency_code, "file_type": self.job.file_type.letter_name,
            "agency_type": self.agency_type
        }

        file_request = self.sess.query(FileRequest).filter_by(**file_request_params).one_or_none()
        if not file_request:
            file_request_params["request_date"] = current_date
            file_request = FileRequest(**file_request_params)
            self.sess.add(file_request)
            self.sess.commit()

        # Mark this Job as not from-cache, and mark the FileRequest as the cached version (requested today)
        self.job.from_cached = False
        file_request.is_cached_file = True
        file_request.request_date = current_date
        self.sess.commit()

        # Prepare file data
        file_utils = fileD1 if self.job.file_type.letter_name == 'D1' else fileD2
        local_file = "".join([CONFIG_BROKER['d_file_storage_path'], self.job.original_filename])
        headers = [key for key in file_utils.mapping]
        query_utils = {"file_utils": file_utils, "agency_code": self.agency_code, "agency_type": self.agency_type,
                       "start": self.job.start_date, "end": self.job.end_date, "sess": self.sess}

        # Generate the file and put in S3
        write_query_to_file(local_file, self.job.filename, headers, self.job.file_type.letter_name, self.is_local,
                            d_file_query, query_utils)
        log_data['message'] = 'Finished writing to file: {}'.format(self.job.original_filename)
        logger.info(log_data)

def generate_a_file(self):
        """ Write file A to an appropriate CSV. """
        log_data = {'message': 'Starting file A generation', 'message_type': 'ValidatorInfo', 'job_id': self.job.job_id,
                    'agency_code': self.agency_code, 'file_type': self.job.file_type.letter_name,
                    'start_date': self.job.start_date, 'end_date': self.job.end_date,
                    'filename': self.job.original_filename}
        logger.info(log_data)

        local_file = "".join([CONFIG_BROKER['d_file_storage_path'], self.job.original_filename])
        headers = [key for key in fileA.mapping]
        # add 3 months to account for fiscal year
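        # e.g. an end date in December 2018 becomes period 3 of fiscal year 2019 (the FY that began October 2018)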
        period_date = self.job.end_date + relativedelta(months=3)
        query_utils = {"agency_code": self.agency_code, "period": period_date.month, "year": period_date.year,
                       "sess": self.sess}

        # Generate the file and put in S3
        write_query_to_file(local_file, self.job.filename, headers, self.job.file_type.letter_name, self.is_local,
                            a_file_query, query_utils)
        log_data['message'] = 'Finished writing to file: {}'.format(self.job.original_filename)
        logger.info(log_data)

def generate_f_file(self):
        """ Write rows from fileF.generate_f_rows to an appropriate CSV. """
        log_data = {'message': 'Starting file F generation', 'message_type': 'ValidatorInfo', 'job_id': self.job.job_id,
                    'submission_id': self.job.submission_id, 'file_type': 'sub_award'}
        logger.info(log_data)

        f_file_contracts_query, f_file_grants_query = fileF.generate_f_file_queries(self.job.submission_id)

        # writing locally first without uploading
        log_data['message'] = 'Writing F file contracts to CSV: {}'.format(self.job.original_filename)
        logger.info(log_data)
        local_f_file = self.job.filename if self.is_local else self.job.original_filename
        write_query_to_file(self.sess, f_file_contracts_query, local_f_file, generate_headers=True,
                            generate_string=False)

        # writing locally again but then uploading
        log_data['message'] = 'Writing F file grants to CSV: {}'.format(self.job.original_filename)
        logger.info(log_data)
        write_stream_query(self.sess, f_file_grants_query, self.job.original_filename, self.job.filename,
                           self.is_local, generate_headers=False, generate_string=False)

        log_data['message'] = 'Finished writing F file CSV: {}'.format(self.job.original_filename)
        logger.info(log_data)
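
These generators are written as instance methods (note the self.job, self.sess, self.agency_code and
self.agency_type attributes they rely on), so they are driven by whatever object owns the upload Job.
A minimal sketch of such a driver is below; the FileGenerationManager name and constructor arguments
are assumptions for illustration, not taken from this snippet:

def run_generation(sess, job, agency_code, agency_type=None, is_local=True):
    # Hypothetical wrapper class and constructor -- name and signature are assumed.
    manager = FileGenerationManager(sess=sess, job=job, agency_code=agency_code,
                                    agency_type=agency_type, is_local=is_local)
    letter = job.file_type.letter_name
    if letter in ('D1', 'D2'):
        manager.generate_d_file()
    elif letter == 'A':
        manager.generate_a_file()
    elif letter == 'F':
        manager.generate_f_file()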
Example #5
def export_state_congr_table(sess):
    """ Export the current state of the state congressional table to a file and upload to the public S3 bucket

        Args:
            sess: the database connection
    """
    state_congr_filename = 'state_congressional.csv'

    logger.info("Exporting state_congressional table to {}".format(
        state_congr_filename))
    query = sess.query(
        StateCongressional.state_code,
        StateCongressional.congressional_district_no,
        StateCongressional.census_year).filter(
            StateCongressional.congressional_district_no.isnot(None))
    write_query_to_file(sess, query, state_congr_filename)

    logger.info("Uploading {} to {}".format(
        state_congr_filename, CONFIG_BROKER["public_files_bucket"]))
    s3 = boto3.client('s3', region_name=CONFIG_BROKER['aws_region'])
    s3.upload_file(state_congr_filename,
                   CONFIG_BROKER["public_files_bucket"],
                   'broker_reference_data/state_congressional.csv')
    os.remove(state_congr_filename)
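
A minimal sketch of how this export might be invoked from a script, assuming the broker's usual
GlobalDB session factory (the import path is an assumption here, not shown in this snippet):

if __name__ == '__main__':
    # Assumed session setup; swap in however the surrounding project obtains a SQLAlchemy session.
    from dataactcore.interfaces.db import GlobalDB
    sess = GlobalDB.db().session
    export_state_congr_table(sess)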
Example #6
def pull_offices(sess, filename, update_db, pull_all, updated_date_from,
                 export_office, metrics):
    """ Pull Office data from the Federal Hierarchy API and update the DB, return it as a file, or both.

        Args:
            sess: Current DB session.
            filename: Name of the file to be generated with the API data. If None, no file will be created.
            update_db: Boolean; update the DB tables with the new data from the API.
            pull_all: Boolean; pull all historical data, instead of just the latest.
            updated_date_from: Date to pull data from. Defaults to the date of the most recently updated Office.
            export_office: when provided, name of the file to export the office list to
            metrics: an object containing information for the metrics file
    """
    logger.info('Starting feed: %s',
                API_URL.replace(CONFIG_BROKER['sam']['api_key'], '[API_KEY]'))
    top_sub_levels = ['1', '2']
    office_levels = ['3', '4', '5', '6', '7']
    levels = top_sub_levels + office_levels if filename else office_levels

    if filename:
        logger.info('Creating a file ({}) with the data from this pull'.format(
            filename))
        # Write headers to file
        file_headers = [
            'fhorgid', 'fhorgname', 'fhorgtype', 'description', 'level',
            'status', 'region', 'categoryid', 'effectivestartdate',
            'effectiveenddate', 'createdby', 'createddate', 'updatedby',
            'lastupdateddate', 'fhdeptindagencyorgid', 'fhagencyorgname',
            'agencycode', 'oldfpdsofficecode', 'aacofficecode',
            'cgaclist_0_cgac', 'cgaclist_1_cgac', 'cgaclist_2_cgac',
            'cgaclist_3_cgac', 'cgaclist_4_cgac',
            'fhorgofficetypelist_0_officetype',
            'fhorgofficetypelist_0_officetypestartdate',
            'fhorgofficetypelist_0_officetypeenddate',
            'fhorgofficetypelist_1_officetype',
            'fhorgofficetypelist_1_officetypestartdate',
            'fhorgofficetypelist_1_officetypeenddate',
            'fhorgofficetypelist_2_officetype',
            'fhorgofficetypelist_2_officetypestartdate',
            'fhorgofficetypelist_2_officetypeenddate',
            'fhorgofficetypelist_3_officetype',
            'fhorgofficetypelist_3_officetypestartdate',
            'fhorgofficetypelist_3_officetypeenddate',
            'fhorgaddresslist_0_city', 'fhorgaddresslist_0_state',
            'fhorgaddresslist_0_country_code',
            'fhorgaddresslist_0_addresstype', 'fhorgnamehistory_0_fhorgname',
            'fhorgnamehistory_0_effectivedate',
            'fhorgparenthistory_0_fhfullparentpathid',
            'fhorgparenthistory_0_fhfullparentpathname',
            'fhorgparenthistory_0_effectivedate', 'links_0_href',
            'links_0_rel', 'links_1_href', 'links_1_rel', 'links_2_href',
            'links_2_rel'
        ]
        with open(filename, 'w+') as f:
            csv_writer = csv.writer(f,
                                    delimiter=',',
                                    quotechar='"',
                                    quoting=csv.QUOTE_ALL)
            csv_writer.writerow(file_headers)

    empty_pull_count = 0
    for level in levels:
        # Create URL with the level parameter
        url_with_params = '{}&level={}'.format(API_URL, level)

        # Add updateddatefrom and status parameters to the URL
        if not pull_all:
            url_with_params += '&updateddatefrom={}&status=all'.format(
                updated_date_from)

        # Retrieve the total count of expected records for this pull
        total_expected_records = get_with_exception_hand(
            url_with_params)['totalrecords']
        metrics['level_{}_records'.format(str(level))] = total_expected_records
        logger.info('{} level-{} record(s) expected'.format(
            str(total_expected_records), str(level)))
        if total_expected_records == 0:
            empty_pull_count += 1
            continue

        limit = 100
        entries_processed = 0
        while True:
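            # Each pass fans out REQUESTS_AT_ONCE concurrent GETs, each requesting up to `limit`
            # records at consecutive offsets, so at most limit * REQUESTS_AT_ONCE new records are
            # pulled per iteration of this loop.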

            async def _fed_hierarchy_async_get(entries_already_processed):
                response_list = []
                loop = asyncio.get_event_loop()
                futures = [
                    loop.run_in_executor(
                        None, get_with_exception_hand,
                        '{}&limit={}&offset={}'.format(
                            url_with_params, str(limit),
                            str(entries_already_processed +
                                (start_offset * limit))))
                    for start_offset in range(REQUESTS_AT_ONCE)
                ]
                for response in await asyncio.gather(*futures):
                    response_list.append(response)
                return response_list

            # End async get requests def

            # Retrieve limit*REQUESTS_AT_ONCE records from the API
            loop = asyncio.get_event_loop()
            full_response = loop.run_until_complete(
                _fed_hierarchy_async_get(entries_processed))

            # Create an object with all the data from the API
            dataframe = pd.DataFrame()
            offices = {}
            inactive_offices = []
            start = entries_processed + 1
            for response_dict in full_response:
                # Process the entry if it isn't an error
                for org in response_dict.get('orglist', []):
                    entries_processed += 1

                    # Add to the file data structure
                    if filename:
                        row = json_normalize(flatten_json(org))
                        dataframe = dataframe.append(row)

                    # Don't process the top_sub_levels, but store them in the fed hierarchy export
                    if level in top_sub_levels:
                        continue

                    # Add to the list of DB objects
                    if update_db:
                        # trim incoming values
                        org = trim_nested_obj(org)

                        # If it's inactive, we don't need all that craziness below, we just need to know which code
                        # to delete
                        if org['status'] == 'INACTIVE':
                            inactive_offices.append(org.get('aacofficecode'))
                            continue

                        agency_code = get_normalized_agency_code(
                            org.get('cgaclist', [{
                                'cgac': None
                            }])[0]['cgac'], org.get('agencycode'))
                        # TEMPORARILY REPLACE Navy, Army, AND Air Force WITH DOD
                        if agency_code in ['017', '021', '057']:
                            agency_code = '097'
                        if not org.get('aacofficecode') or not org.get(
                                'agencycode') or not agency_code:
                            # Item from Fed Hierarchy is missing necessary data, ignore it
                            continue
                        # store all the cgacs/subtiers loaded in from this run, to be filtered later
                        metrics['missing_cgacs'].append(agency_code)
                        metrics['missing_subtier_codes'].append(
                            org.get('agencycode'))
                        new_office = Office(
                            office_code=org.get('aacofficecode'),
                            office_name=org.get('fhorgname'),
                            sub_tier_code=org.get('agencycode'),
                            agency_code=agency_code,
                            contract_funding_office=False,
                            contract_awards_office=False,
                            financial_assistance_awards_office=False,
                            financial_assistance_funding_office=False)

                        for off_type in org.get('fhorgofficetypelist', []):
                            office_type = off_type['officetype'].lower(
                            ).replace(' ', '_')
                            if office_type in [
                                    'contract_funding', 'contract_awards',
                                    'financial_assistance_awards',
                                    'financial_assistance_funding'
                            ]:
                                setattr(new_office, office_type + '_office',
                                        True)

                        offices[org.get('aacofficecode')] = new_office

            if filename and len(dataframe.index) > 0:
                # Ensure headers are handled correctly
                for header in list(dataframe.columns.values):
                    if header not in file_headers:
                        file_headers.append(header)
                        logger.info('Headers missing column: %s', header)

                # Write to file
                with open(filename, 'a') as f:
                    dataframe.to_csv(f,
                                     index=False,
                                     header=False,
                                     columns=file_headers)

            if update_db:
                # combine both lists of offices to determine what offices to delete, only active ones will be re-added
                office_codes = set(offices.keys()).union(set(inactive_offices))
                sess.query(Office).filter(
                    Office.office_code.in_(office_codes)).delete(
                        synchronize_session=False)
                sess.add_all(offices.values())

            logger.info('Processed rows %s-%s', start, entries_processed)
            if entries_processed == total_expected_records:
                # Feed has finished
                break

            if entries_processed > total_expected_records:
                # We have somehow retrieved more records than existed at the beginning of the pull
                logger.error(
                    'Total expected records: {}, Number of records retrieved: {}'
                    .format(total_expected_records, entries_processed))
                sys.exit(2)

    if update_db:
        sess.commit()

    if export_office:
        logger.info(
            'Creating a file ({}) with the data from the database'.format(
                export_office))
        all_offices = sess.query(Office)
        write_query_to_file(sess,
                            all_offices,
                            export_office,
                            generate_headers=True)

    if empty_pull_count == len(levels):
        logger.error('No records retrieved from the Federal Hierarchy API')
        sys.exit(3)

    logger.info('Finished')
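
A sketch of a typical call, assuming an open database session; the metrics dict shape is inferred
from how the function populates it above, and the literal values are illustrative only:

metrics = {'missing_cgacs': [], 'missing_subtier_codes': []}
pull_offices(sess, filename='federal_hierarchy_offices.csv', update_db=True, pull_all=False,
             updated_date_from='2019-01-01', export_office=None, metrics=metrics)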
Example #7
def generate_d_file(sess, job, agency_code, is_local=True, old_filename=None):
    """ Write file D1 or D2 to an appropriate CSV.

        Args:
            sess: Current database session
            job: Upload Job
            agency_code: FREC or CGAC code for generation
            is_local: True if in local development, False otherwise
            old_filename: Previous version of filename, in cases where reverting to old file is necessary
    """
    log_data = {
        'message_type': 'ValidatorInfo',
        'job_id': job.job_id,
        'file_type': job.file_type.letter_name,
        'agency_code': agency_code,
        'start_date': job.start_date,
        'end_date': job.end_date
    }
    if job.submission_id:
        log_data['submission_id'] = job.submission_id

    # find current date and date of last FPDS pull
    current_date = datetime.now().date()
    last_update = sess.query(FPDSUpdate).one_or_none()
    fpds_date = last_update.update_date if last_update else current_date

    # check if FileRequest already exists with this job_id, if not, create one
    file_request = sess.query(FileRequest).filter(
        FileRequest.job_id == job.job_id).one_or_none()
    if not file_request:
        file_request = FileRequest(request_date=current_date,
                                   job_id=job.job_id,
                                   start_date=job.start_date,
                                   end_date=job.end_date,
                                   agency_code=agency_code,
                                   is_cached_file=False,
                                   file_type=job.file_type.letter_name)
        sess.add(file_request)

    # determine if anything needs to be done at all
    exists = file_request.is_cached_file
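    # A cached D1 file goes stale once FPDS has been re-pulled after the request date;
    # a cached D2 (or an up-to-date D1) can be reused as-is.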
    if exists and not (job.file_type.letter_name == 'D1'
                       and file_request.request_date < fpds_date):
        # this is the up-to-date cached version of the generated file
        # reset the file names on the upload Job
        log_data[
            'message'] = '{} file has already been generated by this job'.format(
                job.file_type.letter_name)
        logger.info(log_data)

        filepath = CONFIG_BROKER['broker_files'] if is_local else "".join(
            [str(job.submission_id), "/"])
        job.filename = "".join([filepath, old_filename])
        job.original_filename = old_filename
        job.from_cached = False

        if job.submission_id:
            # reset the file names on the validation job
            val_job = sess.query(Job).filter(
                Job.submission_id == job.submission_id,
                Job.file_type_id == job.file_type_id, Job.job_type_id ==
                JOB_TYPE_DICT['csv_record_validation']).one_or_none()
            if val_job:
                val_job.filename = "".join([filepath, old_filename])
                val_job.original_filename = old_filename
        sess.commit()
    else:
        # search for potential parent FileRequests
        parent_file_request = None
        if not exists:
            # attempt to retrieve a parent request
            parent_query = sess.query(FileRequest).\
                filter(FileRequest.file_type == job.file_type.letter_name, FileRequest.start_date == job.start_date,
                       FileRequest.end_date == job.end_date, FileRequest.agency_code == agency_code,
                       FileRequest.is_cached_file.is_(True))

            # filter D1 FileRequests by the date of the last FPDS pull
            if job.file_type.letter_name == 'D1':
                parent_query = parent_query.filter(
                    FileRequest.request_date >= fpds_date)

            # mark FileRequest with parent job_id
            parent_file_request = parent_query.one_or_none()
            file_request.parent_job_id = parent_file_request.job_id if parent_file_request else None
        sess.commit()

        if parent_file_request:
            # parent exists; copy parent data to this job
            copy_parent_file_request_data(sess, file_request.job,
                                          parent_file_request.job, is_local)
        else:
            # no cached file, or cached file is out-of-date
            log_data['message'] = 'Starting file {} generation'.format(
                job.file_type.letter_name)
            log_data['file_name'] = job.original_filename
            logger.info(log_data)

            # mark this Job as not from-cache, and mark the FileRequest as the cached version (requested today)
            job.from_cached = False
            file_request.is_cached_file = True
            file_request.request_date = current_date
            sess.commit()

            # actually generate the file
            file_utils = fileD1 if job.file_type.letter_name == 'D1' else fileD2
            local_file = "".join(
                [CONFIG_BROKER['d_file_storage_path'], job.original_filename])
            headers = [key for key in file_utils.mapping]
            query_utils = {
                "file_utils": file_utils,
                "agency_code": agency_code,
                "start": job.start_date,
                "end": job.end_date,
                "sess": sess
            }
            write_query_to_file(local_file, job.filename, headers,
                                job.file_type.letter_name, is_local,
                                d_file_query, query_utils)
            log_data['message'] = 'Finished writing to file: {}'.format(
                job.original_filename)
            logger.info(log_data)

    log_data['message'] = 'Finished file {} generation'.format(
        job.file_type.letter_name)
    logger.info(log_data)
Example #8
def pull_offices(sess, filename, update_db, pull_all, updated_date_from,
                 export_office, metrics):
    """ Pull Office data from the Federal Hierarchy API and update the DB, return it as a file, or both.

        Args:
            sess: Current DB session.
            filename: Name of the file to be generated with the API data. If None, no file will be created.
            update_db: Boolean; update the DB tables with the new data from the API.
            pull_all: Boolean; pull all historical data, instead of just the latest.
            updated_date_from: Date to pull data from. Defaults to the date of the most recently updated Office.
            export_office: when provided, name of the file to export the office list to
            metrics: an object containing information for the metrics file
    """
    logger.info(
        'Starting feed: %s',
        API_URL.replace(CONFIG_BROKER['sam']['federal_hierarchy_api_key'],
                        "[API_KEY]"))
    top_sub_levels = ["1", "2"]
    office_levels = ["3", "4", "5", "6", "7"]
    levels = top_sub_levels + office_levels if filename else office_levels

    if filename:
        logger.info("Creating a file ({}) with the data from this pull".format(
            filename))
        # Write headers to file
        file_headers = [
            "fhorgid", "fhorgname", "fhorgtype", "description", "level",
            "status", "region", "categoryid", "effectivestartdate",
            "effectiveenddate", "createdby", "createddate", "updatedby",
            "lastupdateddate", "fhdeptindagencyorgid", "fhagencyorgname",
            "agencycode", "oldfpdsofficecode", "aacofficecode",
            "cgaclist_0_cgac", "fhorgofficetypelist_0_officetype",
            "fhorgofficetypelist_0_officetypestartdate",
            "fhorgofficetypelist_0_officetypeenddate",
            "fhorgofficetypelist_1_officetype",
            "fhorgofficetypelist_1_officetypestartdate",
            "fhorgofficetypelist_1_officetypeenddate",
            "fhorgofficetypelist_2_officetype",
            "fhorgofficetypelist_2_officetypestartdate",
            "fhorgofficetypelist_2_officetypeenddate",
            "fhorgaddresslist_0_city", "fhorgaddresslist_0_state",
            "fhorgaddresslist_0_country_code",
            "fhorgaddresslist_0_addresstype", "fhorgnamehistory_0_fhorgname",
            "fhorgnamehistory_0_effectivedate",
            "fhorgparenthistory_0_fhfullparentpathid",
            "fhorgparenthistory_0_fhfullparentpathname",
            "fhorgparenthistory_0_effectivedate", "links_0_href",
            "links_0_rel", "links_1_href", "links_1_rel", "links_2_href",
            "links_2_rel"
        ]
        with open(filename, 'w+') as f:
            csv_writer = csv.writer(f,
                                    delimiter=',',
                                    quotechar='"',
                                    quoting=csv.QUOTE_ALL)
            csv_writer.writerow(file_headers)

    empty_pull_count = 0
    for level in levels:
        # Create URL with the level parameter
        url_with_params = "{}&level={}".format(API_URL, level)

        # Add updateddatefrom parameter to the URL
        if not pull_all:
            url_with_params += "&updateddatefrom={}".format(updated_date_from)

        # Retrieve the total count of expected records for this pull
        total_expected_records = json.loads(
            requests.get(url_with_params, timeout=60).text)['totalrecords']
        metrics['level_{}_records'.format(str(level))] = total_expected_records
        logger.info('{} level-{} record(s) expected'.format(
            str(total_expected_records), str(level)))
        if total_expected_records == 0:
            empty_pull_count += 1
            continue

        limit = 100
        entries_processed = 0
        while True:

            async def _fed_hierarchy_async_get(entries_already_processed):
                response_list = []
                loop = asyncio.get_event_loop()
                futures = [
                    loop.run_in_executor(
                        None, get_with_exception_hand,
                        "{}&limit={}&offset={}".format(
                            url_with_params, str(limit),
                            str(entries_already_processed +
                                (start_offset * limit))))
                    for start_offset in range(REQUESTS_AT_ONCE)
                ]
                for response in await asyncio.gather(*futures):
                    response_list.append(response.text)
                return response_list

            # End async get requests def

            # Retrieve limit*REQUESTS_AT_ONCE records from the API
            loop = asyncio.get_event_loop()
            full_response = loop.run_until_complete(
                _fed_hierarchy_async_get(entries_processed))

            # Create an object with all the data from the API
            dataframe = pd.DataFrame()
            offices = {}
            start = entries_processed + 1
            for next_resp in full_response:
                response_dict = json.loads(next_resp)

                # We get errors back as regular JSON, need to catch them somewhere
                if response_dict.get('error'):
                    err = response_dict.get('error')
                    logger.error(
                        "An error of type {} occurred. Message: {}".format(
                            err['code'], err['message']))
                    sys.exit(2)

                # Process the entry if it isn't an error
                for org in response_dict.get('orglist', []):
                    entries_processed += 1

                    # Add to the file data structure
                    if filename:
                        row = json_normalize(flatten_json(org))
                        dataframe = dataframe.append(row)

                    # Don't process the top_sub_levels, but store them in the fed hierarchy export
                    if level in top_sub_levels:
                        continue

                    # Add to the list of DB objects
                    if update_db:
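                        # Derive a top-tier agency code from the office's first CGAC entry
                        # (if any) and its subtier agency code.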
                        agency_code = get_normalized_agency_code(
                            org.get('cgaclist', [{
                                'cgac': None
                            }])[0]['cgac'], org.get('agencycode'))
                        # TEMPORARILY REPLACE Navy, Army, AND Air Force WITH DOD
                        if agency_code in ['017', '021', '057']:
                            agency_code = '097'
                        if not org.get('aacofficecode') or not org.get(
                                'agencycode') or not agency_code:
                            # Item from Fed Hierarchy is missing necessary data, ignore it
                            continue

                        new_office = Office(
                            office_code=org.get('aacofficecode'),
                            office_name=org.get('fhorgname'),
                            sub_tier_code=org.get('agencycode'),
                            agency_code=agency_code,
                            contract_funding_office=False,
                            contract_awards_office=False,
                            financial_assistance_awards_office=False,
                            financial_assistance_funding_office=False)

                        for off_type in org.get('fhorgofficetypelist', []):
                            office_type = off_type['officetype'].lower(
                            ).replace(" ", "_")
                            if office_type in [
                                    'contract_funding', 'contract_awards',
                                    'financial_assistance_awards',
                                    'financial_assistance_funding'
                            ]:
                                setattr(new_office, office_type + '_office',
                                        True)

                        offices[org.get('aacofficecode')] = new_office

            if filename and len(dataframe.index) > 0:
                # Ensure headers are handled correctly
                for header in list(dataframe.columns.values):
                    if header not in file_headers:
                        file_headers.append(header)
                        logger.info("Headers missing column: %s", header)

                # Write to file
                with open(filename, 'a') as f:
                    dataframe.to_csv(f,
                                     index=False,
                                     header=False,
                                     columns=file_headers)

            if update_db:
                office_codes = set(offices.keys())
                sess.query(Office).filter(
                    Office.office_code.in_(office_codes)).delete(
                        synchronize_session=False)
                sess.add_all(offices.values())

            logger.info("Processed rows %s-%s", start, entries_processed)
            if entries_processed == total_expected_records:
                # Feed has finished
                break

            if entries_processed > total_expected_records:
                # We have somehow retrieved more records than existed at the beginning of the pull
                logger.error(
                    "Total expected records: {}, Number of records retrieved: {}"
                    .format(total_expected_records, entries_processed))
                sys.exit(2)

    if update_db:
        sess.commit()

    if export_office:
        logger.info(
            "Creating a file ({}) with the data from the database".format(
                export_office))
        all_offices = sess.query(Office)
        write_query_to_file(sess,
                            all_offices,
                            export_office,
                            generate_headers=True)

    if empty_pull_count == len(levels):
        logger.error("No records retrieved from the Federal Hierarchy API")
        sys.exit(3)

    logger.info("Finished")


def generate_d_file(file_type,
                    agency_code,
                    start,
                    end,
                    job_id,
                    upload_name,
                    is_local,
                    submission_id=None):
    """Write file D1 or D2 to an appropriate CSV.

        Args:
            file_type - File type as either "D1" or "D2"
            agency_code - FREC or CGAC code for generation
            start - Beginning of period for D file
            end - End of period for D file
            job_id - Job ID for upload job
            upload_name - File key to use on S3
            is_local - True if in local development, False otherwise
    """
    log_data = {
        'message_type': 'BrokerInfo',
        'job_id': job_id,
        'file_type': FILE_TYPE_DICT_LETTER_NAME[file_type],
        'agency_code': agency_code,
        'start_date': start,
        'end_date': end
    }
    if submission_id:
        log_data['submission_id'] = submission_id

    with job_context(job_id, is_local) as sess:
        current_date = datetime.now().date()

        # check if FileRequest already exists with this job_id, if not, create one
        file_request = sess.query(FileRequest).filter(
            FileRequest.job_id == job_id).one_or_none()
        if not file_request:
            file_request = FileRequest(request_date=current_date,
                                       job_id=job_id,
                                       start_date=start,
                                       end_date=end,
                                       agency_code=agency_code,
                                       file_type=file_type,
                                       is_cached_file=False)
            sess.add(file_request)

        # search for potential parent FileRequests
        parent_file_request = None
        if not file_request.is_cached_file:
            parent_request_query = sess.query(FileRequest).\
                filter(FileRequest.file_type == file_type, FileRequest.start_date == start, FileRequest.end_date == end,
                       FileRequest.agency_code == agency_code, FileRequest.is_cached_file.is_(True))

            # filter D1 FileRequests by the date of the last FPDS pull
            if file_type == 'D1':
                last_update = sess.query(FPDSUpdate).one_or_none()
                fpds_date = last_update.update_date if last_update else current_date
                parent_request_query = parent_request_query.filter(
                    FileRequest.request_date >= fpds_date)

            # mark FileRequest with parent job_id
            parent_file_request = parent_request_query.one_or_none()
            file_request.parent_job_id = parent_file_request.job_id if parent_file_request else None
        sess.commit()

        if file_request.is_cached_file:
            # this is the cached file, no need to do anything
            log_data[
                'message'] = '{} file has already been generated by this job'.format(
                    file_type)
            logger.info(log_data)
        elif parent_file_request:
            # copy parent data to this job if parent is not still running
            if parent_file_request.job.job_status_id != JOB_STATUS_DICT[
                    'running']:
                copy_parent_file_request_data(sess, file_request.job,
                                              parent_file_request.job,
                                              file_type, is_local)
        else:
            # no cached file
            file_name = upload_name.split('/')[-1]
            log_data['message'] = 'Starting file {} generation'.format(
                file_type)
            log_data['file_name'] = file_name
            logger.info(log_data)

            file_utils = fileD1 if file_type == 'D1' else fileD2
            local_filename = "".join(
                [CONFIG_BROKER['d_file_storage_path'], file_name])
            headers = [key for key in file_utils.mapping]

            # actually generate the file
            query_utils = {
                "file_utils": file_utils,
                "agency_code": agency_code,
                "start": start,
                "end": end,
                "sess": sess
            }
            write_query_to_file(local_filename, upload_name, headers,
                                file_type, is_local, d_file_query, query_utils)

            # mark this FileRequest as the cached version
            file_request.is_cached_file = True
            sess.commit()

            log_data['message'] = 'Finished writing to file: {}'.format(
                file_name)
            logger.info(log_data)
    log_data['message'] = 'Finished file {} generation'.format(file_type)
    logger.info(log_data)
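
A sketch of how this job_context-based version might be kicked off; all literal values are
illustrative, and the start/end format simply mirrors whatever the upload Job stores:

generate_d_file('D1', agency_code='097', start='01/01/2019', end='03/31/2019',
                job_id=1234, upload_name='1234/D1_example.csv', is_local=True)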


def pull_offices(sess, filename, update_db, pull_all, updated_date_from, export_office):
    """ Pull Office data from the Federal Hierarchy API and update the DB, return it as a file, or both.

        Args:
            sess: Current DB session.
            filename: Name of the file to be generated with the API data. If None, no file will be created.
            update_db: Boolean; update the DB tables with the new data from the API.
            pull_all: Boolean; pull all historical data, instead of just the latest.
            updated_date_from: Date to pull data from. Defaults to the date of the most recently updated Office.
    """
    logger.info('Starting feed: %s', API_URL.replace(CONFIG_BROKER['sam']['federal_hierarchy_api_key'], "[API_KEY]"))
    top_sub_levels = ["1", "2"]
    office_levels = ["3", "4", "5", "6", "7"]
    levels = top_sub_levels + office_levels if filename else office_levels

    if filename:
        logger.info("Creating a file ({}) with the data from this pull".format(filename))
        # Write headers to file
        file_headers = [
            "fhorgid", "fhorgname", "fhorgtype", "description", "level", "status", "region", "categoryid",
            "effectivestartdate", "effectiveenddate", "createdby", "createddate", "updatedby", "lastupdateddate",
            "fhdeptindagencyorgid", "fhagencyorgname", "agencycode", "oldfpdsofficecode", "aacofficecode",
            "cgaclist_0_cgac", "fhorgofficetypelist_0_officetype", "fhorgofficetypelist_0_officetypestartdate",
            "fhorgofficetypelist_0_officetypeenddate", "fhorgofficetypelist_1_officetype",
            "fhorgofficetypelist_1_officetypestartdate", "fhorgofficetypelist_1_officetypeenddate",
            "fhorgofficetypelist_2_officetype", "fhorgofficetypelist_2_officetypestartdate",
            "fhorgofficetypelist_2_officetypeenddate", "fhorgaddresslist_0_city", "fhorgaddresslist_0_state",
            "fhorgaddresslist_0_country_code", "fhorgaddresslist_0_addresstype", "fhorgnamehistory_0_fhorgname",
            "fhorgnamehistory_0_effectivedate", "fhorgparenthistory_0_fhfullparentpathid",
            "fhorgparenthistory_0_fhfullparentpathname", "fhorgparenthistory_0_effectivedate", "links_0_href",
            "links_0_rel", "links_1_href", "links_1_rel", "links_2_href", "links_2_rel"]
        with open(filename, 'w+') as f:
            csv_writer = csv.writer(f, delimiter=',', quotechar='"', quoting=csv.QUOTE_ALL)
            csv_writer.writerow(file_headers)

    empty_pull_count = 0
    for level in levels:
        # Create URL with the level parameter
        url_with_params = "{}&level={}".format(API_URL, level)

        # Add updateddatefrom parameter to the URL
        if not pull_all:
            url_with_params += "&updateddatefrom={}".format(updated_date_from)

        # Retrieve the total count of expected records for this pull
        total_expected_records = json.loads(requests.get(url_with_params, timeout=60).text)['totalrecords']
        logger.info('{} level-{} record(s) expected'.format(str(total_expected_records), str(level)))
        if total_expected_records == 0:
            empty_pull_count += 1
            continue

        limit = 100
        entries_processed = 0
        while True:
            async def _fed_hierarchy_async_get(entries_already_processed):
                response_list = []
                loop = asyncio.get_event_loop()
                futures = [
                    loop.run_in_executor(
                        None,
                        get_with_exception_hand,
                        "{}&limit={}&offset={}".format(url_with_params, str(limit),
                                                       str(entries_already_processed + (start_offset * limit)))
                    )
                    for start_offset in range(REQUESTS_AT_ONCE)
                ]
                for response in await asyncio.gather(*futures):
                    response_list.append(response.text)
                return response_list
            # End async get requests def

            # Retrieve limit*REQUESTS_AT_ONCE records from the API
            loop = asyncio.get_event_loop()
            full_response = loop.run_until_complete(_fed_hierarchy_async_get(entries_processed))

            # Create an object with all the data from the API
            dataframe = pd.DataFrame()
            offices = {}
            start = entries_processed + 1
            for next_resp in full_response:
                response_dict = json.loads(next_resp)

                for org in response_dict.get('orglist', []):
                    entries_processed += 1

                    # Add to the file data structure
                    if filename:
                        row = json_normalize(flatten_json(org))
                        dataframe = dataframe.append(row)

                    # Don't process the top_sub_levels, but store them in the fed hierarchy export
                    if level in top_sub_levels:
                        continue

                    # Add to the list of DB objects
                    if update_db:
                        agency_code = get_normalized_agency_code(org.get('cgaclist', [{'cgac': None}])[0]['cgac'],
                                                                 org.get('agencycode'))
                        # TEMPORARILY REPLACE Navy, Army, AND Air Force WITH DOD
                        if agency_code in ['017', '021', '057']:
                            agency_code = '097'
                        if not org.get('aacofficecode') or not org.get('agencycode') or not agency_code:
                            # Item from Fed Hierarchy is missing necessary data, ignore it
                            continue

                        new_office = Office(office_code=org.get('aacofficecode'), office_name=org.get('fhorgname'),
                                            sub_tier_code=org.get('agencycode'), agency_code=agency_code,
                                            funding_office=False, contracting_office=False, grant_office=False)

                        for off_type in org.get('fhorgofficetypelist', []):
                            office_type = off_type['officetype'].lower()
                            if office_type == 'financial assistance':
                                office_type = 'grant'
                            if office_type in ['contracting', 'funding', 'grant']:
                                setattr(new_office, office_type + '_office', True)

                        offices[org.get('aacofficecode')] = new_office

            if filename and len(dataframe.index) > 0:
                # Ensure headers are handled correctly
                for header in list(dataframe.columns.values):
                    if header not in file_headers:
                        file_headers.append(header)
                        logger.info("Headers missing column: %s", header)

                # Write to file
                with open(filename, 'a') as f:
                    dataframe.to_csv(f, index=False, header=False, columns=file_headers)

            if update_db:
                office_codes = set(offices.keys())
                sess.query(Office).filter(Office.office_code.in_(office_codes)).delete(synchronize_session=False)
                sess.add_all(offices.values())

            logger.info("Processed rows %s-%s", start, entries_processed)
            if entries_processed == total_expected_records:
                # Feed has finished
                break

            if entries_processed > total_expected_records:
                # We have somehow retrieved more records than existed at the beginning of the pull
                logger.error("Total expected records: {}, Number of records retrieved: {}".format(
                    total_expected_records, entries_processed))
                sys.exit(2)

    if update_db:
        sess.commit()

    if export_office:
        logger.info("Creating a file ({}) with the data from the database".format(export_office))
        all_offices = sess.query(Office)
        write_query_to_file(sess, all_offices, export_office, generate_headers=True)

    if empty_pull_count == len(levels):
        logger.error("No records retrieved from the Federal Hierarchy API")
        sys.exit(3)

    logger.info("Finished")