def get_record_processor(catcodes, candidates, committees):
    """Assemble the filter chain applied to each PAC-to-candidate record.

    The three arguments are handed straight to the project filters that
    need them: ``committees`` to ``ContributorFilter``, ``candidates`` to
    ``Pac2CandRecipientFilter`` and ``catcodes`` to ``CatCodeFilter``.

    Returns whatever ``chain_filters`` builds from the ordered filters.
    """
    def build_transaction_id(cycle, fecid):
        # IDs are namespaced with the record type so they cannot collide
        # with IDs produced by other record processors.
        return 'pac2cand:%s:%s' % (cycle, fecid)

    def normalize_type(t):
        return t.strip().lower()

    def candidacy_status(curr, cycle):
        # Blank unless the cycle flag is 'Y'; otherwise a boolean telling
        # whether the current-candidate flag is 'Y' as well.
        if cycle != 'Y':
            return ""
        return curr == 'Y' and cycle == 'Y'

    pipeline = [
        CSVFieldVerifier(),

        # transaction filters
        FieldAdder('transaction_namespace', CRP_TRANSACTION_NAMESPACE),
        FieldMerger({'transaction_id': ('cycle', 'fec_rec_no')},
                    build_transaction_id,
                    keep_fields=True),
        FieldMerger({'transaction_type': ('type', )}, normalize_type),

        # date stamp
        FieldModifier('date', parse_date_iso),

        # contributor and recipient fields
        ContributorFilter(committees),
        FieldRenamer({'contributor_ext_id': 'pac_id'}),
        FieldAdder('contributor_type', 'C'),
        Pac2CandRecipientFilter(candidates),
        FieldAdder('recipient_type', 'P'),

        # catcode
        CatCodeFilter('contributor', catcodes),

        # add static fields
        FieldAdder('is_amendment', False),
        FieldMerger({'candidacy_status': ('curr_cand', 'cycle_cand')},
                    candidacy_status,
                    keep_fields=False),

        # filter through spec
        SpecFilter(SPEC),
    ]
    return chain_filters(*pipeline)
def test_field_renamer(self):
    """FieldRenamer should yield rows keyed by 'x', 'y' and 'c'.

    The input rows come from ``assert_filter_result`` (defined elsewhere);
    presumably they carry 'a' and 'b' where the output has 'x' and 'y'.
    """
    renamer = FieldRenamer({'x': 'a', 'y': 'b'})
    renamed_rows = [
        {'x': 1, 'y': 2, 'c': 3},
        {'x': 5, 'y': 5, 'c': 5},
        {'x': 1, 'y': 10, 'c': 100},
    ]
    self.assert_filter_result(renamer, renamed_rows)
def lobbying_handler(inpath, outpath, infields, outfields):
    """Convert the raw lobbying CSV at ``inpath`` into ``outpath``.

    Args:
        inpath: path of the source CSV (fields quoted with ``|``).
        outpath: path of the CSV to write; opened in ``'w'`` mode.
        infields: column names passed to ``CSVSource`` as ``fieldnames``.
        outfields: column names passed to ``CSVEmitter`` as ``fieldnames``.

    ``name_proc`` and ``yn_proc`` are module-level helpers defined
    elsewhere; presumably they normalize names and Y/N flags.
    """
    # Both files are opened with ``with`` so the handles are closed (and
    # the output flushed) even when a filter raises part-way through.
    with open(inpath) as infile:
        with open(outpath, 'w') as outfile:
            run_recipe(
                CSVSource(infile, fieldnames=infields, quotechar='|'),
                UnicodeFilter(),
                FieldRemover('Source'),
                FieldMerger({'registrant_name': ('Registrant', 'RegistrantRaw')}, name_proc),
                FieldMerger({'registrant_is_firm': ('IsFirm', )}, yn_proc),
                FieldMerger({'client_name': ('Client', 'Client_raw')}, name_proc),
                # An empty amount is treated as 0 before the float conversion.
                FieldMerger({'amount': ('Amount', )}, lambda x: float(x or 0)),
                FieldMerger({'affiliate': ('Affiliate', )}, yn_proc),
                FieldMerger({'filing_included_nsfs': ('IncludeNSFS', )}, yn_proc),
                FieldMerger({'include_in_industry_totals': ('Ind', )}, yn_proc),
                FieldMerger({'use': ('Use', )}, yn_proc),
                FieldRenamer({
                    'transaction_id': 'Uniqid',
                    'transaction_type': 'Type',
                    'transaction_type_desc': 'TypeLong',
                    'year': 'Year',
                    'client_category': 'Catcode',
                    'client_parent_name': 'Ultorg',
                    'filing_type': 'Self',
                }),
                # DebugEmitter(),  # enable to dump records while debugging
                CSVEmitter(outfile, fieldnames=outfields),
            )
def get_record_processor(catcodes, candidates, committees):
    """Assemble the filter chain applied to each PAC-to-PAC record.

    ``committees`` is given to ``CommitteeFilter`` and, together with
    ``candidates``, to ``Pac2PacRecipientFilter``.  ``catcodes`` is accepted
    but not used by this chain.

    Returns whatever ``chain_filters`` builds from the ordered filters.
    """
    def build_transaction_id(cycle, fecid):
        # IDs are namespaced with the record type so they cannot collide
        # with IDs produced by other record processors.
        return 'pac2pac:%s:%s' % (cycle, fecid)

    def normalize_type(t):
        return t.strip().lower()

    def upper_or_blank(s):
        if s:
            return s.upper()
        return ""

    def stripped_upper_or_blank(s):
        if s:
            return s.strip().upper()
        return ""

    def is_amendment(s):
        # Anything other than an 'N' flag counts as an amendment.
        return s.strip().upper() != 'N'

    def candidacy_status(curr, cycle):
        # Blank unless the cycle flag is 'Y'; otherwise a boolean telling
        # whether the current-candidate flag is 'Y' as well.
        if cycle != 'Y':
            return ""
        return curr == 'Y' and cycle == 'Y'

    pipeline = [
        CSVFieldVerifier(),

        ContribRecipFilter(),
        CommitteeFilter(committees),
        Pac2PacRecipientFilter(candidates, committees),

        # transaction filters
        FieldAdder('transaction_namespace', CRP_TRANSACTION_NAMESPACE),
        FieldMerger({'transaction_id': ('cycle', 'fec_rec_no')},
                    build_transaction_id,
                    keep_fields=True),
        FieldMerger({'transaction_type': ('type',)}, normalize_type),

        # filing reference ID
        FieldRenamer({'filing_id': 'microfilm'}),

        # date stamp
        FieldModifier('date', parse_date_iso),

        # catcode
        FieldMerger({'contributor_category': ('real_code',)},
                    upper_or_blank,
                    keep_fields=True),
        FieldMerger({'recipient_category': ('recip_prim_code',)},
                    upper_or_blank,
                    keep_fields=True),

        FieldRenamer({
            'contributor_city': 'city',
            'contributor_state': 'state',
            'contributor_zipcode': 'zipcode',
            'contributor_occupation': 'fec_occ_emp',
            'recipient_party': 'party',
        }),
        FieldModifier('contributor_state', stripped_upper_or_blank),

        FieldAdder('contributor_type', 'C'),

        # add static fields
        FieldAdder('jurisdiction', 'F'),
        FieldMerger({'is_amendment': ('amend',)}, is_amendment),
        FieldMerger({'candidacy_status': ('curr_cand', 'cycle_cand')},
                    candidacy_status,
                    keep_fields=False),

        # filter through spec
        SpecFilter(SPEC),
    ]
    return chain_filters(*pipeline)
def agency_handler(inpath, outpath, infields, outfields):
    """Convert the raw agency CSV at ``inpath`` into ``outpath``.

    Args:
        inpath: path of the source CSV (fields quoted with ``|``).
        outpath: path of the CSV to write; opened in ``'w'`` mode.
        infields: column names passed to ``CSVSource`` as ``fieldnames``.
        outfields: column names passed to ``CSVEmitter`` as ``fieldnames``.
    """
    # Both files are opened with ``with`` so the handles are closed (and
    # the output flushed) even when a filter raises part-way through.
    with open(inpath) as infile:
        with open(outpath, 'w') as outfile:
            run_recipe(
                CSVSource(infile, fieldnames=infields, quotechar='|'),
                # Every record gets an empty 'id' field.
                FieldAdder('id', ''),
                FieldRenamer({
                    'transaction': 'UniqID',
                    'agency_name': 'Agency',
                    'agency_ext_id': 'AgencyID',
                }),
                # DebugEmitter(),  # enable to dump records while debugging
                CSVEmitter(outfile, fieldnames=outfields),
            )
def bills_handler(inpath, outpath, infields, outfields):
    """Convert the raw bills CSV at ``inpath`` into ``outpath``.

    Args:
        inpath: path of the source CSV (fields quoted with ``|``).
        outpath: path of the CSV to write; opened in ``'w'`` mode.
        infields: column names passed to ``CSVSource`` as ``fieldnames``.
        outfields: column names passed to ``CSVEmitter`` as ``fieldnames``.
    """
    # Both files are opened with ``with`` so the handles are closed (and
    # the output flushed) even when a filter raises part-way through.
    with open(inpath) as infile:
        with open(outpath, 'w') as outfile:
            run_recipe(
                CSVSource(infile, fieldnames=infields, quotechar='|'),
                # Every record gets an empty 'id' field.
                FieldAdder('id', ''),
                FieldRenamer({
                    'bill_id': 'B_ID',
                    'issue': 'SI_ID',
                    'congress_no': 'CongNo',
                    'bill_name': 'Bill_Name',
                }),
                # DebugEmitter(),  # enable to dump records while debugging
                CSVEmitter(outfile, fieldnames=outfields),
            )
def run(self):
    """Load the CSV at ``self.inpath`` into ``LobbyistBundle`` rows.

    Reads ``self.inpath`` and ``self.field_map``; records are emitted
    through ``SimpleDjangoModelEmitter(LobbyistBundle)``.
    """
    # ``with`` guarantees the input handle is closed once the recipe has
    # finished or failed, instead of leaving it to garbage collection.
    with open(self.inpath) as infile:
        run_recipe(
            CSVSource(infile),
            FieldRenamer(self.field_map),
            FieldRemover('committee_fec_id committee_name report_year report_type is_amendment start_date end_date reporting_period_amount_all semi_annual_amount_all'.split()),
            BundleFilter(),
            # FieldModifier('file_num', lambda x: Bundle.objects.get(pk=x)),
            # Convert any stray floats to integers; empty values become None.
            FieldModifier('amount semi_annual_amount'.split(),
                          lambda x: int(round(float(x))) if x else None),
            NoneFilter(),
            UnicodeFilter(),
            CountEmitter(every=500),
            # DebugEmitter(),  # enable to dump records while debugging
            SimpleDjangoModelEmitter(LobbyistBundle)
        )
def lobbyist_handler(inpath, outpath, infields, outfields):
    """Convert the raw lobbyist CSV at ``inpath`` into ``outpath``.

    Args:
        inpath: path of the source CSV (fields quoted with ``|``).
        outpath: path of the CSV to write; opened in ``'w'`` mode.
        infields: column names passed to ``CSVSource`` as ``fieldnames``.
        outfields: column names passed to ``CSVEmitter`` as ``fieldnames``.

    ``name_proc`` and ``yn_proc`` are module-level helpers defined
    elsewhere; presumably they normalize names and Y/N flags.
    """
    # Both files are opened with ``with`` so the handles are closed (and
    # the output flushed) even when a filter raises part-way through.
    with open(inpath) as infile:
        with open(outpath, 'w') as outfile:
            run_recipe(
                CSVSource(infile, fieldnames=infields, quotechar='|'),
                # Every record gets an empty 'id' field.
                FieldAdder('id', ''),
                FieldMerger({'lobbyist_name': ('Lobbyist', 'Lobbyist_raw')}, name_proc),
                FieldMerger({'member_of_congress': ('FormerCongMem', )}, yn_proc),
                FieldRenamer({
                    'transaction': 'Uniqid',
                    'year': 'Year',
                    'lobbyist_ext_id': 'LobbyistID',
                    'candidate_ext_id': 'CID',
                    # NOTE(review): 'OfficalPos' looks misspelled but is a
                    # source column name -- confirm against the input data
                    # before changing it.
                    'government_position': 'OfficalPos',
                }),
                # DebugEmitter(),  # enable to dump records while debugging
                CSVEmitter(outfile, fieldnames=outfields),
            )
def run(self):
    """Load the denormalized CSV at ``self.inpath`` through ``AgencyLoader``.

    Reads ``self.inpath`` and ``self.log``.  Records are counted every
    10000 rows and handed to ``LoaderEmitter`` with ``commit_every=100``.
    """
    # ``with`` guarantees the input handle is closed once the recipe has
    # finished or failed, instead of leaving it to garbage collection.
    with open(self.inpath) as infile:
        run_recipe(
            CSVSource(infile),
            # Empty year values become None rather than failing in int().
            FieldModifier('year', lambda x: int(x) if x else None),
            FieldRenamer({'transaction_id': 'transaction'}),
            NoneFilter(),
            TRANSACTION_FILTER,
            UnicodeFilter(),
            CountEmitter(every=10000, log=self.log),
            LoaderEmitter(AgencyLoader(
                source=self.inpath,
                description='load from denormalized CSVs',
                # Records who ran the import; falls back to 'unknown' when
                # LOGNAME is not set in the environment.
                imported_by="loadlobbying (%s)" % os.getenv('LOGNAME', 'unknown'),
                log=self.log,
            ), commit_every=100),
        )
def issue_handler(inpath, outpath, infields, outfields):
    """Convert the raw lobbying-issue CSV at ``inpath`` into ``outpath``.

    Args:
        inpath: path of the source CSV (fields quoted with ``|``).
        outpath: path of the CSV to write; opened in ``'w'`` mode.
        infields: column names passed to ``VerifiedCSVSource`` as
            ``fieldnames``.
        outfields: column names passed to ``CSVEmitter`` as ``fieldnames``.
    """
    # Both files are opened with ``with`` so the handles are closed (and
    # the output flushed) even when a filter raises part-way through.
    with open(inpath, 'r') as infile:
        with open(outpath, 'w') as outfile:
            run_recipe(
                VerifiedCSVSource(infile, fieldnames=infields, quotechar='|'),
                # The validator is given the number of fields declared for
                # the 'lob_issue' file type.
                FieldCountValidator(len(FILE_TYPES['lob_issue'])),
                CSVFieldVerifier(),
                FieldRenamer({
                    'id': 'SI_ID',
                    'transaction': 'UniqID',
                    'general_issue_code': 'IssueID',
                    'general_issue': 'Issue',
                    'specific_issue': 'SpecIssue',
                    'year': 'Year',
                }),
                # DebugEmitter(),  # enable to dump records while debugging
                CSVEmitter(outfile, fieldnames=outfields),
            )
def run(self):
    """Load the CSV at ``self.inpath`` into ``Bundle`` rows.

    Reads ``self.inpath`` and ``self.field_map``; records are emitted
    through ``SimpleDjangoModelEmitter(Bundle)``.
    """
    # ``with`` guarantees the input handle is closed once the recipe has
    # finished or failed, instead of leaving it to garbage collection.
    with open(self.inpath) as infile:
        run_recipe(
            CSVSource(infile),
            FieldRenamer(self.field_map),
            # Values are [N|A]. Convert to boolean.
            FieldModifier('is_amendment', lambda x: x == 'A'),
            # Convert any stray floats to integers, dropping '$' and
            # thousands separators first; empty values become None.
            FieldModifier('reporting_period_amount semi_annual_amount'.split(),
                          lambda x: int(round(float(x.replace('$','').replace(',','')))) if x else None),
            # Convert date formats (MM/DD/YYYY); empty values become None.
            FieldModifier('start_date end_date filing_date'.split(),
                          lambda x: datetime.strptime(x, '%m/%d/%Y') if x else None),
            # TODO: These following two lines (and the field value) need to
            # be thoroughly tested on the next bundling load
            FieldCopier({'pdf_url': 'first_image_num'}),
            # The URL uses the last three characters of the image number as
            # a directory and the full number as sub-directory and file name.
            FieldModifier('pdf_url',
                          lambda x: 'http://query.nictusa.com/pdf/{0}/{1}/{1}.pdf'.format(x[-3:], x)),
            NoneFilter(),
            UnicodeFilter(),
            CountEmitter(every=200),
            # DebugEmitter(),  # enable to dump records while debugging
            SimpleDjangoModelEmitter(Bundle)
        )
def get_record_processor(catcodes, candidates, committees):
    """Assemble the filter chain applied to each individual-contribution record.

    ``candidates`` and ``committees`` are given to ``IndivRecipientFilter``,
    ``committees`` also to ``CommitteeFilter``, and ``catcodes`` to
    ``CatCodeFilter``.

    Returns whatever ``chain_filters`` builds from the ordered filters.
    """
    def build_transaction_id(cycle, fecid):
        # IDs are namespaced with the record type so they cannot collide
        # with IDs produced by other record processors.
        return 'indiv:%s:%s' % (cycle, fecid)

    def normalize_type(t):
        if t:
            return t.strip().lower()
        return ''

    def upper_or_blank(s):
        if s:
            return s.upper()
        return ""

    def candidacy_status(curr, cycle):
        # Blank unless the cycle flag is 'Y'; otherwise a boolean telling
        # whether the current-candidate flag is 'Y' as well.
        if cycle != 'Y':
            return ""
        return curr == 'Y' and cycle == 'Y'

    pipeline = [
        CSVFieldVerifier(),

        # transaction filters
        FieldAdder('transaction_namespace', CRP_TRANSACTION_NAMESPACE),
        FieldMerger({'transaction_id': ('cycle', 'fec_trans_id')},
                    build_transaction_id,
                    keep_fields=True),
        FieldMerger({'transaction_type': ('type', )},
                    normalize_type,
                    keep_fields=True),

        # filing reference ID
        FieldRenamer({'filing_id': 'microfilm'}),

        # date stamp
        FieldModifier('date', parse_date_iso),

        # rename contributor, organization, and parent_organization fields
        FieldRenamer({
            'contributor_name': 'contrib',
            'parent_organization_name': 'ult_org',
        }),

        IndivRecipientFilter(candidates, committees),
        CommitteeFilter(committees),
        OrganizationFilter(),

        # external identifiers (this step was historically labelled
        # "create URNs")
        FieldRenamer({
            'contributor_ext_id': 'contrib_id',
            'committee_ext_id': 'cmte_id',
        }),

        # address and gender fields
        FieldRenamer({
            'contributor_address': 'street',
            'contributor_city': 'city',
            'contributor_state': 'state',
            'contributor_zipcode': 'zipcode',
            'contributor_gender': 'gender',
        }),
        FieldModifier('contributor_state', upper_or_blank),
        FieldModifier('contributor_gender', upper_or_blank),

        # employer/occupation filter
        FECOccupationFilter(),

        # catcode
        CatCodeFilter('contributor', catcodes),

        # add static fields
        FieldAdder('contributor_type', 'I'),
        FieldAdder('is_amendment', False),
        FieldMerger({'candidacy_status': ('curr_cand', 'cycle_cand')},
                    candidacy_status,
                    keep_fields=False),

        # filter through spec
        SpecFilter(SPEC),
    ]
    return chain_filters(*pipeline)