def run(self):
    """Load denormalized bill CSV rows via a saucebrush pipeline.

    Derives bill_type/bill_no from the raw bill_name, drops records with
    missing values, and commits each record through a BillLoader.
    """
    # Keep only the uppercase prefix of the bill name (e.g. "HR" of "HR1234").
    strip_to_upper = lambda x: re.sub(r'[^A-Z]*', '', x)
    # Map the raw prefix onto a canonical bill type; None when unknown.
    to_bill_type = lambda x: self.bill_type_map.get(x, None)

    def extract_bill_no(value):
        # First capture group of self.digits, or None when absent/no match.
        if value:
            match = self.digits.match(value)
            if match:
                return match.groups()[0]
        return None

    loader = BillLoader(
        source=self.inpath,
        description='load from denormalized CSVs',
        imported_by="loadlobbying (%s)" % os.getenv('LOGNAME', 'unknown'),
        log=self.log,
    )
    run_recipe(
        CSVSource(open(self.inpath)),
        FieldMerger({'bill_type_raw': ['bill_name']}, strip_to_upper, keep_fields=True),
        FieldMerger({'bill_type': ['bill_type_raw']}, to_bill_type, keep_fields=True),
        FieldMerger({'bill_no': ['bill_name']}, extract_bill_no, keep_fields=True),
        NoneFilter(),
        IssueFilter(),
        UnicodeFilter(),
        CountEmitter(every=20000, log=self.log),
        LoaderEmitter(loader, commit_every=1),
    )
def lobbying_handler(inpath, outpath, infields, outfields):
    """Normalize raw lobbying CSV rows into the canonical field layout.

    Reads pipe-quoted CSV from inpath and writes cleaned rows with the
    outfields header to outpath.
    """
    pipeline = [
        CSVSource(open(inpath), fieldnames=infields, quotechar='|'),
        UnicodeFilter(),
        FieldRemover('Source'),
        # Collapse raw/clean name pairs into single cleaned name fields.
        FieldMerger({'registrant_name': ('Registrant', 'RegistrantRaw')}, name_proc),
        FieldMerger({'registrant_is_firm': ('IsFirm',)}, yn_proc),
        FieldMerger({'client_name': ('Client', 'Client_raw')}, name_proc),
        # Missing amounts become 0.0.
        FieldMerger({'amount': ('Amount',)}, lambda x: float(x or 0)),
        FieldMerger({'affiliate': ('Affiliate',)}, yn_proc),
        FieldMerger({'filing_included_nsfs': ('IncludeNSFS',)}, yn_proc),
        FieldMerger({'include_in_industry_totals': ('Ind',)}, yn_proc),
        FieldMerger({'use': ('Use',)}, yn_proc),
        # Source column name -> canonical field name.
        FieldRenamer({
            'transaction_id': 'Uniqid',
            'transaction_type': 'Type',
            'transaction_type_desc': 'TypeLong',
            'year': 'Year',
            'client_category': 'Catcode',
            'client_parent_name': 'Ultorg',
            'filing_type': 'Self',
        }),
        # Insert DebugEmitter() here to inspect records while developing.
        CSVEmitter(open(outpath, 'w'), fieldnames=outfields),
    ]
    run_recipe(*pipeline)
def lobbying_handler(inpath, outpath, infields, outfields):
    """Clean and rename lobbying records from inpath into outpath."""
    # Canonical field name -> source column name.
    column_map = {
        'transaction_id': 'Uniqid',
        'transaction_type': 'Type',
        'transaction_type_desc': 'TypeLong',
        'year': 'Year',
        'client_category': 'Catcode',
        'client_parent_name': 'Ultorg',
        'filing_type': 'Self',
    }
    source = CSVSource(open(inpath), fieldnames=infields, quotechar='|')
    sink = CSVEmitter(open(outpath, 'w'), fieldnames=outfields)
    run_recipe(
        source,
        UnicodeFilter(),
        FieldRemover('Source'),
        FieldMerger({'registrant_name': ('Registrant', 'RegistrantRaw')}, name_proc),
        FieldMerger({'registrant_is_firm': ('IsFirm', )}, yn_proc),
        FieldMerger({'client_name': ('Client', 'Client_raw')}, name_proc),
        # Empty amounts default to 0.0.
        FieldMerger({'amount': ('Amount', )}, lambda x: float(x or 0)),
        FieldMerger({'affiliate': ('Affiliate', )}, yn_proc),
        FieldMerger({'filing_included_nsfs': ('IncludeNSFS', )}, yn_proc),
        FieldMerger({'include_in_industry_totals': ('Ind', )}, yn_proc),
        FieldMerger({'use': ('Use', )}, yn_proc),
        FieldRenamer(column_map),
        # DebugEmitter(),
        sink,
    )
def load_cpi_areas():
    """Fetch the BLS CPI area table over FTP and dump it to bls_areas.csv."""
    url = "ftp://ftp.bls.gov/pub/time.series/cu/cu.area"
    # Column order expected in the emitted CSV.
    headers = (
        'area_code',
        'area_name',
        'display_level',
        'selectable',
        'sort_sequence',
    )
    run_recipe(
        CSVSource(utils.RemoteFile(url), delimiter='\t'),
        CSVEmitter(open('bls_areas.csv', 'w'), headers),
    )
def load_schools():
    """Load NCES public-school records from the 2009 flat files into sqlite.

    Each source CSV is cleaned (extraneous columns dropped, columns
    renamed), run through MSACoder, and written to the nces_schools table.
    """
    # The original iterated with enumerate() but never used the index;
    # iterate the filenames directly.
    for filename in ("sc091aai.csv", "sc091akn.csv", "sc091aow.csv"):
        run_recipe(
            CSVSource(open(os.path.join(NCES_ROOT, filename))),
            # Columns we do not carry into the database.
            FieldRemover((
                "mzip409", "member09", "phone09", "ulocal09",
                "type09", "level09", "status09",
            )),
            # Canonical field name -> NCES column name.
            FieldRenamer(
                {
                    "school_id": "ncessch",
                    "name": "schnam09",
                    "street": "mstree09",
                    "city": "mcity09",
                    "state": "mstate09",
                    "zipcode": "mzip09",
                    "grade_low": "gslo09",
                    "grade_high": "gshi09",
                }
            ),
            # Placeholders populated downstream (presumably by MSACoder --
            # TODO confirm which filter fills them).
            FieldAdder("latitude", None),
            FieldAdder("longitude", None),
            FieldAdder("codes", None),
            MSACoder(),
            SqliteEmitter(DB, "nces_schools", fieldnames=HEADERS),
            # DebugEmitter(),
            CountEmitter(every=100),
        )
def load_schools():
    """Geocode NCES school records from the 2009 flat files into sqlite."""
    source_files = ('sc091aai.csv', 'sc091akn.csv', 'sc091aow.csv')
    for filename in source_files:
        steps = [
            CSVSource(open(os.path.join(NCES_ROOT, filename))),
            # Columns not carried into the database.
            FieldRemover((
                'mzip409',
                'member09',
                'phone09',
                'ulocal09',
                'type09',
                'level09',
                'status09',
            )),
            # Canonical field name -> NCES column name.
            FieldRenamer({
                'school_id': 'ncessch',
                'name': 'schnam09',
                'street': 'mstree09',
                'city': 'mcity09',
                'state': 'mstate09',
                'zipcode': 'mzip09',
                'grade_low': 'gslo09',
                'grade_high': 'gshi09',
            }),
            # Placeholder filled downstream by GeoCoder.
            FieldAdder('code', None),
            GeoCoder(),
            SqliteEmitter(DB, 'nces_schools', fieldnames=HEADERS),
            # DebugEmitter(),
        ]
        run_recipe(*steps)
def load_occupations():
    """Stream K2 locations into the MySQL occupation emitter.

    Failed records are routed to a DebugEmitter instead of aborting
    the run.
    """
    failures = emitters.DebugEmitter()
    run_recipe(
        K2LocationSource(),
        MySQLOccupationEmitter('root', '', 'k2'),
        error_stream=failures,
    )
def load_zipcodes():
    """Build a per-state histogram of GeoNames zipcode records.

    Returns the rendered histogram as a string; the Mongo emitter is
    left disabled.
    """
    # GeoNames tab-delimited column layout.
    headers = (
        'country_code', 'postal_code', 'name',
        'state_name', 'state_code',
        'county_name', 'county_code',
        'community_name', 'community_code',
        'latitude', 'longitude', 'accuracy',
    )
    state_histogram = Histogram('state_code')
    state_histogram.label_length = 2  # state codes are two characters

    def to_latlng(lat, lng):
        # Pack the coordinate pair into a single tuple-valued field.
        return (lat, lng)

    csv_path = settings.dataset_path('default', filename='zipcodes.txt')
    run_recipe(
        CSVSource(open(csv_path), delimiter="\t", fieldnames=headers),
        FieldKeeper(('postal_code', 'name', 'state_code', 'latitude', 'longitude')),
        FieldModifier(('latitude', 'longitude'), float),
        FieldMerger({'latlng': ('latitude', 'longitude')}, to_latlng),
        # MongoZipEmitter() / DebugEmitter() can be re-enabled here.
        state_histogram,
    )
    return str(state_histogram)
def process_sopr_filing(sopr_xml_file):
    """Parse one SOPR lobbying XML filing into Django models.

    Flattens the parsed filing/client/registrant dicts, cleans names and
    dates, then splits the nested issue and lobbyist lists into their own
    model emitters before saving the filing itself.
    """
    # Local import keeps Django settings out of module import time.
    from sunlightapi import live_settings as DJ_SETTINGS
    DJ_APPLABEL = 'lobbyists'
    saucebrush.run_recipe(lobbyists.parse_filings(sopr_xml_file),
        # flatten non-list dictionaries & clean up some fields
        DictFlattener(['filing', 'client', 'registrant']),
        FieldRemover(['govt_entities', 'affiliated_orgs', 'foreign_entities',
                      'client_state_or_local_gov', 'client_status',
                      'filing_affiliated_orgs_url']),
        FieldRenamer({'filing_date': 'filing_filing_date'}),
        # process names & dates
        FieldAdder('client_contact_name', ''),
        NameCleaner('client_contact_name', prefix='client_contact_',
                    nomatch_name='client_raw_contact_name'),
        # Drop fractional seconds before parsing the ISO timestamp below.
        FieldModifier('filing_date', lambda x: x.split('.')[0]),
        DateCleaner('filing_date', from_format='%Y-%m-%dT%H:%M:%S',
                    to_format='%Y-%m-%d'),
        # flatten lists
        Flattener(['issues', 'lobbyists']),
        # Propagate the filing id onto each nested issue/lobbyist row.
        FieldCopier({'issues.filing_id': 'filing_id',
                     'lobbyists.filing_id': 'filing_id'}),
        # handle lists: each nested list gets its own sub-recipe + emitter
        saucebrush.filters.Splitter({
            'issues': [DjangoModelEmitter(DJ_SETTINGS, DJ_APPLABEL, 'issue')],
            'lobbyists': [FieldRemover(['indicator', 'status']),
                          NameCleaner('name', nomatch_name='raw_name'),
                          Unique(),  # remove some duplicate lobbyists on a form
                          DjangoModelEmitter(DJ_SETTINGS, DJ_APPLABEL, 'lobbyist')
                          ],
        }),
        # Nested lists were emitted above; remove them from the filing row.
        FieldRemover(['issues', 'lobbyists']),
        DjangoModelEmitter(DJ_SETTINGS, DJ_APPLABEL, 'filing')
    )
def load_locations():
    """Rebuild the occupation tables from occupations.csv.

    Clears occupation_category and occupation, streams validated rows
    into both tables, then commits once at the end.
    """
    conn = MySQLdb.connect(
        user=settings.MYSQL_USER,
        passwd=settings.MYSQL_PASS,
        db=settings.MYSQL_DATABASE,
        host=settings.MYSQL_HOST,
        port=settings.MYSQL_PORT,
    )
    # Wipe existing rows before reloading.
    cursor = conn.cursor()
    cursor.execute("""DELETE FROM occupation_category""")
    cursor.execute("""DELETE FROM occupation""")
    cursor.close()

    csv_path = settings.dataset_path(None, filename='occupations.csv')
    run_recipe(
        sources.CSVSource(open(csv_path)),
        ValidOccupationFilter(),
        CategoryIDFilter(),
        CategoryEmitter(conn),
        OccupationEmitter(conn),
        # Bad records are dumped to stdout instead of killing the load.
        error_stream=emitters.DebugEmitter(),
    )
    conn.commit()
    conn.close()
def code_a5():
    """Annotate the A5 spreadsheet with MSA codes and write a coded copy."""
    src_path = "/Users/Jeremy/Downloads/A5.csv"
    dst_path = "/Users/Jeremy/Downloads/A5.coded.csv"
    run_recipe(
        sources.CSVSource(open(src_path)),
        filters.FieldAdder("code", None),  # placeholder filled by MSAFilter
        MSAFilter(),
        emitters.CSVEmitter(open(dst_path, "w"), fieldnames=FIELD_NAMES),
    )
def load_locations():
    """Copy K2 location records into MySQL, minus nested sub-documents."""
    # Nested/auxiliary sections we do not want in the relational table.
    dropped_fields = (
        '_id', 'ffiec', 'geo', 'oes',
        'naccrra', 'nces', 'rpp_local', 'rpp_state',
    )
    run_recipe(
        K2LocationSource(),
        DataFilter(),
        filters.FieldRemover(dropped_fields),
        MySQLLocationEmitter('root', '', 'k2'),
        # Failed records are printed rather than aborting the load.
        error_stream=emitters.DebugEmitter(),
    )
def load_items():
    """Print the CPI item table to the console (sqlite load is disabled)."""
    path = os.path.join(CPI_ROOT, "cu.item")
    # Layout for the (currently disabled) cpi_items sqlite table.
    headers = ('item_code', 'item_name', 'display_level', 'selectable', 'sort_sequence')
    reader = csv.DictReader(local_file(path), delimiter='\t')
    run_recipe(
        reader,
        # SqliteEmitter(DB, 'cpi_items', fieldnames=headers),
        DebugEmitter(),
    )
def load_ffiec():
    """Load FFIEC income records into Mongo with a derived average column."""
    def midpoint(low, high):
        # Arithmetic mean of the income band endpoints.
        return (low + high) / 2

    run_recipe(
        ffiec_iter(),
        FieldModifier(('low', 'high'), float),
        FieldMerger({'avg': ('low', 'high')}, midpoint, keep_fields=True),
        # SqliteEmitter(DB, 'ffiec_incomes', fieldnames=HEADERS),
        GenericMSAFilter(),
        MongoEmitter(),
        # DebugEmitter(),
    )
def load_areas():
    """Print the CPI area table to the console.

    The MSA coding and sqlite steps are all left disabled.
    """
    path = os.path.join(CPI_ROOT, "cu.area")
    # Layout for the (currently disabled) cpi_areas sqlite table.
    headers = ('area_code', 'area_name', 'msa_code', 'display_level',
               'selectable', 'sort_sequence')
    reader = csv.DictReader(local_file(path), delimiter='\t')
    run_recipe(
        reader,
        # FieldAdder('msa_code', None),
        # MSAFilter(),
        # SqliteEmitter(DB, 'cpi_areas', fieldnames=headers),
        DebugEmitter(),
    )
def agency_handler(inpath, outpath, infields, outfields):
    """Rewrite raw agency rows into the canonical column layout."""
    # Canonical field name -> source column name.
    column_map = {
        'transaction': 'UniqID',
        'agency_name': 'Agency',
        'agency_ext_id': 'AgencyID',
    }
    run_recipe(
        CSVSource(open(inpath), fieldnames=infields, quotechar='|'),
        FieldAdder('id', ''),  # blank id placeholder
        FieldRenamer(column_map),
        # DebugEmitter(),
        CSVEmitter(open(outpath, 'w'), fieldnames=outfields),
    )
def bills_handler(inpath, outpath, infields, outfields):
    """Rewrite raw bill rows into the canonical column layout."""
    # Canonical field name -> source column name.
    column_map = {
        'bill_id': 'B_ID',
        'issue': 'SI_ID',
        'congress_no': 'CongNo',
        'bill_name': 'Bill_Name',
    }
    run_recipe(
        CSVSource(open(inpath), fieldnames=infields, quotechar='|'),
        FieldAdder('id', ''),  # blank id placeholder
        FieldRenamer(column_map),
        # DebugEmitter(),
        CSVEmitter(open(outpath, 'w'), fieldnames=outfields),
    )
def load_locations():
    """Run location records through the geo test filters, printing results.

    The MongoDB emitter is left disabled; this is an inspection dry-run.
    """
    csv_path = settings.datasource_path('locations', filename='locations.csv')
    # load locations into mongodb (currently: debug output only)
    steps = [
        CSVSource(open(csv_path)),
        FieldRemover('points'),
        MSAFilter(),
        GeoJSONTestFilter(),
        CensusTestFilter(),
        GeoFilter(),
        DebugEmitter(),
        # MongoDBEmitter(settings.MONGO_DATABASE, 'locations'),
    ]
    run_recipe(*steps)
def issue_handler(inpath, outpath, infields, outfields):
    """Normalize lobbying issue rows; newlines in issue text become spaces."""
    def strip_newlines(text):
        # Embedded newlines would break the single-line CSV output.
        return text.replace('\n', ' ')

    run_recipe(
        CSVSource(open(inpath), fieldnames=infields, quotechar='|'),
        # Canonical field name -> source column name.
        FieldRenamer({
            'id': 'SI_ID',
            'transaction': 'UniqID',
            'general_issue_code': 'IssueID',
            'general_issue': 'Issue',
            'specific_issue': 'SpecIssue',
            'year': 'Year',
        }),
        FieldModifier(('general_issue', 'specific_issue'), strip_newlines),
        # DebugEmitter(),
        CSVEmitter(open(outpath, 'w'), fieldnames=outfields),
    )
def run(self):
    """Load lobbyist-bundle rows into LobbyistBundle via the Django ORM."""
    # CSV columns the model does not store.
    unused_columns = (
        'committee_fec_id committee_name report_year report_type '
        'is_amendment start_date end_date reporting_period_amount_all '
        'semi_annual_amount_all'
    ).split()

    def to_int(value):
        # Convert any stray floats to integers; empty values become None.
        return int(round(float(value))) if value else None

    run_recipe(
        CSVSource(open(self.inpath)),
        FieldRenamer(self.field_map),
        FieldRemover(unused_columns),
        BundleFilter(),
        # FieldModifier('file_num', lambda x: Bundle.objects.get(pk=x)),
        FieldModifier(['amount', 'semi_annual_amount'], to_int),
        NoneFilter(),
        UnicodeFilter(),
        CountEmitter(every=500),
        # DebugEmitter(),
        DjangoModelEmitter('settings', LobbyistBundle),
    )
def calculate_average(locs): def locsource(): for loc in locs: if 'ffiec' in loc and 'diff' in loc['ffiec']: yield loc['ffiec'] sd = stats.StandardDeviation('diff') run_recipe( locsource(), sd, ) print "Average: %s" % sd.average() print "stddev: %s" % sd.value()[0]
def run(self):
    """Load lobbyist-bundle rows through SimpleDjangoModelEmitter."""
    # CSV columns the model does not store.
    drop = ('committee_fec_id committee_name report_year report_type '
            'is_amendment start_date end_date reporting_period_amount_all '
            'semi_annual_amount_all').split()
    # Convert any stray floats to integers; empty values become None.
    round_to_int = lambda x: int(round(float(x))) if x else None
    run_recipe(
        CSVSource(open(self.inpath)),
        FieldRenamer(self.field_map),
        FieldRemover(drop),
        BundleFilter(),
        # FieldModifier('file_num', lambda x: Bundle.objects.get(pk=x)),
        FieldModifier('amount semi_annual_amount'.split(), round_to_int),
        NoneFilter(),
        UnicodeFilter(),
        CountEmitter(every=500),
        # DebugEmitter(),
        SimpleDjangoModelEmitter(LobbyistBundle),
    )
def run(self):
    """Load agency rows from the denormalized CSV through AgencyLoader."""
    loader = AgencyLoader(
        source=self.inpath,
        description='load from denormalized CSVs',
        imported_by="loadlobbying (%s)" % os.getenv('LOGNAME', 'unknown'),
        log=self.log,
    )
    run_recipe(
        CSVSource(open(self.inpath)),
        # Empty years become None rather than 0.
        FieldModifier('year', lambda x: int(x) if x else None),
        FieldRenamer({'transaction_id': 'transaction'}),
        NoneFilter(),
        TRANSACTION_FILTER,
        UnicodeFilter(),
        CountEmitter(every=10000, log=self.log),
        LoaderEmitter(loader, commit_every=100),
    )
def lobbyist_handler(inpath, outpath, infields, outfields):
    """Clean lobbyist rows: merge names, decode Y/N flags, rename columns."""
    # Canonical field name -> source column name.
    column_map = {
        'transaction': 'Uniqid',
        'year': 'Year',
        'lobbyist_ext_id': 'LobbyistID',
        'candidate_ext_id': 'CID',
        'government_position': 'OfficalPos',
    }
    run_recipe(
        CSVSource(open(inpath), fieldnames=infields, quotechar='|'),
        FieldAdder('id', ''),  # blank id placeholder
        FieldMerger({'lobbyist_name': ('Lobbyist', 'Lobbyist_raw')}, name_proc),
        FieldMerger({'member_of_congress': ('FormerCongMem',)}, yn_proc),
        FieldRenamer(column_map),
        # DebugEmitter(),
        CSVEmitter(open(outpath, 'w'), fieldnames=outfields),
    )
def issue_handler(inpath, outpath, infields, outfields):
    """Validate and rename lobbying issue rows from a pipe-quoted CSV."""
    expected_count = len(FILE_TYPES['lob_issue'])
    pipeline = [
        VerifiedCSVSource(open(inpath, 'r'), fieldnames=infields, quotechar='|'),
        # Reject rows that do not have the expected number of columns.
        FieldCountValidator(expected_count),
        CSVFieldVerifier(),
        # Canonical field name -> source column name.
        FieldRenamer({
            'id': 'SI_ID',
            'transaction': 'UniqID',
            'general_issue_code': 'IssueID',
            'general_issue': 'Issue',
            'specific_issue': 'SpecIssue',
            'year': 'Year',
        }),
        # DebugEmitter(),
        CSVEmitter(open(outpath, 'w'), fieldnames=outfields),
    ]
    run_recipe(*pipeline)
def lobbyist_handler(inpath, outpath, infields, outfields):
    """Normalize lobbyist records from inpath into outpath."""
    pipeline = [
        CSVSource(open(inpath), fieldnames=infields, quotechar='|'),
        FieldAdder('id', ''),  # blank id placeholder
        # Collapse the raw/clean name pair into one cleaned name.
        FieldMerger({'lobbyist_name': ('Lobbyist', 'Lobbyist_raw')}, name_proc),
        FieldMerger({'member_of_congress': ('FormerCongMem', )}, yn_proc),
        FieldRenamer({
            'transaction': 'Uniqid',
            'year': 'Year',
            'lobbyist_ext_id': 'LobbyistID',
            'candidate_ext_id': 'CID',
            'government_position': 'OfficalPos',
        }),
        # DebugEmitter(),
        CSVEmitter(open(outpath, 'w'), fieldnames=outfields),
    ]
    run_recipe(*pipeline)
def run(self):
    """Load lobbying rows into LobbyingLoader with typed columns."""
    # These CSV columns carry the strings 'True'/'False'.
    boolean_fields = (
        'affiliate', 'filing_included_nsfs', 'include_in_industry_totals',
        'registrant_is_firm', 'use',
    )
    run_recipe(
        CSVSource(open(self.inpath)),
        # Empty values become None rather than 0.
        FieldModifier('year', lambda x: int(x) if x else None),
        FieldModifier('amount', lambda x: Decimal(x) if x else None),
        FieldModifier(boolean_fields, lambda x: x == 'True'),
        NoneFilter(),
        UnicodeFilter(),
        CountEmitter(every=20000, log=self.log),
        LoaderEmitter(LobbyingLoader(
            source=self.inpath,
            description='load from denormalized CSVs',
            imported_by="loadlobbying (%s)" % os.getenv('LOGNAME', 'unknown'),
            log=self.log,
        )),
    )
def load_naccrra():
    """Load NACCRRA childcare cost rows into Mongo, one record per state."""
    csv_path = os.path.join(settings.dataset_path('default'), 'childcarecosts.csv')
    # Canonical field name -> spreadsheet column name.
    column_map = {
        'state': 'State',
        'family_infant': 'Family-Infant',
        'family_4': 'Family-4-Year-Old',
        'family_school': 'Family-School-Age',
        'center_infant': 'Center-Infant',
        'center_4': 'Center-4-Year-Old',
        'center_school': 'Center-School-Age',
    }
    run_recipe(
        sources.CSVSource(open(csv_path)),
        filters.FieldRenamer(column_map),
        MongoNACCRRAEmitter(),
        emitters.CountEmitter(),
        # Failed records are printed rather than aborting the load.
        error_stream=emitters.DebugEmitter(),
    )
def process_record(self, record):
    """Attach per-occupation stddev/mean columns for each stats field.

    Runs a secondary recipe over every location row sharing this record's
    occupation, feeding a StandardDeviation collector per field, then
    writes '<field>_stddev' and '<field>_mean' back onto the record.
    """
    occupation = record['occupation']
    collectors = dict(
        (name, stats.StandardDeviation(name)) for name in STATS_FIELDS
    )
    run_recipe(
        sources.SqliteSource(
            db_path,
            """SELECT * FROM locations WHERE occupation = ?""",
            (occupation,),
        ),
        filters.FieldModifier(STATS_FIELDS, to_float),
        # Collectors run side by side; each sees every row.
        Recipe(*collectors.values()),
        error_stream=emitters.DebugEmitter(),
    )
    for name, collector in collectors.iteritems():
        record['%s_stddev' % name] = collector.value()[0]
        record['%s_mean' % name] = collector.average()
    return record
def run(self):
    """Stream typed lobbying CSV rows into the LobbyingLoader."""
    emit = LoaderEmitter(
        LobbyingLoader(
            source=self.inpath,
            description='load from denormalized CSVs',
            imported_by="loadlobbying (%s)" % os.getenv('LOGNAME', 'unknown'),
            log=self.log,
        ))
    # Empty values become None rather than 0.
    to_year = lambda x: int(x) if x else None
    to_amount = lambda x: Decimal(x) if x else None
    # Flag columns carry the strings 'True'/'False'.
    flag_fields = ('affiliate', 'filing_included_nsfs',
                   'include_in_industry_totals', 'registrant_is_firm', 'use')
    run_recipe(
        CSVSource(open(self.inpath)),
        FieldModifier('year', to_year),
        FieldModifier('amount', to_amount),
        FieldModifier(flag_fields, lambda x: x == 'True'),
        NoneFilter(),
        UnicodeFilter(),
        CountEmitter(every=20000, log=self.log),
        emit,
    )
def process_fec_year(year):
    """Convert one year of FEC fixed-width dumps into a SQL insert script.

    Reads foiacm.dta (committees), foiacn.dta (candidates) and itcont.dta
    (individual contributions) from the <year>/ directory and appends
    INSERT statements to fec<year>.sql -- note append mode, so reruns
    accumulate rows in the output file.
    """
    # committees
    source = FixedWidthFileSource(open('%s/foiacm.dta' % year), CM_FIELDS)
    #sqlite = SqliteOutput('fec%s.sqlite' % year, 'committee', [f[0] for f in CM_FIELDS if f[0] != 'filler'])
    # NOTE(review): committee/contribution emitters exclude only the exact
    # column name 'filler', while the candidate emitter below excludes any
    # 'filler*' prefix -- presumably CM_FIELDS/INDIV_FIELDS each have a
    # single filler column; confirm against the field definitions.
    emit_mysql = SqlDumpEmitter(open('fec%s.sql' % year, 'a'), 'committee', [f[0] for f in CM_FIELDS if f[0] != 'filler'])
    run_recipe(source, emit_mysql)
    # candidate
    source = FixedWidthFileSource(open('%s/foiacn.dta' % year), CN_FIELDS)
    # CN_FIELDS carries two filler columns; strip both from each record.
    fieldremover = FieldRemover(('fillerA', 'fillerB'))
    #sqlite = SqliteOutput('fec%s.sqlite' % year, 'candidate', [f[0] for f in CN_FIELDS if f[0] != 'filler'])
    emit_mysql = SqlDumpEmitter(
        open('fec%s.sql' % year, 'a'), 'candidate',
        [f[0] for f in CN_FIELDS if not f[0].startswith('filler')])
    run_recipe(source, fieldremover, emit_mysql)
    # contributions
    source = FixedWidthFileSource(open('%s/itcont.dta' % year), INDIV_FIELDS)
    # fix_cobol_number presumably decodes COBOL-style signed numerics in
    # the amount column -- TODO confirm against its definition.
    decobolizer = FieldModifier(('amount', ), fix_cobol_number)
    #sqlite = SqliteOutput('fec%s.sqlite' % year, 'contribution', [f[0] for f in INDIV_FIELDS if f[0] != 'filler'])
    emit_mysql = SqlDumpEmitter(
        open('fec%s.sql' % year, 'a'), 'contribution',
        [f[0] for f in INDIV_FIELDS if f[0] != 'filler'])
    run_recipe(source, decobolizer, emit_mysql)
def load_prices():
    """Print CPI price rows from the regional cu.data files.

    The sqlite load is left disabled; records go to the DebugEmitter.
    """
    filenames = (
        "cu.data.3.AsizeNorthEast",
        "cu.data.4.AsizeNorthCentral",
        "cu.data.5.AsizeSouth",
        "cu.data.6.AsizeWest",
        "cu.data.7.OtherNorthEast",
        "cu.data.8.OtherNorthCentral",
        "cu.data.9.OtherSouth",
        "cu.data.10.OtherWest",
    )
    paths = [os.path.join(CPI_ROOT, name) for name in filenames]
    # Layout for the (currently disabled) cpi_prices sqlite table.
    headers = ('area_code', 'item_code', 'year', 'periodicity', 'period', 'value')
    run_recipe(
        csv.DictReader(local_files(paths), delimiter='\t'),
        # Values arrive left-padded in the fixed-width-ish source.
        FieldModifier('value', lambda x: x.lstrip()),
        SeriesIDFilter(),
        # SqliteEmitter(DB, 'cpi_prices', fieldnames=headers),
        DebugEmitter(),
    )
def run(self):
    """Load bundle filings into the Bundle model via the Django ORM.

    Normalizes booleans, amounts, and dates, and derives the filing's
    PDF URL from its first image number.
    """
    def to_int(value):
        # Amounts may carry currency formatting ("$1,234.00"); strip it
        # before converting, matching the other bundle loaders in this
        # project (previously a bare float(x) that would raise on such
        # values). Empty values become None.
        if not value:
            return None
        return int(round(float(value.replace('$', '').replace(',', ''))))

    run_recipe(
        CSVSource(open(self.inpath)),
        FieldRenamer(self.field_map),
        # Values are [N|A]. Convert to boolean.
        FieldModifier('is_amendment', lambda x: x == 'A'),
        # Convert any stray floats to integers
        FieldModifier('reporting_period_amount semi_annual_amount'.split(), to_int),
        # Convert date formats
        FieldModifier('start_date end_date filing_date'.split(),
                      lambda x: datetime.strptime(x, '%m/%d/%Y') if x else None),
        # TODO: These following two lines (and the field value) need to be
        # thoroughly tested on the next bundling load
        FieldCopier({'pdf_url': 'first_image_num'}),
        # FEC serves filing PDFs keyed by the last three digits, then the
        # full image number.
        FieldModifier('pdf_url',
                      lambda x: 'http://query.nictusa.com/pdf/{0}/{1}/{1}.pdf'.format(x[-3:], x)),
        NoneFilter(),
        UnicodeFilter(),
        CountEmitter(every=200),
        # DebugEmitter(),
        DjangoModelEmitter('settings', Bundle)
    )
def load_cpi():
    """Pull the BLS CPI regional data series and emit 2008 rows to bls.csv.

    All four "A-size" regional files are combined into one stream before
    filtering by series id and year.
    """
    urls = { 'north_east': 'ftp://ftp.bls.gov/pub/time.series/cu/cu.data.3.AsizeNorthEast', 'north_central': 'ftp://ftp.bls.gov/pub/time.series/cu/cu.data.4.AsizeNorthCentral', 'south': 'ftp://ftp.bls.gov/pub/time.series/cu/cu.data.5.AsizeSouth', 'west': 'ftp://ftp.bls.gov/pub/time.series/cu/cu.data.6.AsizeWest', }
    # NOTE(review): url is only consumed by the commented-out CSVSource
    # below; the live recipe reads from the combined remote_files() stream.
    url = urls['west']
    # Tab-delimited BLS series column layout, also used as the output header.
    headers = ('series_id','survey_abbr','seasonal_code','periodicity_code', 'area_code','item_code','year','period','value','footnote_codes')
    reader = remote_files(*urls.values(), headers=True)
    run_recipe(
        #CSVSource(utils.RemoteFile(url), delimiter='\t'),
        CSVSource(reader, delimiter='\t'),
        # NOTE(review): str.strip used as an unbound method only accepts str
        # values; if the source yields unicode this raises TypeError --
        # confirm what remote_files() produces.
        FieldModifier(('series_id','value'), str.strip),
        SeriesIDFilter('series_id'),
        # Keep only rows for the 2008 reporting year.
        ValueConditionalFilter('year', '2008'),
        CSVEmitter(open('bls.csv', 'w'), headers)
    )
def run(self):
    """Load bundle filings into Bundle via SimpleDjangoModelEmitter."""
    def currency_to_int(value):
        # Strip "$" and thousands separators, then round to the nearest
        # integer; empty values become None.
        if not value:
            return None
        return int(round(float(value.replace('$', '').replace(',', ''))))

    def parse_date(value):
        return datetime.strptime(value, '%m/%d/%Y') if value else None

    def image_num_to_url(num):
        # PDFs are keyed by the last three digits, then the full number.
        return 'http://query.nictusa.com/pdf/{0}/{1}/{1}.pdf'.format(num[-3:], num)

    run_recipe(
        CSVSource(open(self.inpath)),
        FieldRenamer(self.field_map),
        # Values are [N|A]; convert to boolean.
        FieldModifier('is_amendment', lambda x: x == 'A'),
        FieldModifier(['reporting_period_amount', 'semi_annual_amount'], currency_to_int),
        FieldModifier(['start_date', 'end_date', 'filing_date'], parse_date),
        # TODO: the next two steps (and the field value) need thorough
        # testing on the next bundling load.
        FieldCopier({'pdf_url': 'first_image_num'}),
        FieldModifier('pdf_url', image_num_to_url),
        NoneFilter(),
        UnicodeFilter(),
        CountEmitter(every=200),
        # DebugEmitter(),
        SimpleDjangoModelEmitter(Bundle),
    )
def run(self): run_recipe( CSVSource(open(self.inpath)), FieldRenamer(self.field_map), # Values are [N|A]. Convert to boolean. FieldModifier("is_amendment", lambda x: x == "A"), # Convert any stray floats to integers FieldModifier( "reporting_period_amount semi_annual_amount".split(), lambda x: int(round(float(x.replace("$", "").replace(",", "")))) if x else None, ), # Convert date formats FieldModifier( "start_date end_date filing_date".split(), lambda x: datetime.strptime(x, "%m/%d/%Y") if x else None ), # TODO: These following two lines (and the field value) need to be thoroughly tested on the next bundling load FieldCopier({"pdf_url": "first_image_num"}), FieldModifier("pdf_url", lambda x: "http://query.nictusa.com/pdf/{0}/{1}/{1}.pdf".format(x[-3:], x)), NoneFilter(), UnicodeFilter(), CountEmitter(every=200), # DebugEmitter(), SimpleDjangoModelEmitter(Bundle), )
from saucebrush.filters import Splitter, PhoneNumberCleaner, FieldMerger, FieldAdder
from saucebrush.emitters import DebugEmitter
import operator
from itertools import count
import saucebrush

# One sample record: a nested person dict plus a list of phone dicts.
data = [{
    'person': {'firstname': 'James', 'lastname': 'Turk'},
    'phones': [
        {'phone': '222-222-2222'},
        {'phone': '(202) 333-3321'},
    ],
}]

# Join first/last name into a single 'name' field.
merge_name = FieldMerger({'name': ('firstname', 'lastname')},
                         lambda first, last: ' '.join((first, last)))
# Normalize phone number formatting.
clean_phone = PhoneNumberCleaner(('phone',))
# Route each nested section through its own sub-recipe.
split_sections = Splitter({'person': [merge_name], 'phones': [clean_phone]})
# Sequential ids from an itertools counter.
add_id = FieldAdder('id', count())

saucebrush.run_recipe(data, add_id, split_sections, DebugEmitter())
# Pull published 'act' / 'The Day in Transparency' posts (joined with
# their author and term rows) out of the WordPress tables, newest first.
query = """ SELECT p.ID, p.post_author, u.user_login, p.post_date, p.post_date_gmt, p.post_modified, p.post_modified_gmt, p.post_content, p.post_title, p.post_category, p.post_excerpt, p.guid, p.post_type FROM oh_posts p INNER JOIN oh_users u ON p.post_author = u.ID INNER JOIN oh_term_relationships tr ON p.ID = tr.object_id INNER JOIN oh_term_taxonomy tt ON tr.term_taxonomy_id = tt.term_taxonomy_id INNER JOIN oh_terms t ON tt.term_id = t.term_id WHERE p.post_status = 'publish' AND p.post_type = 'post' AND (t.name = 'act' or t.name = 'The Day in Transparency') ORDER BY p.post_date DESC """
mongo = Connection()
# NOTE(review): conn is the MySQL connection created earlier in this file;
# this chunk only uses and closes it.
saucebrush.run_recipe(
    MySQLSource(conn, query),
    MetaFilter(conn),
    TagFilter(conn),
    ContentFilter(),
    # Wrap post bodies as BSON Binary before storing in Mongo.
    FieldModifier(('post_content', 'post_excerpt'), lambda x: Binary(x)),
    MongoDBEmitter('openhouse', 'blog', drop_collection=True, conn=mongo),
    #DebugEmitter(),
)
# Echo the titles that landed in Mongo as a sanity check.
for d in mongo['openhouse']['blog'].find():
    print "------", d['post_title']
conn.close()
def test_run_recipe(self):
    """run_recipe should push every source record through the filters."""
    collector = Saver()
    run_recipe([1, 2], collector)
    self.assertEqual(collector.saved, [1, 2])