def write_large_blocks():
    """Benchmark bulk-writing all rows into an MPR file in a single insert call."""
    mpr = MPRowsFile(fs, 'foobar')
    if mpr.exists:
        mpr.remove()
    with Timer() as timer, mpr.writer as writer:
        writer.headers = headers
        writer.insert_rows(rows)
    # Report throughput in rows/second along with the row count actually written.
    print('MSGPack write L', float(N) / timer.elapsed, writer.n_rows)
def write_small_blocks():
    """Benchmark writing rows into an MPR file one row at a time."""
    mpr = MPRowsFile(fs, 'foobar')
    if mpr.exists:
        mpr.remove()
    with Timer() as timer, mpr.writer as writer:
        for idx in range(N):
            # Headers are re-assigned on every iteration, mirroring the
            # per-row overhead this benchmark is meant to measure.
            writer.headers = headers
            writer.insert_row(rows[idx])
    # Report throughput in rows/second along with the row count actually written.
    print('MSGPack write S', float(N) / timer.elapsed, writer.n_rows)
def test_load_check_headers(self):
    """Just check that all of the sources can be loaded without exceptions"""
    cache_fs = fsopendir('temp://')

    # Many sources share identical header sets; define each set once and
    # reference it from the expectation map below.
    rent_cols = [
        u('id'), u('gvid'), u('renter_cost_gt_30'), u('renter_cost_gt_30_cv'),
        u('owner_cost_gt_30_pct'), u('owner_cost_gt_30_pct_cv')]
    simple_cols = [u('id'), u('uuid'), u('int'), u('float')]

    # Expected headers, keyed by source name. Sources absent from this map
    # are still loaded but their headers are not checked.
    headers = {
        'mz_with_zip_xl': rent_cols,
        'mz_no_zip': simple_cols,
        'namesu8': [u('origin_english'), u('name_english'),
                    u('origin_native'), u('name_native')],
        'sf_zip': simple_cols,
        'simple': simple_cols,
        'csv_no_csv': simple_cols,
        'mz_with_zip': simple_cols,
        'rpeople': [u('name'), u('size')],
        'rent07': rent_cols,
        'simple_fixed': simple_cols,
        'altname': [u('id'), u('foo'), u('bar'), u('baz')],
        'rentcsv': rent_cols,
        'renttab': rent_cols,
        'multiexcel': rent_cols,
        'rent97': rent_cols,
    }

    # NOTE(review): a method with this same name appears again later in the
    # class; Python keeps only the last definition — consider renaming one.
    for source_name, spec in self.sources.items():
        print(source_name)
        source = get_source(spec, cache_fs, callback=lambda x, y: (x, y))

        mpr = MPRowsFile(cache_fs, spec.name)
        if mpr.exists:
            mpr.remove()
        mpr.load_rows(source)

        with mpr.reader as reader:
            if spec.name in headers:
                self.assertEqual(headers[spec.name], reader.headers)
def test_row_load_intuit(self):
    """Check that the sources can be loaded and analyzed without exceptions and that the
    guesses for headers and start are as expected"""
    from itertools import islice

    cache_fs = fsopendir('temp://')
    cache_fs.makedir('/mpr')
    # cache_fs = fsopendir('/tmp/ritest/')
    sources = self.load_sources('sources-non-std-headers.csv')

    # NOTE(review): a method with this same name is defined again immediately
    # after this one; Python keeps only the last definition — consider renaming.
    for source_name, spec in sources.items():
        source = get_source(spec, cache_fs, callback=lambda x, y: (x, y))

        mpr = MPRowsFile(cache_fs, '/mpr/' + source_name)
        if mpr.exists:
            mpr.remove()

        # Load without type intuition or stats so only row/header intuition runs.
        mpr.load_rows(source, intuit_type=False, run_stats=False, limit=500)

        self.assertEqual(mpr.info['data_start_row'], spec.expect_start)

        with mpr.reader as reader:
            # First row, marked with metadata, that is marked as a data row
            meta1, row1 = next(
                six.moves.filter(lambda entry: entry[0][2] == 'D', reader.meta_raw))

        with mpr.reader as reader:
            # First row
            row2 = next(reader.rows)

        with mpr.reader as reader:
            # First row proxy
            row3 = next(iter(reader)).row

        # All three access paths must yield the same first data row.
        self.assertEqual(row1, row2)
        self.assertEqual(row1, row3)

        with mpr.reader as reader:
            raw_rows = list(islice(reader.raw, None, 40))

        self.assertEqual(row2, raw_rows[mpr.info['data_start_row']])
def test_row_load_intuit(self):
    """Check that the sources can be loaded and analyzed without exceptions and that the
    guesses for headers and start are as expected"""
    from itertools import islice

    cache_fs = fsopendir('temp://')
    cache_fs.makedir('/mpr')
    # cache_fs = fsopendir('/tmp/ritest/')
    sources = self.load_sources('sources-non-std-headers.csv')

    # NOTE(review): this duplicates the name of the preceding method; Python
    # silently keeps only this later definition — consider renaming one.
    for source_name, spec in sources.items():
        source = get_source(spec, cache_fs, callback=lambda x, y: (x, y))

        mpr = MPRowsFile(cache_fs, '/mpr/' + source_name)
        if mpr.exists:
            mpr.remove()

        # Load without type intuition or stats so only row/header intuition runs.
        mpr.load_rows(source, intuit_type=False, run_stats=False, limit=500)

        self.assertEqual(mpr.info['data_start_row'], spec.expect_start)

        with mpr.reader as reader:
            # First row, marked with metadata, that is marked as a data row
            meta1, row1 = next(
                six.moves.filter(lambda entry: entry[0][2] == 'D', reader.meta_raw))

        with mpr.reader as reader:
            # First row
            row2 = next(reader.rows)

        with mpr.reader as reader:
            # First row proxy
            row3 = next(iter(reader)).row

        # All three access paths must yield the same first data row.
        self.assertEqual(row1, row2)
        self.assertEqual(row1, row3)

        with mpr.reader as reader:
            raw_rows = list(islice(reader.raw, None, 40))

        self.assertEqual(row2, raw_rows[mpr.info['data_start_row']])
def test_full_load(self):
    """Just check that all of the sources can be loaded without exceptions"""
    cache_fs = fsopendir('temp://')

    for source_name, spec in self.sources.items():
        source = get_source(spec, cache_fs, callback=lambda x, y: (x, y))

        mpr = MPRowsFile(cache_fs, spec.name)
        if mpr.exists:
            mpr.remove()
        mpr.load_rows(source)

        # A successful load must produce at least one header column.
        with mpr.reader as reader:
            self.assertTrue(len(reader.headers) > 0)
def test_bad_row_intuition(self):
    """Load a remote CSV and check that row intuition finds the expected end row."""
    from ambry_sources.mpf import MPRowsFile
    from ambry_sources.sources.spec import SourceSpec

    cache_fs = fsopendir('temp://')

    # NOTE(review): two later methods reuse this same name; Python keeps only
    # the last definition, so this test never runs — consider renaming.
    spec = SourceSpec(
        'http://public.source.civicknowledge.com/example.com/sources/simple-example.csv',
        name='simple')
    source = get_source(spec, cache_fs, callback=lambda x, y: (x, y))

    mpr = MPRowsFile(cache_fs, spec.name)
    if mpr.exists:
        mpr.remove()
    mpr.load_rows(source)

    self.assertEqual(10001, mpr.reader.info['data_end_row'])
def test_bad_row_intuition(self):
    """Load a fixed-width Census geofile and check the intuited data end row.

    Downloads the Arizona ACS 2009 summary-file zip, parses the fixed-width
    geo file using an explicit column layout, and asserts that row intuition
    places ``data_end_row`` at 119.
    """
    from ambry_sources.mpf import MPRowsFile
    from ambry_sources.sources.spec import SourceSpec, ColumnSpec

    cache_fs = fsopendir('temp://')

    spec = SourceSpec(
        'http://www2.census.gov/acs2009_1yr/summaryfile/Entire_States/Arizona.zip',
        # BUG FIX: the pattern was a plain string with the invalid escape
        # sequence '\.'; a raw string keeps the regex literally intact.
        file=r'g2009.*\.txt',
        filetype='fixed',
        name='geofile',
        encoding='latin1',
    )

    # Fixed-width layout: (position, width, name, start), 1-based columns.
    # Gaps in position numbers are intentional — unused fields are skipped.
    layout = [
        (1, 6, 'fileid', 1), (2, 2, 'stusab', 7), (3, 3, 'sumlevel', 9),
        (4, 2, 'component', 12), (5, 7, 'logrecno', 14), (6, 1, 'us', 21),
        (7, 1, 'region', 22), (8, 1, 'division', 23), (9, 2, 'statece', 24),
        (10, 2, 'state', 26), (11, 3, 'county', 28), (12, 5, 'cousub', 31),
        (13, 5, 'place', 36), (14, 6, 'tract', 41), (15, 1, 'blkgrp', 47),
        (16, 5, 'concit', 48), (17, 4, 'aianhh', 53), (18, 5, 'aianhhfp', 57),
        (19, 1, 'aihhtli', 62), (20, 3, 'aitsce', 63), (21, 5, 'aits', 66),
        (22, 5, 'anrc', 71), (23, 5, 'cbsa', 76), (24, 3, 'csa', 81),
        (25, 5, 'metdiv', 84), (26, 1, 'macc', 89), (27, 1, 'memi', 90),
        (28, 5, 'necta', 91), (29, 3, 'cnecta', 96), (30, 5, 'nectadiv', 99),
        (31, 5, 'ua', 104), (33, 2, 'cdcurr', 114), (34, 3, 'sldu', 116),
        (35, 3, 'sldl', 119), (39, 5, 'submcd', 136), (40, 5, 'sdelm', 141),
        (41, 5, 'sdsec', 146), (42, 5, 'sduni', 151), (43, 1, 'ur', 156),
        (44, 1, 'pci', 157), (47, 5, 'puma5', 169), (49, 40, 'geoid', 179),
        (50, 200, 'name', 219),
    ]
    spec.columns = [
        ColumnSpec(position=p, width=w, name=n, start=s)
        for p, w, n, s in layout
    ]

    source = get_source(spec, cache_fs, callback=lambda x, y: (x, y))

    mpr = MPRowsFile(cache_fs, spec.name)
    if mpr.exists:
        mpr.remove()
    mpr.load_rows(source)

    self.assertEqual(119, mpr.reader.info['data_end_row'])
def test_load_check_headers(self):
    """Just check that all of the sources can be loaded without exceptions"""
    cache_fs = fsopendir('temp://')

    # Several sources share identical header sets; define each set once and
    # reference it from the expectation map below.
    rent_cols = [
        u('id'), u('gvid'), u('renter_cost_gt_30'), u('renter_cost_gt_30_cv'),
        u('owner_cost_gt_30_pct'), u('owner_cost_gt_30_pct_cv')]
    simple_cols = [u('id'), u('uuid'), u('int'), u('float')]

    # Expected headers, keyed by source name. Sources absent from this map
    # are still loaded but their headers are not checked.
    headers = {
        'mz_with_zip_xl': rent_cols,
        'mz_no_zip': simple_cols,
        'namesu8': [u('origin_english'), u('name_english'),
                    u('origin_native'), u('name_native')],
        'sf_zip': simple_cols,
        'simple': simple_cols,
        'csv_no_csv': simple_cols,
        'mz_with_zip': simple_cols,
        'rpeople': [u('name'), u('size')],
        'rent07': rent_cols,
        'simple_fixed': simple_cols,
        'altname': [u('id'), u('foo'), u('bar'), u('baz')],
        'rentcsv': rent_cols,
        'renttab': rent_cols,
        'multiexcel': rent_cols,
        'rent97': rent_cols,
    }

    # NOTE(review): this duplicates the name of an earlier method in the
    # class; Python silently keeps only this later definition.
    for source_name, spec in self.sources.items():
        print(source_name)
        source = get_source(spec, cache_fs, callback=lambda x, y: (x, y))

        mpr = MPRowsFile(cache_fs, spec.name)
        if mpr.exists:
            mpr.remove()
        mpr.load_rows(source)

        with mpr.reader as reader:
            if spec.name in headers:
                self.assertEqual(headers[spec.name], reader.headers)
def test_bad_row_intuition(self):
    """Load a fixed-width Census geofile and check the intuited data end row.

    Downloads the Arizona ACS 2009 summary-file zip, parses the fixed-width
    geo file using an explicit column layout, and asserts that row intuition
    places ``data_end_row`` at 119.
    """
    from ambry_sources.mpf import MPRowsFile
    from ambry_sources.sources.spec import SourceSpec, ColumnSpec

    cache_fs = fsopendir('temp://')

    spec = SourceSpec(
        'http://www2.census.gov/acs2009_1yr/summaryfile/Entire_States/Arizona.zip',
        # BUG FIX: the pattern was a plain string with the invalid escape
        # sequence '\.'; a raw string keeps the regex literally intact.
        file=r'g2009.*\.txt',
        filetype='fixed',
        name='geofile',
        encoding='latin1',
    )

    # Fixed-width layout: (position, width, name, start), 1-based columns.
    # Gaps in position numbers are intentional — unused fields are skipped.
    layout = [
        (1, 6, 'fileid', 1), (2, 2, 'stusab', 7), (3, 3, 'sumlevel', 9),
        (4, 2, 'component', 12), (5, 7, 'logrecno', 14), (6, 1, 'us', 21),
        (7, 1, 'region', 22), (8, 1, 'division', 23), (9, 2, 'statece', 24),
        (10, 2, 'state', 26), (11, 3, 'county', 28), (12, 5, 'cousub', 31),
        (13, 5, 'place', 36), (14, 6, 'tract', 41), (15, 1, 'blkgrp', 47),
        (16, 5, 'concit', 48), (17, 4, 'aianhh', 53), (18, 5, 'aianhhfp', 57),
        (19, 1, 'aihhtli', 62), (20, 3, 'aitsce', 63), (21, 5, 'aits', 66),
        (22, 5, 'anrc', 71), (23, 5, 'cbsa', 76), (24, 3, 'csa', 81),
        (25, 5, 'metdiv', 84), (26, 1, 'macc', 89), (27, 1, 'memi', 90),
        (28, 5, 'necta', 91), (29, 3, 'cnecta', 96), (30, 5, 'nectadiv', 99),
        (31, 5, 'ua', 104), (33, 2, 'cdcurr', 114), (34, 3, 'sldu', 116),
        (35, 3, 'sldl', 119), (39, 5, 'submcd', 136), (40, 5, 'sdelm', 141),
        (41, 5, 'sdsec', 146), (42, 5, 'sduni', 151), (43, 1, 'ur', 156),
        (44, 1, 'pci', 157), (47, 5, 'puma5', 169), (49, 40, 'geoid', 179),
        (50, 200, 'name', 219),
    ]
    spec.columns = [
        ColumnSpec(position=p, width=w, name=n, start=s)
        for p, w, n, s in layout
    ]

    source = get_source(spec, cache_fs, callback=lambda x, y: (x, y))

    mpr = MPRowsFile(cache_fs, spec.name)
    if mpr.exists:
        mpr.remove()
    mpr.load_rows(source)

    self.assertEqual(119, mpr.reader.info['data_end_row'])