def test_creates_virtual_tables_for_partition_with_segment_without_errors(self):
    """add_partition() on a freshly loaded MPRowsFile must not raise, and the
    resulting virtual table must return exactly the generated rows.
    """
    fs = fsopendir('temp://')

    def gen():
        # generate header
        yield ['col1', 'col2']
        # generate rows
        yield [0, 0]
        yield [1, 1]

    mprows = MPRowsFile(fs, 'example.com/simple-0.1.3/1.mpr')
    mprows.load_rows(GeneratorSource(SourceSpec('foobar'), gen()))

    # create virtual tables. This should not raise an error.
    #
    connection = apsw.Connection(':memory:')
    try:
        add_partition(connection, mprows, 'vid1')
    except Exception as exc:
        # FIX: corrected the misspelling 'unexpectadly' in the failure message.
        raise AssertionError(
            'partition adding unexpectedly failed with {} error.'.format(exc))

    # check selected rows
    #
    cursor = connection.cursor()
    result = cursor.execute('SELECT * FROM {}'.format('vid1')).fetchall()
    self.assertEqual(result, [(0, 0), (1, 1)])
def test_creates_virtual_tables_for_partition_with_segment_without_errors(self):
    """add_partition() on a freshly loaded MPRowsFile must not raise, and the
    resulting virtual table must return exactly the generated rows.
    """
    fs = fsopendir('temp://')

    def gen():
        # generate header
        yield ['col1', 'col2']
        # generate rows
        yield [0, 0]
        yield [1, 1]

    mprows = MPRowsFile(fs, 'example.com/simple-0.1.3/1.mpr')
    mprows.load_rows(GeneratorSource(SourceSpec('foobar'), gen()))

    # create virtual tables. This should not raise an error.
    #
    connection = apsw.Connection(':memory:')
    try:
        add_partition(connection, mprows, 'vid1')
    except Exception as exc:
        # FIX: corrected the misspelling 'unexpectadly' in the failure message.
        raise AssertionError('partition adding unexpectedly failed with {} error.'.format(exc))

    # check selected rows
    #
    cursor = connection.cursor()
    result = cursor.execute('SELECT * FROM {}'.format('vid1')).fetchall()
    self.assertEqual(result, [(0, 0), (1, 1)])
def test_load_check_headers(self):
    """Just check that all of the sources can be loaded without exceptions"""
    cache_fs = fsopendir('temp://')

    # The same two header layouts recur for most sources; name them once.
    rent_cols = [
        u('id'), u('gvid'), u('renter_cost_gt_30'), u('renter_cost_gt_30_cv'),
        u('owner_cost_gt_30_pct'), u('owner_cost_gt_30_pct_cv'),
    ]
    simple_cols = [u('id'), u('uuid'), u('int'), u('float')]

    headers = {
        'mz_with_zip_xl': rent_cols,
        'mz_no_zip': simple_cols,
        'namesu8': [
            u('origin_english'), u('name_english'),
            u('origin_native'), u('name_native'),
        ],
        'sf_zip': simple_cols,
        'simple': simple_cols,
        'csv_no_csv': simple_cols,
        'mz_with_zip': simple_cols,
        'rpeople': [u('name'), u('size')],
        'rent07': rent_cols,
        'simple_fixed': simple_cols,
        'altname': [u('id'), u('foo'), u('bar'), u('baz')],
        'rentcsv': rent_cols,
        'renttab': rent_cols,
        'multiexcel': rent_cols,
        'rent97': rent_cols,
    }

    for source_name, spec in self.sources.items():
        print(source_name)
        source = get_source(spec, cache_fs, callback=lambda x, y: (x, y))
        mpr = MPRowsFile(cache_fs, spec.name)
        if mpr.exists:
            mpr.remove()
        mpr.load_rows(source)

        with mpr.reader as reader:
            if spec.name in headers:
                self.assertEqual(headers[spec.name], reader.headers)
def test_row_load_intuit(self):
    """Check that the sources can be loaded and analyzed without exceptions
    and that the guesses for headers and start are as expected"""
    from itertools import islice

    cache_fs = fsopendir('temp://')
    cache_fs.makedir('/mpr')
    # cache_fs = fsopendir('/tmp/ritest/')
    sources = self.load_sources('sources-non-std-headers.csv')

    for source_name, spec in sources.items():
        s = get_source(spec, cache_fs, callback=lambda x, y: (x, y))
        f = MPRowsFile(cache_fs, '/mpr/' + source_name)
        if f.exists:
            f.remove()

        f.load_rows(s, intuit_type=False, run_stats=False, limit=500)
        self.assertEqual(f.info['data_start_row'], spec.expect_start)

        with f.reader as r:
            # First row, marked with metadata, that is marked as a data row.
            # FIX: the metadata half of the tuple was bound to an unused
            # name ('m1'); discard it explicitly instead.
            _, row1 = next(
                six.moves.filter(lambda e: e[0][2] == 'D', r.meta_raw))

        with f.reader as r:
            # First row
            row2 = next(r.rows)

        with f.reader as r:
            # First row proxy
            row3 = next(iter(r)).row

        self.assertEqual(row1, row2)
        self.assertEqual(row1, row3)

        with f.reader as r:
            raw_rows = list(islice(r.raw, None, 40))

        self.assertEqual(row2, raw_rows[f.info['data_start_row']])
def test_row_load_intuit(self):
    """Check that the sources can be loaded and analyzed without exceptions
    and that the guesses for headers and start are as expected"""
    from itertools import islice

    cache_fs = fsopendir('temp://')
    cache_fs.makedir('/mpr')
    # cache_fs = fsopendir('/tmp/ritest/')
    sources = self.load_sources('sources-non-std-headers.csv')

    for source_name, spec in sources.items():
        s = get_source(spec, cache_fs, callback=lambda x, y: (x, y))
        f = MPRowsFile(cache_fs, '/mpr/' + source_name)
        if f.exists:
            f.remove()

        f.load_rows(s, intuit_type=False, run_stats=False, limit=500)
        self.assertEqual(f.info['data_start_row'], spec.expect_start)

        with f.reader as r:
            # First row, marked with metadata, that is marked as a data row.
            # FIX: the metadata half of the tuple was bound to an unused
            # name ('m1'); discard it explicitly instead.
            _, row1 = next(six.moves.filter(lambda e: e[0][2] == 'D', r.meta_raw))

        with f.reader as r:
            # First row
            row2 = next(r.rows)

        with f.reader as r:
            # First row proxy
            row3 = next(iter(r)).row

        self.assertEqual(row1, row2)
        self.assertEqual(row1, row3)

        with f.reader as r:
            raw_rows = list(islice(r.raw, None, 40))

        self.assertEqual(row2, raw_rows[f.info['data_start_row']])
def test_full_load(self):
    """Just check that all of the sources can be loaded without exceptions"""
    cache_fs = fsopendir('temp://')

    for _source_name, spec in self.sources.items():
        source = get_source(spec, cache_fs, callback=lambda x, y: (x, y))

        mpr = MPRowsFile(cache_fs, spec.name)
        if mpr.exists:
            mpr.remove()
        mpr.load_rows(source)

        # A successful load must produce at least one header column.
        with mpr.reader as reader:
            self.assertTrue(len(reader.headers) > 0)
def test_bad_row_intuition(self):
    from ambry_sources.mpf import MPRowsFile
    from ambry_sources.sources.spec import SourceSpec

    cache_fs = fsopendir('temp://')

    # Remote CSV with a known row count; the intuiter must find the true end.
    spec = SourceSpec(
        'http://public.source.civicknowledge.com/example.com/sources/simple-example.csv',
        name='simple')

    source = get_source(spec, cache_fs, callback=lambda x, y: (x, y))

    mpr = MPRowsFile(cache_fs, spec.name)
    if mpr.exists:
        mpr.remove()
    mpr.load_rows(source)

    self.assertEqual(10001, mpr.reader.info['data_end_row'])
def test_selects_correct_rows_from_many_mprows(self):
    fs = fsopendir('temp://')
    header = ['col1', 'col2']

    # Three partitions, keyed by the virtual-table name they are mounted as.
    datasets = [
        ('vid1', [(0, 0), (1, 1)]),
        ('vid2', [(2, 2), (3, 3)]),
        ('vid3', [(4, 4), (5, 5)]),
    ]

    # create 3 mprows files and a virtual table for each.
    #
    connection = apsw.Connection(':memory:')
    for vid, rows in datasets:
        mprows = MPRowsFile(fs, vid)
        mprows.load_rows(self._get_generator_source(header, rows))
        add_partition(connection, mprows, vid)

    # check rows of all added mprows.
    #
    cursor = connection.cursor()
    for vid, rows in datasets:
        result = cursor.execute('SELECT * FROM {};'.format(vid)).fetchall()
        self.assertEqual(result, rows)
def test_bad_row_intuition(self):
    """Load a fixed-width census geofile and check that row intuition finds
    the correct end of the data."""
    from ambry_sources.mpf import MPRowsFile
    from ambry_sources.sources.spec import SourceSpec, ColumnSpec

    cache_fs = fsopendir('temp://')

    spec = SourceSpec(
        'http://www2.census.gov/acs2009_1yr/summaryfile/Entire_States/Arizona.zip',
        # FIX: raw string — '\.' is an invalid escape in a plain string and
        # raises a SyntaxWarning/DeprecationWarning on modern Python.
        file=r'g2009.*\.txt',
        filetype='fixed',
        name='geofile',
        encoding='latin1',
    )

    # Fixed-width column layout of the ACS 2009 geography file.
    spec.columns = [
        ColumnSpec(position=1, width=6, name='fileid', start=1),
        ColumnSpec(position=2, width=2, name='stusab', start=7),
        ColumnSpec(position=3, width=3, name='sumlevel', start=9),
        ColumnSpec(position=4, width=2, name='component', start=12),
        ColumnSpec(position=5, width=7, name='logrecno', start=14),
        ColumnSpec(position=6, width=1, name='us', start=21),
        ColumnSpec(position=7, width=1, name='region', start=22),
        ColumnSpec(position=8, width=1, name='division', start=23),
        ColumnSpec(position=9, width=2, name='statece', start=24),
        ColumnSpec(position=10, width=2, name='state', start=26),
        ColumnSpec(position=11, width=3, name='county', start=28),
        ColumnSpec(position=12, width=5, name='cousub', start=31),
        ColumnSpec(position=13, width=5, name='place', start=36),
        ColumnSpec(position=14, width=6, name='tract', start=41),
        ColumnSpec(position=15, width=1, name='blkgrp', start=47),
        ColumnSpec(position=16, width=5, name='concit', start=48),
        ColumnSpec(position=17, width=4, name='aianhh', start=53),
        ColumnSpec(position=18, width=5, name='aianhhfp', start=57),
        ColumnSpec(position=19, width=1, name='aihhtli', start=62),
        ColumnSpec(position=20, width=3, name='aitsce', start=63),
        ColumnSpec(position=21, width=5, name='aits', start=66),
        ColumnSpec(position=22, width=5, name='anrc', start=71),
        ColumnSpec(position=23, width=5, name='cbsa', start=76),
        ColumnSpec(position=24, width=3, name='csa', start=81),
        ColumnSpec(position=25, width=5, name='metdiv', start=84),
        ColumnSpec(position=26, width=1, name='macc', start=89),
        ColumnSpec(position=27, width=1, name='memi', start=90),
        ColumnSpec(position=28, width=5, name='necta', start=91),
        ColumnSpec(position=29, width=3, name='cnecta', start=96),
        ColumnSpec(position=30, width=5, name='nectadiv', start=99),
        ColumnSpec(position=31, width=5, name='ua', start=104),
        ColumnSpec(position=33, width=2, name='cdcurr', start=114),
        ColumnSpec(position=34, width=3, name='sldu', start=116),
        ColumnSpec(position=35, width=3, name='sldl', start=119),
        ColumnSpec(position=39, width=5, name='submcd', start=136),
        ColumnSpec(position=40, width=5, name='sdelm', start=141),
        ColumnSpec(position=41, width=5, name='sdsec', start=146),
        ColumnSpec(position=42, width=5, name='sduni', start=151),
        ColumnSpec(position=43, width=1, name='ur', start=156),
        ColumnSpec(position=44, width=1, name='pci', start=157),
        ColumnSpec(position=47, width=5, name='puma5', start=169),
        ColumnSpec(position=49, width=40, name='geoid', start=179),
        ColumnSpec(position=50, width=200, name='name', start=219)
    ]

    s = get_source(spec, cache_fs, callback=lambda x, y: (x, y))

    f = MPRowsFile(cache_fs, spec.name)
    if f.exists:
        f.remove()
    f.load_rows(s)

    self.assertEqual(119, f.reader.info['data_end_row'])
def test_load_check_headers(self):
    """Just check that all of the sources can be loaded without exceptions"""
    cache_fs = fsopendir('temp://')

    # Two header layouts are shared by most of the fixtures; define them once.
    rent_header = [
        u('id'), u('gvid'), u('renter_cost_gt_30'), u('renter_cost_gt_30_cv'),
        u('owner_cost_gt_30_pct'), u('owner_cost_gt_30_pct_cv'),
    ]
    simple_header = [u('id'), u('uuid'), u('int'), u('float')]

    headers = {
        'mz_with_zip_xl': rent_header,
        'mz_no_zip': simple_header,
        'namesu8': [
            u('origin_english'), u('name_english'),
            u('origin_native'), u('name_native'),
        ],
        'sf_zip': simple_header,
        'simple': simple_header,
        'csv_no_csv': simple_header,
        'mz_with_zip': simple_header,
        'rpeople': [u('name'), u('size')],
        'rent07': rent_header,
        'simple_fixed': simple_header,
        'altname': [u('id'), u('foo'), u('bar'), u('baz')],
        'rentcsv': rent_header,
        'renttab': rent_header,
        'multiexcel': rent_header,
        'rent97': rent_header,
    }

    for source_name, spec in self.sources.items():
        print(source_name)
        src = get_source(spec, cache_fs, callback=lambda x, y: (x, y))
        mpr = MPRowsFile(cache_fs, spec.name)
        if mpr.exists:
            mpr.remove()
        mpr.load_rows(src)

        with mpr.reader as reader:
            if spec.name in headers:
                self.assertEqual(headers[spec.name], reader.headers)
def test_bad_row_intuition(self):
    """Load a fixed-width census geofile and check that row intuition finds
    the correct end of the data."""
    from ambry_sources.mpf import MPRowsFile
    from ambry_sources.sources.spec import SourceSpec, ColumnSpec

    cache_fs = fsopendir('temp://')

    spec = SourceSpec(
        'http://www2.census.gov/acs2009_1yr/summaryfile/Entire_States/Arizona.zip',
        # FIX: raw string — '\.' is an invalid escape in a plain string and
        # raises a SyntaxWarning/DeprecationWarning on modern Python.
        file=r'g2009.*\.txt',
        filetype='fixed',
        name='geofile',
        encoding='latin1',
    )

    # Fixed-width column layout of the ACS 2009 geography file.
    spec.columns = [
        ColumnSpec(position=1, width=6, name='fileid', start=1),
        ColumnSpec(position=2, width=2, name='stusab', start=7),
        ColumnSpec(position=3, width=3, name='sumlevel', start=9),
        ColumnSpec(position=4, width=2, name='component', start=12),
        ColumnSpec(position=5, width=7, name='logrecno', start=14),
        ColumnSpec(position=6, width=1, name='us', start=21),
        ColumnSpec(position=7, width=1, name='region', start=22),
        ColumnSpec(position=8, width=1, name='division', start=23),
        ColumnSpec(position=9, width=2, name='statece', start=24),
        ColumnSpec(position=10, width=2, name='state', start=26),
        ColumnSpec(position=11, width=3, name='county', start=28),
        ColumnSpec(position=12, width=5, name='cousub', start=31),
        ColumnSpec(position=13, width=5, name='place', start=36),
        ColumnSpec(position=14, width=6, name='tract', start=41),
        ColumnSpec(position=15, width=1, name='blkgrp', start=47),
        ColumnSpec(position=16, width=5, name='concit', start=48),
        ColumnSpec(position=17, width=4, name='aianhh', start=53),
        ColumnSpec(position=18, width=5, name='aianhhfp', start=57),
        ColumnSpec(position=19, width=1, name='aihhtli', start=62),
        ColumnSpec(position=20, width=3, name='aitsce', start=63),
        ColumnSpec(position=21, width=5, name='aits', start=66),
        ColumnSpec(position=22, width=5, name='anrc', start=71),
        ColumnSpec(position=23, width=5, name='cbsa', start=76),
        ColumnSpec(position=24, width=3, name='csa', start=81),
        ColumnSpec(position=25, width=5, name='metdiv', start=84),
        ColumnSpec(position=26, width=1, name='macc', start=89),
        ColumnSpec(position=27, width=1, name='memi', start=90),
        ColumnSpec(position=28, width=5, name='necta', start=91),
        ColumnSpec(position=29, width=3, name='cnecta', start=96),
        ColumnSpec(position=30, width=5, name='nectadiv', start=99),
        ColumnSpec(position=31, width=5, name='ua', start=104),
        ColumnSpec(position=33, width=2, name='cdcurr', start=114),
        ColumnSpec(position=34, width=3, name='sldu', start=116),
        ColumnSpec(position=35, width=3, name='sldl', start=119),
        ColumnSpec(position=39, width=5, name='submcd', start=136),
        ColumnSpec(position=40, width=5, name='sdelm', start=141),
        ColumnSpec(position=41, width=5, name='sdsec', start=146),
        ColumnSpec(position=42, width=5, name='sduni', start=151),
        ColumnSpec(position=43, width=1, name='ur', start=156),
        ColumnSpec(position=44, width=1, name='pci', start=157),
        ColumnSpec(position=47, width=5, name='puma5', start=169),
        ColumnSpec(position=49, width=40, name='geoid', start=179),
        ColumnSpec(position=50, width=200, name='name', start=219)
    ]

    s = get_source(spec, cache_fs, callback=lambda x, y: (x, y))

    f = MPRowsFile(cache_fs, spec.name)
    if f.exists:
        f.remove()
    f.load_rows(s)

    self.assertEqual(119, f.reader.info['data_end_row'])