def test_index(self):
    """Record indexing: explicit in-memory index, implicit disk index, and
    reloading a previously written disk index."""
    if sys.platform == 'win32':
        # TODO: this test fails on Windows because of file lock problems
        return
    num_records = 100000

    def check_records(adapter):
        # Spot-check random access at several offsets, including the last record.
        # Record i holds the five consecutive ints starting at i*5.
        for rec in (0, 10, 100, 1000, 10000, num_records - 1):
            self.assert_equality(adapter[rec].item(),
                                 tuple((rec * 5) + x for x in range(5)))

    data = StringIO()
    generate_dataset(data, IntIter(), ',', num_records)

    # Test explicit index building.
    adapter = iopro.text_adapter(data, delimiter=',', field_names=False, infer_types=False)
    adapter.set_field_types({0: 'u4', 1: 'u4', 2: 'u4', 3: 'u4', 4: 'u4'})
    adapter.create_index()
    check_records(adapter)
    # NOTE(review): negative indexing (adapter[-1]) is deliberately untested here.

    # Test implicitly creating a disk index on the fly.
    if os.path.exists('test.idx'):
        os.remove('test.idx')
    data.seek(0)
    adapter = iopro.text_adapter(data, delimiter=',', field_names=False,
                                 infer_types=False, index_name='test.idx')
    adapter.set_field_types({0: 'u4', 1: 'u4', 2: 'u4', 3: 'u4', 4: 'u4'})
    adapter.to_array()
    check_records(adapter)
    adapter.close()

    # Test loading the disk index written above.
    data.seek(0)
    adapter2 = iopro.text_adapter(data, delimiter=',', field_names=False,
                                  infer_types=False, index_name='test.idx')
    adapter2.set_field_types({0: 'u4', 1: 'u4', 2: 'u4', 3: 'u4', 4: 'u4'})
    check_records(adapter2)
    # BUG FIX: the original called adapter.close() here (already closed),
    # leaving adapter2's handle on test.idx open.
    adapter2.close()
    os.remove('test.idx')
def test_missing_fill_values(self):
    """Missing-value tokens are replaced by configured fill values, or by the
    dtype default (0) when no fill value is set; a short record leaves the
    missing trailing field at its fill value."""
    data = StringIO()
    generate_dataset(data, MissingValuesIter(), ',', self.num_records)
    adapter = iopro.text_adapter(data, delimiter=',', field_names=False, infer_types=False)
    # Fields may be addressed by name ('f0') or index interchangeably.
    adapter.set_field_types({'f0':'u4', 1:'u4', 2:'u4', 3:'u4', 'f4':'u4'})
    adapter.set_missing_values({0:['NA', 'NaN'], 'f4':['xx','inf']})
    adapter.set_fill_values({0:99, 4:999})
    array = adapter[:]
    self.assert_equality(array.size, self.num_records)
    record = [x for x in range(0, 5)]
    for i in range(0, self.num_records):
        # Generator layout (presumably): rows where i % 4 is 0 or 1 carry the
        # missing-value tokens in fields 0 and 4 — TODO confirm against
        # MissingValuesIter.
        if i % 4 == 0 or i % 4 == 1:
            record[0] = 99      # filled with configured value
            record[4] = 999
        else:
            record[0] = record[1] - 1
            record[4] = record[3] + 1
        self.assert_equality(array[i].item(), tuple(record))
        # Middle fields advance by 5 per record.
        record[1] += 5
        record[2] += 5
        record[3] += 5

    # Same data with inferred types and no explicit fill values:
    # missing fields default to 0.
    data.seek(0)
    adapter = iopro.text_adapter(data, delimiter=',', field_names=False, infer_types=True)
    adapter.set_missing_values({0:['NA', 'NaN'], 4:['xx','inf']})
    array = adapter[:]
    self.assert_equality(array.size, self.num_records)
    record = [x for x in range(0, 5)]
    for i in range(0, self.num_records):
        if i % 4 == 0 or i % 4 == 1:
            record[0] = 0
            record[4] = 0
        else:
            record[0] = record[1] - 1
            record[4] = record[3] + 1
        self.assert_equality(array[i].item(), tuple(record))
        record[1] += 5
        record[2] += 5
        record[3] += 5

    # Test missing field
    data = StringIO('1,2,3\n4,5\n7,8,9')
    adapter = iopro.text_adapter(data, field_names=False)
    adapter.field_types = {0:'O', 1:'O', 2:'O'}
    adapter.set_fill_values({0:np.nan, 1:np.nan, 2:np.nan})
    array = adapter[:]
    # NumPy assert_array_equal no longer supports mixed O/nan types
    expected = [('1','2','3'),('4','5',np.nan),('7','8','9')]
    self.assert_equality(array.tolist(), expected)
def test_quoted_whitespace(self):
    """Whitespace inside quoted fields (spaces and tabs) is kept verbatim."""
    def check(text, expect, **kwargs):
        # Parse `text` into three 3-byte string fields and compare to `expect`.
        reader = iopro.text_adapter(StringIO(text), field_names=False, **kwargs)
        reader.set_field_types({0: 'S3', 1: 'S3', 2: 'S3'})
        assert_array_equal(reader[:], np.array([expect], dtype='S3,S3,S3'))

    check('"1 ","2 ","3 "\n', ('1 ', '2 ', '3 '))
    check('"\t1\t"\t"\t2\t"\t"\t3\t"\n', ('\t1\t', '\t2\t', '\t3\t'), delimiter='\t')
def test_utf8_parsing(self):
    """UTF-8 input decodes correctly for both one-byte and multi-byte chars."""
    # Single-byte character: u'\u0033' is just ASCII '3', so the column
    # still parses as an unsigned integer.
    stream = io.BytesIO(u'1,2,\u0033'.encode('utf_8'))
    reader = iopro.text_adapter(stream, field_names=False)
    assert_array_equal(reader[:], np.array([('1', '2', '3')], dtype='u8,u8,u8'))

    # Multi-byte character: the non-numeric column becomes an object field.
    stream = io.BytesIO(u'1,2,\u2092'.encode('utf_8'))
    reader = iopro.text_adapter(stream, field_names=False)
    assert_array_equal(reader[:], np.array([('1', '2', u'\u2092')], dtype='u8,u8,O'))
def test_comments(self):
    """Lines starting with '#' are skipped by default; comment=None keeps them."""
    text = '1,2,3\n#4,5,6'

    parsed = iopro.text_adapter(StringIO(text), field_names=False)[:]
    self.assert_equality(parsed.size, 1)
    self.assert_equality(parsed[0].item(), (1, 2, 3))

    # With comments disabled, the '#' line is data; the first column is
    # inferred as a string because of the leading '#'.
    parsed = iopro.text_adapter(StringIO(text), field_names=False, comment=None)[:]
    self.assert_equality(parsed.size, 2)
    self.assert_equality(parsed[0].item(), ('1', 2, 3))
    self.assert_equality(parsed[1].item(), ('#4', 5, 6))
def test_string_parsing(self):
    """'S5' fields parse identically from str, unicode and bytes streams."""
    expected = np.array([('1', '2', '3')], dtype='S5,S5,S5')
    for stream in (StringIO('1,2,3\n'),
                   io.StringIO(u'1,2,3\n'),
                   io.BytesIO(b'1,2,3\n')):
        reader = iopro.text_adapter(stream, field_names=False)
        reader.set_field_types({0: 'S5', 1: 'S5', 2: 'S5'})
        assert_array_equal(reader[:], expected)
def test_converters(self):
    """Per-field converter callables replace the default parsing for a field;
    fields are addressable by index or by generated name ('f1')."""
    data = StringIO()
    generate_dataset(data, IntIter(), ',', self.num_records)
    adapter = iopro.text_adapter(data, delimiter=',', field_names=False)
    #adapter.set_field_types({0:'u4', 1:'u4', 2:'u4', 3:'u4', 4:'u4'})

    def increment(input_str):
        # Converter for field 0: parsed value + 1.
        return int(input_str) + 1

    def double(input_str):
        # Converter for field 1: parsed value doubled.
        return int(input_str) + int(input_str)

    # tuple.__itemsize__ == 8 distinguishes 64-bit builds.
    if sys.platform == 'win32' and tuple.__itemsize__ == 8:
        # TODO: there are problems below on 64-bit Windows; I get
        # OverflowError: can't convert negative value to unsigned PY_LONG_LONG
        return
    adapter.set_converter(0, increment)
    adapter.set_converter('f1', double)
    array = adapter[:]
    self.assert_equality(array.size, self.num_records)
    # Row i of the source is (5i, 5i+1, 5i+2, 5i+3, 5i+4); field 0 is
    # incremented and field 1 doubled by the converters above.
    record = [1, 2, 2, 3, 4]
    for i in range(0, self.num_records):
        self.assert_equality(array[i].item(), tuple(record))
        record[0] += 5
        record[1] = (10 * (i+1)) + 2   # doubled value of next row's field 1
        record[2] += 5
        record[3] += 5
        record[4] += 5
def test_csv(self):
    """Blank lines in CSV input are skipped rather than yielding empty records."""
    result = iopro.text_adapter(StringIO('1,2,3\n\n4,5,6'), field_names=False)[:]
    expected = np.array([(1, 2, 3), (4, 5, 6)],
                        dtype=[('f0', '<u8'), ('f1', '<u8'), ('f2', '<u8')])
    assert_array_equal(result, expected)
def test_spaces_around_numeric_values(self):
    """Whitespace around signed ints and floats is ignored when parsing."""
    text = ' 1 , -2 , 3.3 , -4.4 \n 5 , -6 , 7.7 , -8.8 '
    reader = iopro.text_adapter(StringIO(text), field_names=False)
    reader.set_field_types({0: 'u4', 1: 'i8', 2: 'f4', 3: 'f8'})
    expected = np.array([(1, -2, 3.3, -4.4), (5, -6, 7.7, -8.8)],
                        dtype='u4,i8,f4,f8')
    assert_array_equal(reader[:], expected)
def test_auto_type_inference(self):
    """Type inference picks the widest type per column for a full read, and
    re-infers per slice when records are accessed piecemeal."""
    text = '0,1,2,3,4\n5.5,6,7,8,9\n10,11,12,13,14a\n15,16,xxx,18,19'

    array = iopro.text_adapter(StringIO(text), field_names=False, infer_types=True).to_array()
    for field, expect in (('f0', 'float64'), ('f1', 'uint64'), ('f2', 'O'),
                          ('f3', 'uint64'), ('f4', 'O')):
        self.assert_equality(array.dtype.fields[field][0], np.dtype(expect))

    # Each slice is inferred from only the records it touches.
    adapter = iopro.text_adapter(StringIO(text), field_names=False, infer_types=True)
    self.assert_equality(adapter[0].dtype.fields['f0'][0], np.dtype('uint64'))
    self.assert_equality(adapter[1:3].dtype.fields['f0'][0], np.dtype('float64'))
    self.assert_equality(adapter[3].dtype.fields['f4'][0], np.dtype('uint64'))
    self.assert_equality(adapter[:].dtype.fields['f3'][0], np.dtype('uint64'))
    self.assert_equality(adapter[-1].dtype.fields['f2'][0], np.dtype('O'))
    self.assert_equality(adapter[2].dtype.fields['f4'][0], np.dtype('O'))
def test_header_footer(self):
    """header=n skips n lines; with field_names=True the line after the
    header supplies the field names."""
    data = StringIO('0,1,2,3,4\n5,6,7,8,9\n10,11,12,13,14')

    reader = iopro.text_adapter(data, header=1, field_names=False)
    reader.field_types = dict(zip(range(5), ['u4'] * 5))
    assert_array_equal(reader[:],
                       np.array([(5, 6, 7, 8, 9), (10, 11, 12, 13, 14)],
                                dtype='u4,u4,u4,u4,u4'))

    data.seek(0)
    reader = iopro.text_adapter(data, header=2, field_names=False)
    reader.field_types = dict(zip(range(5), ['u4'] * 5))
    assert_array_equal(reader[:],
                       np.array([(10, 11, 12, 13, 14)], dtype='u4,u4,u4,u4,u4'))

    # Header skipped, then the '5,6,...' line is consumed as field names.
    data.seek(0)
    reader = iopro.text_adapter(data, header=1, field_names=True)
    reader.field_types = dict(zip(range(5), ['u4'] * 5))
    assert_array_equal(reader[:],
                       np.array([(10, 11, 12, 13, 14)],
                                dtype=[('5', 'u4'), ('6', 'u4'), ('7', 'u4'),
                                       ('8', 'u4'), ('9', 'u4')]))
def test_no_whitespace_stripping(self):
    """Unquoted leading/trailing whitespace is part of the field value."""
    def check(text, expect, width):
        # Parse three fields of fixed byte width and compare verbatim.
        reader = iopro.text_adapter(StringIO(text), field_names=False)
        reader.set_field_types({0: width, 1: width, 2: width})
        assert_array_equal(reader[:],
                           np.array([expect], dtype=','.join([width] * 3)))

    check('1 ,2 ,3 \n', ('1 ', '2 ', '3 '), 'S3')
    check(' 1, 2, 3\n', (' 1', ' 2', ' 3'), 'S3')
    check(' 1 , 2 , 3 \n', (' 1 ', ' 2 ', ' 3 '), 'S5')
    check('\t1\t,\t2\t,\t3\t\n', ('\t1\t', '\t2\t', '\t3\t'), 'S3')
def test_escapechar(self):
    """The escape character is dropped and the following char kept; a custom
    escape char can be supplied via `escape=`."""
    def parse(text, **kwargs):
        return iopro.text_adapter(StringIO(text), field_names=False, **kwargs)[:]

    # Escapes inside numeric fields: escape char removed, digits concatenated.
    assert_array_equal(parse('1,2\\2,3\n4,5\\5\\5,6'),
                       np.array([(1, 22, 3), (4, 555, 6)], dtype='u8,u8,u8'))
    # Escapes at record boundaries are simply dropped.
    assert_array_equal(parse('\\1,2,3\n4,5,6\\'),
                       np.array([(1, 2, 3), (4, 5, 6)], dtype='u8,u8,u8'))
    # Escaped delimiters do not split fields.
    assert_array_equal(parse('a,b\\,b,c\na,b\\,b\\,b,c'),
                       np.array([('a', 'b,b', 'c'), ('a', 'b,b,b', 'c')], dtype='O,O,O'))
    # Same behavior with a custom escape character.
    assert_array_equal(parse('a,bx,b,c\na,bx,bx,b,c', escape='x'),
                       np.array([('a', 'b,b', 'c'), ('a', 'b,b,b', 'c')], dtype='O,O,O'))
def test_float_conversion(self):
    """Floats parse from plain, signed, and exponent (E+/e-) notations."""
    source = StringIO('10,1.333,-1.23,10.0E+2,999.9e-2')
    reader = iopro.text_adapter(source, field_names=False, infer_types=False)
    reader.set_field_types(dict(zip(range(5), ['f8'] * 5)))
    row = reader[0][0]
    for position, expected in enumerate((10.0, 1.333, -1.23, 1000.0, 9.999)):
        self.assertAlmostEqual(row[position], expected)
def test_delimiter(self):
    """The delimiter is sniffed automatically; delimiter=None forces a single
    field per record."""
    for text in ('1,2,3\n', '1 2 3\n', '1\t2\t3\n', '1x2x3\n'):
        reader = iopro.text_adapter(StringIO(text), field_names=False)
        self.assert_equality(reader[0].item(), (1, 2, 3))

    # Single-field csv data with delimiter detection disabled.
    result = iopro.text_adapter(StringIO('aaa\nbbb\nccc'),
                                field_names=False, delimiter=None)[:]
    assert_array_equal(result, np.array([('aaa',), ('bbb',), ('ccc',)],
                                        dtype=[('f0', 'O')]))
def test_field_names(self):
    """Field-name handling: extra data columns, duplicate names, explicit lists."""
    # Data columns beyond the named ones are ignored.
    source = StringIO('f0,f1\n0,1,2\n3,4,5')
    result = iopro.text_adapter(source, 'csv', delimiter=',', field_names=True).to_array()
    self.assert_equality(result.dtype.names, ('f0', 'f1'))
    self.assert_equality(result[0].item(), (0, 1))
    self.assert_equality(result[1].item(), (3, 4))

    # Duplicate field names are disambiguated with a numeric suffix.
    source = StringIO('f0,field,field\n0,1,2\n3,4,5')
    reader = iopro.text_adapter(source, 'csv', delimiter=',',
                                field_names=True, infer_types=False)
    reader.set_field_types({0: 'u4', 1: 'u4', 2: 'u4'})
    self.assert_equality(reader.to_array().dtype.names, ('f0', 'field', 'field1'))

    # An explicit list of names is used as-is.
    source = StringIO('0,1,2\n3,4,5')
    reader = iopro.text_adapter(source, field_names=['a', 'b', 'c'], infer_types=False)
    reader.field_types = {0: 'u4', 1: 'u4', 2: 'u4'}
    result = reader[:]
    self.assertTrue(result.dtype.names == ('a', 'b', 'c'))
    assert_array_equal(result, np.array([(0, 1, 2), (3, 4, 5)],
                                        dtype=[('a', 'u4'), ('b', 'u4'), ('c', 'u4')]))
def reduce(iter, out, params):
    """Disco reduce step: download each station/year GSOD file from the NOAA
    FTP server, drop stations with incomplete or sentinel-laden temperature
    coverage, and emit the surviving station set.

    NOTE: the parameter name `iter` (shadowing the builtin) is kept because it
    is the disco reduce interface.
    """
    import ftplib
    import os
    ftp = ftplib.FTP('ftp.ncdc.noaa.gov')
    ftp.login()
    gsod_path = 'pub/data/gsod/'
    dirpath = '/tmp/weather_files_coverage/'
    if not os.path.exists(dirpath):
        os.makedirs(dirpath)
    for key, StatRange in iter:
        stations = list(StatRange[0])
        ranges = StatRange[1]
        for date in ranges:
            # BUG FIX: iterate over a snapshot — the original iterated
            # `stations` while calling stations.remove(stat) inside the loop,
            # which silently skips elements.
            for stat in list(stations):
                if stat not in stations:
                    # Already removed while processing an earlier year.
                    continue
                cache = open(dirpath + stat + '-' + str(date) + '.op.gz', 'wb')
                f = gsod_path + str(date) + '/' + stat + '-' + str(date) + '.op.gz'
                try:
                    ftp.retrbinary("RETR " + f, cache.write, 8 * 1024)
                except Exception:
                    # Connection likely dropped: reconnect once and retry.
                    ftp = ftplib.FTP('ftp.ncdc.noaa.gov')
                    ftp.login()
                    ftp.retrbinary("RETR " + f, cache.write, 8 * 1024)
                try:
                    cache.close()
                    adapter = iopro.text_adapter(cache.name, compression='gzip',
                                                 parser='csv', field_names=True)
                    avg_temp = adapter[:]['TEMP']
                except Exception:
                    # Unreadable/corrupt download: drop the station entirely.
                    stations.remove(stat)
                    continue
                if len(avg_temp) < 360:
                    # Fewer than ~a full year of daily records.
                    stations.remove(stat)
                    continue
                if 9999.9 in avg_temp:
                    # 9999.9 is the GSOD missing-temperature sentinel.
                    stations.remove(stat)
                    continue
        out.add(1, set(stations))
def test_generators(self):
    """A generator yielding CSV lines is an acceptable input source."""
    def rows(count):
        # Record n holds the five consecutive ints starting at n*5.
        for n in range(count):
            base = n * 5
            yield ','.join(str(base + offset) for offset in range(5))

    result = iopro.text_adapter(rows(self.num_records), field_names=False)[:]
    self.assert_equality(result.size, self.num_records)
    for n in range(self.num_records):
        base = n * 5
        self.assert_equality(result[n].item(),
                             tuple(base + offset for offset in range(5)))
def reduce(iter, out, params):
    """Disco reduce step: for each (date, files) group, download the GSOD
    files from NOAA FTP, accumulate all TEMP readings, and emit
    (date, (mean, std)).

    NOTE: the parameter names `iter` and loop variable `file` shadow builtins
    but are kept for interface/behavior compatibility.
    """
    import numpy as np
    import ftplib
    import os
    import iopro
    from disco.util import kvgroup
    for date, WeatherDateStat in kvgroup(iter):
        # Converted from Python 2 print statements to print() calls.
        print(date)
        ftp = ftplib.FTP('ftp.ncdc.noaa.gov')
        ftp.login()
        avg_temp = []
        path = '/tmp/weather_files/' + str(date) + '/'
        if not os.path.exists(path):
            os.makedirs(path)
        for file in WeatherDateStat:
            cache = open(path + file.split('/')[-1], 'wb')
            try:
                ftp.retrbinary("RETR " + file, cache.write, 8 * 1024)
            except Exception:
                # Connection likely dropped: reconnect once and retry.
                ftp = ftplib.FTP('ftp.ncdc.noaa.gov')
                ftp.login()
                ftp.retrbinary("RETR " + file, cache.write, 8 * 1024)
            cache.close()
            adapter = iopro.text_adapter(cache.name, compression='gzip',
                                         parser='csv', field_names=True)
            avg_temp = avg_temp + list(adapter[:]['TEMP'])
            adapter.close()
        print('Date Mean Std: ', date, np.mean(avg_temp), np.std(avg_temp))
        out.add(date, (np.mean(avg_temp), np.std(avg_temp)))
def test_gzip_index(self):
    """Random access into gzip-compressed input via an explicitly built index."""
    num_records = 1000000
    data = StringIO()
    generate_dataset(data, IntIter(), ',', num_records)

    # gzip-compress the generated CSV into an in-memory buffer.
    # (Dead `if sys.version > '3'` / `if True:` branches from the 2/3
    # transition removed — only the Python 3 path was reachable.)
    dataz = io.BytesIO()
    gzip_output = gzip.GzipFile(fileobj=dataz, mode='wb')
    gzip_output.write(data.getvalue().encode('utf8'))
    gzip_output.close()
    dataz.seek(0)

    # Test explicit index building.
    adapter = iopro.text_adapter(dataz, compression='gzip', delimiter=',',
                                 field_names=False, infer_types=False)
    adapter.set_field_types({0: 'u4', 1: 'u4', 2: 'u4', 3: 'u4', 4: 'u4'})
    adapter.create_index()
    # Spot checks at several offsets, plus 'trouble' records that have
    # caused crashes in the past (290000, 818000). Record i holds the five
    # consecutive ints starting at i*5.
    for rec in (0, 10, 100, 1000, 10000, 100000, num_records - 1,
                290000, 818000):
        self.assert_equality(adapter[rec].item(),
                             tuple((rec * 5) + x for x in range(5)))

    # Test implicitly creating disk index on the fly.
    # JNB: not implemented yet:
    # adapter = iopro.text_adapter(dataz, compression='gzip', delimiter=',',
    #     field_names=False, infer_types=False, indexing=True,
    #     index_filename='test.idx')
def test_json(self):
    """JSON parser: scalar fields, multiple records (array, newline-separated),
    slicing, source column order, and field filtering."""
    # Test json number
    data = StringIO('{"id":123}')
    adapter = iopro.text_adapter(data, parser='json')
    array = adapter[:]
    assert_array_equal(array, np.array([(123,)], dtype=[('id', 'u8')]))

    # Test json string (stored as a Python object field)
    data = StringIO('{"id":"xxx"}')
    adapter = iopro.text_adapter(data, parser='json')
    array = adapter[:]
    assert_array_equal(array, np.array([('xxx',)], dtype=[('id', 'O')]))

    # Test multiple values
    data = StringIO('{"id":123, "name":"xxx"}')
    adapter = iopro.text_adapter(data, parser='json')
    array = adapter[:]
    assert_array_equal(array, np.array([(123, 'xxx',)],
        dtype=[('id', 'u8'), ('name', 'O')]))

    # Test multiple records as a JSON array
    data = StringIO('[{"id":123, "name":"xxx"}, {"id":456, "name":"yyy"}]')
    adapter = iopro.text_adapter(data, parser='json')
    array = adapter[:]
    assert_array_equal(array, np.array([(123, 'xxx',), (456, 'yyy')],
        dtype=[('id', 'u8'), ('name', 'O')]))

    # Test multiple objects separated by newlines
    data = StringIO('{"id":123, "name":"xxx"}\n{"id":456, "name":"yyy"}')
    adapter = iopro.text_adapter(data, parser='json')
    array = adapter[:]
    assert_array_equal(array, np.array([(123, 'xxx',), (456, 'yyy')],
        dtype=[('id', 'u8'), ('name', 'O')]))

    # Trailing newline after the last object is tolerated.
    data = StringIO('{"id":123, "name":"xxx"}\n')
    adapter = iopro.text_adapter(data, parser='json')
    array = adapter[:]
    assert_array_equal(array, np.array([(123, 'xxx',)],
        dtype=[('id', 'u8'), ('name', 'O')]))

    # JNB: broken; should we really be supporting the following json inputs?
    '''
    # Test subarrays
    data = StringIO('{"id":123, "names":["xxx","yyy","zzz"]}')
    adapter = iopro.text_adapter(data, parser='json')
    array = adapter[:]
    assert_array_equal(array, np.array([(123, 'xxx', 'yyy', 'zzz',)],
        dtype=[('f0', 'u8'), ('f1', 'O'), ('f2', 'O'), ('f3', 'O')]))

    # Test subobjects
    data = StringIO('{"id":123, "names":{"a":"xxx", "b":"yyy", "c":"zzz"}}')
    adapter = iopro.text_adapter(data, parser='json')
    array = adapter[:]
    assert_array_equal(array, np.array([(123, 'xxx', 'yyy', 'zzz',)],
        dtype=[('f0', 'u8'), ('f1', 'O'), ('f2', 'O'), ('f3', 'O')]))
    '''

    # Test ranges
    data = StringIO('{"id": 1, "name": "www"}\n'
                    '{"id": 2, "name": "xxx"}\n'
                    '{"id": 3, "name": "yyy"}\n'
                    '{"id": 4, "name": "zzz"}')
    adapter = iopro.text_adapter(data, parser='json')
    array = adapter[2:4]
    assert_array_equal(array, np.array([(3, 'yyy'), (4, 'zzz')],
        dtype=[('id', 'u8'), ('name', 'O')]))

    # Test column order (fields keep source order, not sorted order)
    data = StringIO('{"xxx": 1, "aaa": 2}\n')
    adapter = iopro.text_adapter(data, parser='json')
    array = adapter[:]
    assert_array_equal(array, np.array([(1, 2)],
        dtype=[('xxx', 'u8'), ('aaa', 'u8')]))

    # Test field filter
    data = StringIO('{"id": 1, "name": "www"}\n'
                    '{"id": 2, "name": "xxx"}\n'
                    '{"id": 3, "name": "yyy"}\n'
                    '{"id": 4, "name": "zzz"}')
    adapter = iopro.text_adapter(data, parser='json')
    adapter.field_filter = ['name']
    array = adapter[:]
    assert_array_equal(array, np.array([('www',), ('xxx',), ('yyy',), ('zzz',)],
        dtype=[('name', 'O')]))
def test_fixed_width(self):
    """Fixed-width parser: per-field widths, blank/comment lines, and
    (possibly incomplete) field-name rows."""
    data = StringIO()
    generate_dataset(data, FixedWidthIter(), '', self.num_records)
    adapter = iopro.FixedWidthTextAdapter(data, [2,3,4,5,6], field_names=False, infer_types=False)
    adapter.set_field_types({0:'u4',1:'u4',2:'u4',3:'u4',4:'u4'})
    array = adapter[:]
    self.assert_equality(array.size, self.num_records)
    record = [0, 0, 0, 0, 0]
    for i in range(0, self.num_records):
        self.assert_equality(array[i].item(), tuple(record))
        record = [x+1 for x in record]
        # Each counter wraps to 0 when it would overflow its fixed field
        # width (2..6 digits respectively).
        if record[0] == 100:
            record[0] = 0
        if record[1] == 1000:
            record[1] = 0
        if record[2] == 10000:
            record[2] = 0
        if record[3] == 100000:
            record[3] = 0
        if record[4] == 1000000:
            record[4] = 0

    # Test skipping blank lines
    data = StringIO(' 1 2 3\n\n 4 5 6')
    adapter = iopro.text_adapter(data, parser='fixed_width', field_widths=[2,2,2], field_names=False)
    array = adapter[:]
    assert_array_equal(array, np.array([(1,2,3), (4,5,6)],
        dtype=[('f0','<u8'),('f1','<u8'),('f2','<u8')]))

    # Test comment lines
    data = StringIO('# 1 2 3\n 1 2 3\n# foo\n 4 5 6')
    adapter = iopro.text_adapter(data, parser='fixed_width', field_widths=[2,2,2], field_names=False)
    array = adapter[:]
    assert_array_equal(array, np.array([(1,2,3), (4,5,6)],
        dtype=[('f0','<u8'),('f1','<u8'),('f2','<u8')]))

    # Test field names line
    data = StringIO(' a b c\n 1 2 3')
    adapter = iopro.text_adapter(data, parser='fixed_width', field_widths=[2,2,2], field_names=True)
    array = adapter[:]
    assert_array_equal(array, np.array([(1,2,3)],
        dtype=[('a','<u8'),('b','<u8'),('c','<u8')]))

    # Test field names line as comment line
    data = StringIO('# a b c\n 1 2 3')
    adapter = iopro.text_adapter(data, parser='fixed_width', field_widths=[2,2,2], field_names=True)
    array = adapter[:]
    assert_array_equal(array, np.array([(1,2,3)],
        dtype=[('a','<u8'),('b','<u8'),('c','<u8')]))

    # Test incomplete field names line: missing names fall back to fN.
    data = StringIO(' a\n 1 2 3')
    adapter = iopro.text_adapter(data, parser='fixed_width', field_widths=[2,2,2], field_names=True)
    array = adapter[:]
    assert_array_equal(array,
        np.array([(1,2,3)], dtype=[('a','<u8'),('f1','<u8'),('f2','<u8')]))
def test_adapter_factory(self):
    """text_adapter builds a CSVTextAdapter for 'csv' and raises
    AdapterException for an unknown parser name."""
    source = StringIO("1,2,3")
    made = iopro.text_adapter(source, "csv", delimiter=',',
                              field_names=False, infer_types=False)
    self.assertTrue(isinstance(made, iopro.CSVTextAdapter))
    self.assertRaises(iopro.AdapterException, iopro.text_adapter, source, "foobar")
def test_64bit_ints(self):
    """int64 extremes and the uint64 maximum parse without overflow."""
    int64_max = 2 ** 63 - 1
    uint64_max = 2 ** 64 - 1
    source = StringIO('{0},{1},{2}'.format(int64_max, -int64_max, uint64_max))
    reader = iopro.text_adapter(source, delimiter=',',
                                field_names=False, infer_types=False)
    reader.set_field_types({0: 'i8', 1: 'i8', 2: 'u8'})
    self.assert_equality(reader.to_array()[0].item(),
                         (int64_max, -int64_max, uint64_max))
def test_slicing(self):
    """Indexing, slicing, stepping, field selection, and out-of-range errors."""
    data = StringIO()
    generate_dataset(data, IntIter(), ',', self.num_records)
    adapter = iopro.text_adapter(data, field_names=False)
    adapter.set_field_types({0: 'u4', 1: 'u4', 2: 'u4', 3: 'u4', 4: 'u4'})

    # Single-record access; record i holds the five ints starting at i*5.
    assert_array_equal(adapter[0], np.array([(0, 1, 2, 3, 4)], dtype='u4,u4,u4,u4,u4'))
    expected_values = [((self.num_records - 1) * 5) + x for x in range(5)]
    self.assert_equality(adapter[self.num_records - 1].item(), tuple(expected_values))
    #adapter.create_index()
    #self.assert_equality(adapter[-1].item(), tuple(expected_values))

    # Single-field access.
    self.assert_equality(adapter['f0'][0].item(), (0,))
    self.assert_equality(adapter['f4'][1].item(), (9,))
    #self.assert_equality(adapter[self.num_records-1]['f4'], (self.num_records*5)-1)

    # Full slice.
    array = adapter[:]
    record = [x for x in range(0, 5)]
    self.assert_equality(array.size, self.num_records)
    for i in range(0, self.num_records):
        self.assert_equality(array[i].item(), tuple(record))
        record = [x + 5 for x in record]

    # All but the last record.
    array = adapter[:-1]
    record = [x for x in range(0, 5)]
    self.assert_equality(array.size, self.num_records - 1)
    for i in range(0, self.num_records - 1):
        self.assert_equality(array[i].item(), tuple(record))
        record = [x + 5 for x in record]

    # Leading slice.
    array = adapter[0:10]
    self.assert_equality(array.size, 10)
    record = [x for x in range(0, 5)]
    for i in range(0, 10):
        self.assert_equality(array[i].item(), tuple(record))
        record = [x + 5 for x in record]

    # Slice skipping the first record.
    array = adapter[1:]
    self.assert_equality(array.size, self.num_records - 1)
    record = [x for x in range(5, 10)]
    for i in range(0, self.num_records - 1):
        self.assert_equality(array[i].item(), tuple(record))
        record = [x + 5 for x in record]

    # Stepped slice.
    array = adapter[0:10:2]
    self.assert_equality(array.size, 5)
    record = [x for x in range(0, 5)]
    for i in range(0, 5):
        self.assert_equality(array[i].item(), tuple(record))
        record = [x + 10 for x in record]

    # Field subset via fancy indexing.
    array = adapter[['f0', 'f4']][:]
    record = [0, 4]
    self.assert_equality(array.size, self.num_records)
    for i in range(0, self.num_records):
        self.assert_equality(array[i].item(), tuple(record))
        record = [x + 5 for x in record]

    # Field subset via the field_filter attribute (index or name).
    adapter.field_filter = [0, 'f4']
    array = adapter[:]
    record = [0, 4]
    self.assert_equality(array.size, self.num_records)
    for i in range(0, self.num_records):
        self.assert_equality(array[i].item(), tuple(record))
        record = [x + 5 for x in record]

    # Resetting the filter restores all fields.
    adapter.field_filter = None
    array = adapter[:]
    record = [0, 1, 2, 3, 4]
    self.assert_equality(array.size, self.num_records)
    for i in range(0, self.num_records):
        self.assert_equality(array[i].item(), tuple(record))
        record = [x + 5 for x in record]

    # Out-of-range accesses raise AdapterIndexError.
    try:
        adapter[self.num_records]
    except iopro.AdapterIndexError:
        pass
    else:
        # BUG FIX: message typo — was 'AdaperIndexError not thrown'.
        self.fail('AdapterIndexError not thrown')

    try:
        adapter[0:self.num_records + 1]
    except iopro.AdapterIndexError:
        pass
    else:
        self.fail('AdapterIndexError not thrown')
"""Plot the locations of GSOD stations with complete coverage on a world map."""
import iopro
import sys  # kept: present in the original script
import pandas as pd

# BUG FIX: header=False is not a valid value for pandas `header` (booleans are
# rejected); header=None declares "no header row", matching the manual column
# rename below.
df = pd.read_csv('station_complete.csv', header=None)
df.columns = ['STDIN']
stations = pd.read_csv('ish-history.csv')

adapter = iopro.text_adapter('station_complete.csv', parser='csv',
                             delimiter=' ', field_names=False)
usaf_list = list(adapter[:]['f0'])
# Set membership is O(1) vs O(n) per row against the list.
usaf_set = set(usaf_list)

# Keep only stations whose USAF id appears in the complete-coverage list.
station_clean = stations['USAF'].map(lambda x: x in usaf_set)
cleaned = stations[station_clean]
lat = cleaned['LAT']
lon = cleaned['LON']

# Plotting (exact duplicate imports from the original removed).
import matplotlib.pyplot as plt
import numpy as np
from mpl_toolkits.basemap import Basemap

m = Basemap(projection='merc', llcrnrlat=-80, urcrnrlat=80,
            llcrnrlon=-180, urcrnrlon=180, lat_ts=20, resolution='c')
m.drawcoastlines()
m.fillcontinents(color='coral', lake_color='aqua')
def test_num_records(self):
    """num_records caps how many records are read from the source."""
    lines = '\n'.join('{0},{1}'.format(2 * n, 2 * n + 1) for n in range(10))
    reader = iopro.text_adapter(StringIO(lines), field_names=False, num_records=2)
    assert_array_equal(reader[:], np.array([(0, 1), (2, 3)], dtype='u8,u8'))
# Download each 1973 station file from the (already connected) NOAA FTP
# session and keep station ids whose yearly TEMP coverage looks complete.
# NOTE(review): `path`, `ftp`, `stations`, `stationIDs` and `out` are defined
# outside this fragment — presumably inside an enclosing map/reduce function.
for stat1973, v in stationIDs:
    cache = open(path+stat1973.split('/')[-1],'wb')
    try:
        ftp.retrbinary("RETR " + stat1973, cache.write, 8*1024)
    except:
        # Connection presumably dropped: reconnect and retry once.
        ftp = ftplib.FTP('ftp.ncdc.noaa.gov')
        # 'Succesfully Connected...'
        ftp.login()
        ftp.retrbinary("RETR " + stat1973, cache.write, 8*1024)
    cache.close()
    #skip tar file
    if stat1973.endswith('.op.gz'):
        adapter = iopro.text_adapter(cache.name,compression='gzip',parser='csv', field_names=True)
        avg_temp = adapter[:]['TEMP']
        # A full year is 360-366 daily records.
        if (len(avg_temp) < 360) or (len(avg_temp) > 366):
            # print '\ttoo small or too big'
            continue
        # 9999.9 is the GSOD missing-temperature sentinel.
        if 9999.9 in avg_temp:
            continue
        else:
            stations.append(cache.name.split('/')[-1][:12]) #store station which has good coverage for the year
out.add(1,stations) #yield list of stations
def test_stepping(self):
    """Slice steps select every Nth record."""
    lines = '\n'.join('{0},{1}'.format(2 * n, 2 * n + 1) for n in range(10))
    reader = iopro.text_adapter(StringIO(lines), field_names=False)
    pairs = [(2 * n, 2 * n + 1) for n in range(10)]
    assert_array_equal(reader[::2], np.array(pairs[::2], dtype='u8,u8'))
    assert_array_equal(reader[::3], np.array(pairs[::3], dtype='u8,u8'))
def test_regex(self):
    """Regex parser: each regex group becomes one field; blank/comment lines
    and (incomplete or non-matching) field-name lines are handled."""
    data = StringIO()
    generate_dataset(data, IntIter(), ',', self.num_records)
    adapter = iopro.RegexTextAdapter(data, '([0-9]*),([0-9]*),([0-9]*),([0-9]*),([0-9]*)\n', field_names=False, infer_types=False)
    adapter.set_field_types({0: 'u4', 1: 'u4', 2: 'u4', 3: 'u4', 4: 'u4'})
    array = adapter[:]
    self.assert_equality(array.size, self.num_records)
    record = [x for x in range(0, 5)]
    for i in range(0, self.num_records):
        self.assert_equality(array[i].item(), tuple(record))
        record = [x + 5 for x in record]

    # Test skipping blank lines
    data = StringIO('1 2 3\n\n4 5 6')
    adapter = iopro.text_adapter(data, parser='regex', regex_string='([0-9]) ([0-9]) ([0-9])', field_names=False)
    array = adapter[:]
    assert_array_equal(array, np.array([(1, 2, 3), (4, 5, 6)],
        dtype=[('f0', '<u8'), ('f1', '<u8'), ('f2', '<u8')]))

    # Test comment lines
    data = StringIO('#1 2 3\n1 2 3\n# foo\n4 5 6')
    adapter = iopro.text_adapter(data, parser='regex', regex_string='([0-9]) ([0-9]) ([0-9])', field_names=False)
    array = adapter[:]
    assert_array_equal(array, np.array([(1, 2, 3), (4, 5, 6)],
        dtype=[('f0', '<u8'), ('f1', '<u8'), ('f2', '<u8')]))

    # Test field names line
    data = StringIO('a b c\n4 5 6')
    adapter = iopro.text_adapter(data, parser='regex', regex_string='([0-9]) ([0-9]) ([0-9])', field_names=True)
    array = adapter[:]
    assert_array_equal(array, np.array([(4, 5, 6)],
        dtype=[('a', '<u8'), ('b', '<u8'), ('c', '<u8')]))

    # Test field names line as comment line
    data = StringIO('#a b c\n4 5 6')
    adapter = iopro.text_adapter(data, parser='regex', regex_string='([0-9]) ([0-9]) ([0-9])', field_names=True)
    array = adapter[:]
    assert_array_equal(array, np.array([(4, 5, 6)],
        dtype=[('a', '<u8'), ('b', '<u8'), ('c', '<u8')]))

    # Test incomplete field names line: missing names fall back to fN.
    data = StringIO('a b\n4 5 6')
    adapter = iopro.text_adapter(data, parser='regex', regex_string='([0-9]) ([0-9]) ([0-9])', field_names=True)
    array = adapter[:]
    assert_array_equal(array, np.array([(4, 5, 6)],
        dtype=[('a', '<u8'), ('b', '<u8'), ('f2', '<u8')]))

    # Test field names line that doesn't match the regex.
    # BUG FIX: raw strings — '\s' in a plain string literal is an invalid
    # escape sequence (SyntaxWarning on modern Python).
    data = StringIO('a b c\n1 2 3 4 5 6')
    adapter = iopro.text_adapter(data, parser='regex', regex_string=r'([0-9\s]+) ([0-9\s]+) ([0-9\s]+)', field_names=True)
    array = adapter[:]
    assert_array_equal(array, np.array([('1 2', '3 4', '5 6')],
        dtype=[('a', 'O'), ('b', 'O'), ('c', 'O')]))
"""Load a NOAA GSOD sample into a blaze ctable and print summary statistics."""
import os

import iopro
# NOTE: blaze's `open` shadows the builtin open below — kept for compatibility.
from blaze import Table, mean, std, params, select, open
from blaze.algo.select import select2

adapter = iopro.text_adapter(
    'noaa_gsod_example.op',
    header=1,
    infer_types=False,
    field_names=['x', 'y', 'z', 'w']
)
adapter.set_field_types({0: 'u8', 1: 'u8', 2: 'u8', 3: 'f8'})


def test_simple():
    """Create (or reopen) the ctable store and print mean/std of two columns."""
    if not os.path.exists('./noaa_data'):
        p = params(clevel=5, storage='./noaa_data')
        t = Table([], dshape='{f0: int, f1:int, f2:int, f3:float}', params=p)
        # TODO: chunkwise copy
        t.append(adapter[:])
        t.commit()
    else:
        t = open('ctable://noaa_data')
    # BUG FIX: converted Python 2 print statements to print() calls.
    print('--------------------------------------')
    print('mean', mean(t, 'f3'))
    print('std', std(t, 'f2'))
    print('--------------------------------------')
"""Plot the locations of GSOD stations with complete coverage on a world map.
(Duplicate of the sibling script; same fixes applied.)"""
import iopro
import sys  # kept: present in the original script
import pandas as pd

# BUG FIX: header=False is not a valid value for pandas `header` (booleans are
# rejected); header=None declares "no header row", matching the manual column
# rename below.
df = pd.read_csv('station_complete.csv', header=None)
df.columns = ['STDIN']
stations = pd.read_csv('ish-history.csv')

adapter = iopro.text_adapter('station_complete.csv', parser='csv',
                             delimiter=' ', field_names=False)
usaf_list = list(adapter[:]['f0'])
# Set membership is O(1) vs O(n) per row against the list.
usaf_set = set(usaf_list)

# Keep only stations whose USAF id appears in the complete-coverage list.
station_clean = stations['USAF'].map(lambda x: x in usaf_set)
cleaned = stations[station_clean]
lat = cleaned['LAT']
lon = cleaned['LON']

# Plotting (exact duplicate imports from the original removed).
import matplotlib.pyplot as plt
import numpy as np
from mpl_toolkits.basemap import Basemap

m = Basemap(projection='merc', llcrnrlat=-80, urcrnrlat=80,
            llcrnrlon=-180, urcrnrlon=180, lat_ts=20, resolution='c')
m.drawcoastlines()
m.fillcontinents(color='coral', lake_color='aqua')