def infer_types(f, sample_size=100):
    """
    Infer a normal type for each column of a CSV file.

    Reads the header row from ``f``, inspects up to ``sample_size`` data
    rows, and returns pairs of (header, type name).
    """
    rows = CSVKitReader(f)
    headers = rows.next()
    # Only a bounded prefix of the file is needed for inference.
    types, _values = normalize_table(islice(rows, sample_size))
    names = [t.__name__ for t in types]
    return zip(headers, names)
def test_normalize_table_known_types_invalid(self):
    """
    normalize_table() with accumulate_errors=True must collect every value
    that cannot be coerced to its declared normal type, keyed by column.
    """
    normal_types = [bool, int, int, NoneType]
    # Column 0 can never be bool and column 2 holds floats invalid as ints;
    # the second row is deliberately one field short.
    data = [
        [u'a', u'1', u'2.1', u''],
        [u'b', u'5', u'4.1'],
        [u'c', u'100', u'100.9999', u''],
        [u'd', u'2', u'5.3', u''],
    ]

    try:
        typeinference.normalize_table(data, normal_types, accumulate_errors=True)
        # Fail with an explicit message instead of the opaque
        # assertEqual(True, False) when no exception is raised.
        self.fail('InvalidValueForTypeListException not raised')
    except InvalidValueForTypeListException as e:
        self.assertEqual(len(e.errors), 2)
        self.assertEqual(e.errors[0].index, 0)
        self.assertEqual(e.errors[0].value, 'a')
        self.assertEqual(e.errors[0].normal_type, bool)
        self.assertEqual(e.errors[2].index, 0)
        self.assertEqual(e.errors[2].value, '2.1')
        self.assertEqual(e.errors[2].normal_type, int)
def test_normalize_table_known_types_invalid(self):
    """
    normalize_table() with accumulate_errors=True must collect every value
    that cannot be coerced to its declared normal type, keyed by column.
    """
    normal_types = [bool, int, int, NoneType]
    data = [
        [u'a', u'1', u'2.1', u''],
        [u'b', u'5', u'4.1'],
        [u'c', u'100', u'100.9999', u''],
        [u'd', u'2', u'5.3', u'']
    ]
    try:
        typeinference.normalize_table(data, normal_types, accumulate_errors=True)
        self.assertEqual(True, False)
    # "except X as e" instead of the deprecated "except X, e" — valid since
    # Python 2.6 and consistent with the sibling test in this file.
    except InvalidValueForTypeListException as e:
        self.assertEqual(len(e.errors), 2)
        self.assertEqual(e.errors[0].index, 0)
        self.assertEqual(e.errors[0].value, 'a')
        self.assertEqual(e.errors[0].normal_type, bool)
        self.assertEqual(e.errors[2].index, 0)
        self.assertEqual(e.errors[2].value, '2.1')
        self.assertEqual(e.errors[2].normal_type, int)
def infer_schema(f, sample_size=100):
    """
    Build a column-schema description for a CSV file.

    Reads the header row from ``f``, infers a normal type for each column
    from up to ``sample_size`` sample rows, and returns one dict per column.
    """
    reader = CSVKitReader(f)
    headers = reader.next()
    normal_types, normal_values = normalize_table(islice(reader, sample_size))

    schema = []
    for header, normal_type in zip(headers, normal_types):
        schema.append({
            'column': header,
            'simple_type': normal_type.__name__,
            'meta_type': None,
            'indexed': False,
        })
    return schema
def test_normalize_table_known_types(self):
    """
    Each column normalizes to the caller-supplied normal type, and every
    value is coerced accordingly (NoneType columns become all None).
    """
    normal_types = [six.text_type, int, float, NoneType]
    data = [
        [u'a', u'1', u'2.1', u''],
        [u'b', u'5', u'4.1'],
        [u'c', u'100', u'100.9999', u''],
        [u'd', u'2', u'5.3', u''],
    ]

    types, columns = typeinference.normalize_table(data, normal_types)

    self.assertEqual(4, len(types))
    self.assertEqual(4, len(columns))

    for i, (column, inferred, expected) in enumerate(zip(columns, types, normal_types)):
        self.assertEqual(expected, inferred)
        for row, normalized in zip(data, column):
            if inferred is NoneType:
                self.assertTrue(normalized is None)
            else:
                self.assertEqual(inferred(row[i]), normalized)
def test_normalize_table(self):
    """
    normalize_table() infers unicode/int/float columns and None for a
    column that is entirely blank.
    """
    expected_types = [unicode, int, float, None]
    data = [
        ['a', '1', '2.1', ''],
        ['b', '5', '4.1', ''],
        ['c', '100', '100.9999', ''],
        ['d', '2', '5.3', ''],
    ]
    column_count = len(expected_types)

    types, columns = typeinference.normalize_table(data, column_count)

    self.assertEqual(column_count, len(types))
    self.assertEqual(column_count, len(columns))

    for i, (column, inferred, expected) in enumerate(zip(columns, types, expected_types)):
        self.assertEqual(expected, inferred)
        for row, normalized in zip(data, column):
            if inferred is None:
                # Blank columns normalize to None and came from empty strings.
                self.assertTrue(normalized is None)
                self.assertEqual('', row[i])
            else:
                self.assertEqual(inferred(row[i]), normalized)
def test_normalize_table_known_types(self):
    """
    Supplying normal_types pins each column's type; values are coerced to
    that type and NoneType columns normalize to all None.
    """
    normal_types = [unicode, int, float, NoneType]
    data = [
        [u'a', u'1', u'2.1', u''],
        [u'b', u'5', u'4.1'],
        [u'c', u'100', u'100.9999', u''],
        [u'd', u'2', u'5.3', u''],
    ]

    types, columns = typeinference.normalize_table(data, normal_types)

    self.assertEqual(4, len(types))
    self.assertEqual(4, len(columns))

    for i, triple in enumerate(zip(columns, types, normal_types)):
        column, actual_type, expected_type = triple
        self.assertEqual(expected_type, actual_type)
        for row, normalized in zip(data, column):
            if actual_type is NoneType:
                self.assertTrue(normalized is None)
            else:
                self.assertEqual(actual_type(row[i]), normalized)
def guess_column_types(path, dialect, sample_size, encoding='utf-8'):
    """
    Guess column types based on a sample of data.
    """
    with open(path, 'r') as f:
        reader = CSVKitReader(f, encoding=encoding, **dialect)
        headers = reader.next()
        normal_types, normal_values = normalize_table(islice(reader, sample_size))

        type_names = []
        for t in normal_types:
            # csvkit recognizes dates and times separately, but we lump them together
            if t in (datetime.date, datetime.time):
                type_names.append('datetime')
            else:
                type_names.append(t.__name__)
        return type_names
def test_normalize_table(self):
    """
    normalize_table() infers unicode/int/float columns; an all-blank
    column yields type None and None values.
    """
    expected_types = [unicode, int, float, None]
    data = [
        ['a', '1', '2.1', ''],
        ['b', '5', '4.1', ''],
        ['c', '100', '100.9999', ''],
        ['d', '2', '5.3', ''],
    ]
    column_count = len(expected_types)

    types, columns = typeinference.normalize_table(data, column_count)

    self.assertEqual(column_count, len(types))
    self.assertEqual(column_count, len(columns))

    for i, triple in enumerate(zip(columns, types, expected_types)):
        column, actual_type, expected_type = triple
        self.assertEqual(expected_type, actual_type)
        for row, normalized in zip(data, column):
            if actual_type is None:
                self.assertTrue(normalized is None)
                self.assertEqual('', row[i])
            else:
                self.assertEqual(actual_type(row[i]), normalized)
def guess_column_types(path, dialect, sample_size, encoding='utf-8'):
    """
    Guess column types based on a sample of data.
    """
    with open(path, 'r') as f:
        reader = CSVKitReader(f, encoding=encoding, **dialect)
        headers = reader.next()
        normal_types, normal_values = normalize_table(islice(reader, sample_size))

        type_names = [None if t is NoneType else t.__name__ for t in normal_types]

        # If a final column had no values csvkit will have dropped it
        type_names.extend([None] * (len(headers) - len(type_names)))

        return type_names
def handle(self, *args, **kwargs): solr = sunburnt.SolrInterface("http://localhost:8983/solr/") reader = csvkit.CSVKitReader(open('data/Building_Permits.csv', 'r')) headers = reader.next() first_hundred = islice(reader, 200) normal_types, normal_values = normalize_table(first_hundred, len(headers)) solr_fields = [] for h, t in zip(headers, normal_types): if t == NoneType: solr_fields.append(None) else: solr_fields.append('%s_%s' % (h, t.__name__)) # Reset reader reader = csvkit.CSVKitReader(open('data/Building_Permits.csv', 'r')) reader.next() buffered = [] normal_type_exceptions = [] # TEMP reader = islice(reader, 1000) for i, row in enumerate(reader, start=1): data = {} for t, header, field, value in izip(normal_types, headers, solr_fields, row): try: value = normalize_column_type([value], normal_type=t)[1][0] except InvalidValueForTypeException: # Convert exception to row-specific error normal_type_exceptions.append(InferredNormalFalsifiedException(i, header, value, t)) continue # No reason to send null fields to Solr (also sunburnt doesn't like them) if value == None: continue if t in [unicode, bool, int, float]: if value == None: continue data[field] = value elif t == datetime: data[field] = value.isoformat() elif t == date: pass elif t == time: pass else: # Note: if NoneType should never fall through to here raise TypeError('Unexpected normal type: %s' % t.__name__) # If we've had a normal type exception, don't bother do the rest of this if not normal_type_exceptions: data['id'] = str(i) data['dataset_id'] = DATASET_ID data['full_text'] = '\n'.join(row) buffered.append(data) if i % 100 == 0: solr.add(buffered) buffered = [] if not normal_type_exceptions: solr.commit() else: # Rollback pending changes solr.delete(queries=solr.query(dataset_id=DATASET_ID)) for e in normal_type_exceptions: print e