def test_iter_invalid_extra_cols_handled():
    # Test a schema-invalid extra column in one row
    source = [
        ['key', 'value'],
        ['one', 1, 'unexpected'],
        ['two', 2],
    ]
    table = Table(source, schema=SCHEMA_MIN)
    errors = []

    def handler(exc, row_number, row_data, error_data):
        errors.append((exc, row_number, row_data, error_data))

    actual = list(table.iter(exc_handler=handler))
    expect = [
        ['one', 1],
        ['two', 2],
    ]
    assert actual == expect
    assert len(errors) == 1
    expect_row_data = OrderedDict([
        ('key', 'one'), ('value', 1),
        ('tableschema-cast-error-extra-col-3', 'unexpected')])
    _check_error(
        errors[0], expect_exc_class=exceptions.CastError,
        expect_exc_str='Row length', expect_row_number=2,
        expect_row_data=expect_row_data, expect_error_data=expect_row_data)
def test_iter_with_headers_field_names_mismatch_stream_closed():
    table = Table('data/data_headers_field_names_mismatch.csv',
                  schema=SCHEMA_CSV)
    with pytest.raises(exceptions.CastError) as excinfo:
        for _ in table.iter():
            pass
    assert table._Table__stream.closed
def test_iter_unique_primary_key_violation_handled():
    # Test exception handler option to switch off fail-fast data validation
    # behaviour
    schema = deepcopy(SCHEMA_CSV)
    schema['primaryKey'] = 'id'
    source = [
        ['id', 'age', 'name'],
        [1, 39, 'Paul'],
        [1, 36, 'Jane'],
    ]
    table = Table(source, schema=schema)
    errors = []

    def handler(exc, row_number, row_data, error_data):
        errors.append((exc, row_number, row_data, error_data))

    expect = [
        [1, 39, 'Paul'],
        [1, 36, 'Jane'],
    ]
    actual = list(table.iter(exc_handler=handler))
    assert actual == expect
    assert len(errors) == 1
    exc, row_number, row_data, error_data = errors[0]
    assert isinstance(exc, exceptions.UniqueKeyError)
    assert row_number == 3  # actual row number including header line
    assert row_data == OrderedDict([('id', 1), ('age', 36), ('name', 'Jane')])
    assert error_data == OrderedDict([('id', 1)])
    assert 'duplicates' in str(exc)
def test_iter_missing_cols_handled():
    source = [
        ['key', 'value'],
        ['one'],
        ['two', 2],
    ]
    table = Table(source, schema=SCHEMA_MIN)
    errors = []

    def handler(exc, row_number, row_data, error_data):
        errors.append((exc, row_number, row_data, error_data))

    actual = list(table.iter(exc_handler=handler))
    expect = [
        ['one', None],
        ['two', 2],
    ]
    assert actual == expect
    expect_row_data = OrderedDict([('key', 'one'), ('value', None)])
    _check_error(
        errors[0], expect_exc_class=exceptions.CastError,
        expect_exc_str='Row length', expect_row_number=2,
        expect_row_data=expect_row_data, expect_error_data=expect_row_data)
def test_iter_with_headers_field_names_mismatch_handled():
    source = [
        ['id', 'bad', 'name'],
        [1, 39, 'Paul'],
        [2, 42, 'Peter'],
    ]
    table = Table(source, schema=SCHEMA_CSV)
    errors = []

    def handler(exc, row_number, row_data, error_data):
        errors.append((exc, row_number, row_data, error_data))

    expect = []
    actual = list(table.iter(exc_handler=handler))
    assert actual == expect
    assert len(errors) == 2
    for i, error in enumerate(errors):
        expect_keyed_row_data = OrderedDict(zip(source[0], source[i + 1]))
        exc, row_number, row_data, error_data = error
        assert isinstance(exc, exceptions.CastError)
        assert row_number == i + 2  # actual row number including header line
        assert row_data == expect_keyed_row_data
        assert error_data == expect_keyed_row_data
        assert 'match schema field names' in str(exc)
def test_iter_single_field_foreign_key_invalid_handled():
    relations = deepcopy(FK_RELATIONS)
    relations['people'][2]['firstname'] = 'Max'
    table = Table(FK_SOURCE, schema=FK_SCHEMA)
    errors = []

    def handler(exc, row_number, row_data, error_data):
        errors.append((exc, row_number, row_data, error_data))

    expect = [
        ['1', {'firstname': 'Alex', 'surname': 'Martin'}, 'Martin'],
        ['2', {'firstname': 'John', 'surname': 'Dockins'}, 'Dockins'],
        ['3', {}, 'White'],
    ]
    actual = list(table.iter(relations=relations, exc_handler=handler))
    assert actual == expect
    assert len(errors) == 1
    exc, row_number, row_data, error_data = errors[0]
    assert row_number == 4
    expect_keyed_row_data = OrderedDict(zip(FK_SOURCE[0], FK_SOURCE[3]))
    assert row_data == expect_keyed_row_data
    assert error_data == OrderedDict([('name', 'Walter')])
    assert isinstance(exc, exceptions.UnresolvedFKError)
    assert 'Foreign key' in str(exc)
def test_iter_invalid_col_value_handled():
    # Test a schema-invalid column value in one row, handled
    source = [
        ['key', 'value'],
        ['one', 'not_an_int'],
        ['two', 2],
    ]
    table = Table(source, schema=SCHEMA_MIN)
    errors = []

    def handler(exc, row_number, row_data, error_data):
        errors.append((exc, row_number, row_data, error_data))

    actual = list(table.iter(exc_handler=handler))
    expect = [
        ['one', 'not_an_int'],
        ['two', 2],
    ]
    assert actual == expect
    assert isinstance(actual[0][1], FailedCast)
    assert len(errors) == 1
    expect_row_data = OrderedDict([('key', 'one'), ('value', 'not_an_int')])
    expect_error_data = OrderedDict([('value', 'not_an_int')])
    _check_error(
        errors[0], expect_exc_class=exceptions.CastError,
        expect_exc_str='There are 1 cast errors', expect_row_number=2,
        expect_row_data=expect_row_data, expect_error_data=expect_error_data)
def test_iter_invalid_extra_cols_stream_closed():
    table = Table('data/data_invalid_extra_cols.csv', schema=SCHEMA_MIN)
    with pytest.raises(exceptions.CastError) as excinfo:
        for _ in table.iter():
            pass
    # Circumvent name mangling to get at (overly private ;-))
    # __stream attribute
    assert table._Table__stream.closed
def test_iter_unique_primary_key_violation_stream_closed():
    schema = deepcopy(SCHEMA_CSV)
    schema['primaryKey'] = 'id'
    table = Table('data/data_unique_primary_key_violation.csv', schema=schema)
    with pytest.raises(exceptions.TableSchemaException) as excinfo:
        for _ in table.iter():
            pass
    assert table._Table__stream.closed
def test_iter_single_field_foreign_key_invalid():
    relations = deepcopy(FK_RELATIONS)
    relations['people'][2]['firstname'] = 'Max'
    table = Table(FK_SOURCE, schema=FK_SCHEMA)
    with pytest.raises(exceptions.RelationError) as excinfo:
        for _ in table.iter(relations=relations):
            pass
    assert isinstance(excinfo.value, exceptions.UnresolvedFKError)
    assert 'Foreign key' in str(excinfo.value)
def test_iter_with_headers_field_names_mismatch():
    source = [
        ['id', 'bad', 'name'],
        [1, 39, 'Paul'],
    ]
    table = Table(source, schema=SCHEMA_CSV)
    with pytest.raises(exceptions.CastError) as excinfo:
        for _ in table.iter():
            pass
    assert 'match schema field names' in str(excinfo.value)
def test_iter_invalid_extra_cols():
    source = [
        ['key', 'value'],
        ['one', 1, 'unexpected'],
        ['two', 2],
    ]
    table = Table(source, schema=SCHEMA_MIN)
    with pytest.raises(exceptions.CastError) as excinfo:
        for _ in table.iter():
            pass
    assert 'Row length' in str(excinfo.value)
def reindex(self):
    file_path, params = self.get_file_info()
    t = Table(file_path, ignore_blank_headers=True, **params)
    schema = t.infer()
    data = t.iter(keyed=True)
    self.storage.create(self.index_name, schema,
                        reindex=True, always_recreate=True)
    # Write once and drain the result; the keyed-row iterator can only be
    # consumed a single time.
    for res in self.storage.write(self.index_name, data):
        pass
def test_iter_unique_primary_key_violation():
    schema = deepcopy(SCHEMA_CSV)
    schema['primaryKey'] = 'id'
    source = [
        ['id', 'age', 'name'],
        [1, 39, 'Paul'],
        [1, 36, 'Jane'],
    ]
    table = Table(source, schema=schema)
    with pytest.raises(exceptions.TableSchemaException) as excinfo:
        for _ in table.iter():
            pass
    assert isinstance(excinfo.value, exceptions.UniqueKeyError)
    assert 'duplicates' in str(excinfo.value)
def test_iter_invalid_col_value_no_cast():
    # Test a schema-invalid column value in one row, without value-casting
    source = [
        ['key', 'value'],
        ['one', 'not_an_int'],
        ['two', 2],
    ]
    expect = [
        ['one', 'not_an_int'],
        ['two', 2],
    ]
    table = Table(source, schema=SCHEMA_MIN)
    actual = list(table.iter(cast=False))
    # no actual casting, no cast errors
    assert actual == expect
def test_iter_invalid_col_value():
    # Test a schema-invalid column value in one row
    source = [
        ['key', 'value'],
        ['one', 'not_an_int'],
        ['two', 2],
    ]
    table = Table(source, schema=SCHEMA_MIN)
    with pytest.raises(exceptions.CastError) as excinfo:
        for _ in table.iter():
            pass
    assert 'There are 1 cast errors' in str(excinfo.value)
    error = excinfo.value.errors[0]
    assert isinstance(error, exceptions.CastError)
    assert ('Field "value" can\'t cast value "not_an_int" for type "integer"'
            in str(error))
def load_data_from_local_csv(csv_file=ASSET_DATA_FILE):
    table = Table(csv_file, schema=SCHEMA_FILE)
    try:
        valid = validate(table.schema.descriptor)
        if valid:
            for keyed_row in table.iter(keyed=True):
                yield keyed_row
    except exceptions.ValidationError as exception:
        for error in exception.errors:
            print(error)
    except exceptions.CastError as exception:
        if not exception.errors:
            print(exception)
        for error in exception.errors:
            write_skipped_assets(error, [])
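# A minimal usage sketch for the generator above, assuming ASSET_DATA_FILE and
# SCHEMA_FILE point at an existing CSV/schema pair; keyed rows (dicts) are
# yielded only when the schema descriptor validates.
for asset_row in load_data_from_local_csv():
    print(asset_row)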
def test_iter_invalid_col_value_handled_no_cast():
    # Test a schema-invalid column value in one row, without value-casting
    source = [
        ['key', 'value'],
        ['one', 'not_an_int'],
        ['two', 2],
    ]
    expect = [
        ['one', 'not_an_int'],
        ['two', 2],
    ]
    table = Table(source, schema=SCHEMA_MIN)
    errors = []

    def handler(exc, row_number, row_data, error_data):
        errors.append((exc, row_number, row_data, error_data))

    actual = list(table.iter(cast=False, exc_handler=handler))
    # no actual casting, no cast errors
    assert len(errors) == 0
    assert actual == expect
class CsvReader(DataReader):
    '''This is a wrapper class for the tableschema library.'''

    def __init__(self, filePath):
        self._filePath = filePath
        self._table = Table(filePath)

    def close(self):
        pass

    def getColumns(self):
        if not self._table.headers:
            self._table.infer(settings.SCHEMA_INFER_LIMIT)
        # clean the headers
        result = []
        for header in self._table.headers:
            tmpheader = header.lower()
            tmpheader = tmpheader.replace(' ', '_').replace('-', '_')
            r = re.search(r'\w+', tmpheader)
            if r:
                result.append(r.group())
            else:
                raise InvalidCsvHeaderException(
                    '%s is not a valid header' % header)
        return result

    def requery(self):
        self._table = Table(self._filePath)

    def getRow(self):
        i = self._table.iter(cast=True)
        return next(i)

    def getRowsList(self):
        self._table.infer()
        self._table.schema.descriptor[
            'missingValues'] = settings.SCHEMA_CSV_MISSING_VALUES
        self._table.schema.commit()
        i = self._table.iter(cast=True)
        return list(map(tuple, i))

    def getSchema(self):
        '''Get the data schema inferred from records.

        The number of records is defined by SCHEMA_INFER_LIMIT and the
        confidence threshold by SCHEMA_INFER_CONFIDENCE.
        '''
        t = Table(self._filePath)
        t.infer()
        t.schema.descriptor[
            'missingValues'] = settings.SCHEMA_CSV_MISSING_VALUES
        t.schema.commit()
        return t.infer(
            settings.SCHEMA_INFER_LIMIT,
            confidence=settings.SCHEMA_INFER_CONFIDENCE)

    def __repr__(self):
        return settings.SOURCE_TYPE_CSV_PREFIX + self._filePath
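# A minimal usage sketch for the wrapper above, assuming a `settings` module
# providing the SCHEMA_INFER_* constants; the 'data.csv' path is hypothetical.
reader = CsvReader('data.csv')
print(reader.getColumns())   # normalized, lower-cased header names
print(reader.getRow())       # first row, cast against the inferred schema
print(reader.getSchema())    # inferred schema descriptor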
from tableschema import Table

# Data from WEB, schema from MEMORY
SOURCE = 'https://raw.githubusercontent.com/frictionlessdata/tableschema-py/master/data/data_infer.csv'
SCHEMA = {
    'fields': [
        {'name': 'id', 'type': 'integer'},
        {'name': 'age', 'type': 'integer'},
        {'name': 'name', 'type': 'string'},
    ]
}

# If schema is not passed it will be inferred
table = Table(SOURCE, schema=SCHEMA)
rows = table.iter()
while True:
    try:
        print(next(rows))
    except StopIteration:
        break
    except Exception as exception:
        print(exception)
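# A minimal sketch of the inferred-schema variant mentioned in the comment
# above, assuming the same SOURCE URL: construct the Table without a schema,
# call infer(), then iterate.
inferred_table = Table(SOURCE)
inferred_table.infer()
print(inferred_table.schema.descriptor)
for row in inferred_table.iter():
    print(row)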
from tableschema import Table

fileCSV = r'D:\dct\enem-microdados\DADOS_ENEM_2009.csv'
fileJSON = r'D:\dct\enem-microdados\DADOS_ENEM_2009.json'

# Create table
table = Table(fileCSV, schema=fileJSON)

# Print schema descriptor
print(table.schema.descriptor)
print("\n")

# Print cast rows in a dict form
for keyed_row in table.iter(keyed=True):
    print(keyed_row)
    print("\n")
def loadPeptides(self, datapackage, datasetId, row_start=0, row_stop=None):
    """Load Peptide Data"""
    # Get the Ontology Version
    ontology_version = oceanproteinportal.datapackage.getDatapackageOntologyVersion(datapackage)

    peptideResource = oceanproteinportal.datapackage.findResource(
        datapackage=datapackage, resource_type='peptide')
    if peptideResource is None:
        return
    datasetCruises = datapackageCruises(datapackage)

    table = Table(
        peptideResource.descriptor['path'],
        schema=peptideResource.descriptor['schema']
    )
    if (0 < row_start):
        logging.info("Skipping rows until # %s" % (row_start))
    row_count = 0
    data = None
    PEPTIDE_FIELDS = getOntologyMappingFields(
        type='peptide', ontology_version=ontology_version)
    for keyed_row in table.iter(keyed=True):
        row_count += 1
        if row_count < row_start:
            logging.debug("Skipping Row # %s" % (row_count))
            continue
        if row_stop is not None and row_count > row_stop:
            logging.info("Stopping at Row# %s" % (row_count))
            break
        logging.debug("Reading Row# %s" % (row_count))

        data = readKeyedTableRow(keyed_row=keyed_row,
                                 elastic_mappings=PEPTIDE_FIELDS)
        primaryKey = (datasetId + data.get('sampleName') +
                      data.get('proteinId') + data.get('peptideSequence'))
        data['guid'] = generateGuid(
            datapackage.descriptor['name'] + '_peptide_' + primaryKey)

        filterSize = {}
        minimumFilterSize = data.get('filterSize:minimum', None)
        maximumFilterSize = data.get('filterSize:maximum', None)
        filterSizeLabel = ''
        if minimumFilterSize is not None:
            del data['filterSize:minimum']
            filterSize['minimum'] = minimumFilterSize
            filterSizeLabel += str(minimumFilterSize)
        if maximumFilterSize is not None:
            del data['filterSize:maximum']
            filterSize['maximum'] = maximumFilterSize
            if filterSizeLabel != '':
                filterSizeLabel += ' - ' + str(maximumFilterSize)
            else:
                filterSizeLabel += str(maximumFilterSize)
        filterSize['label'] = filterSizeLabel
        data['filterSize'] = filterSize

        if ('coordinate:lat' in data and 'coordinate:lon' in data):
            data['coordinate'] = {
                'lat': data['coordinate:lat'],
                'lon': data['coordinate:lon']
            }
            del data['coordinate:lat']
            del data['coordinate:lon']

        # load in ES
        res = self.load(data=data, type='peptide', id=data['guid'])
        logging.info(res['result'])
def loadProteins(self, datapackage, datasetId, row_start=0, row_stop=None):
    """Load Protein Data

    Tabular data, so proteins may be repeated for different samples,
    stations, depths, etc.
    1) Build proteinId first, then look up whether it exists in the store
    2) If it does not exist, build a new document. Else, update the spectral
       counts of the existing doc
    """
    es = self.getStore()
    index = self.getIndex()

    # Get the Ontology Version
    ontology_version = oceanproteinportal.datapackage.getDatapackageOntologyVersion(datapackage)

    proteinResource = oceanproteinportal.datapackage.findResource(
        datapackage=datapackage, resource_type='protein')
    if proteinResource is None:
        return
    datasetCruises = oceanproteinportal.datapackage.datapackageCruises(datapackage)

    table = Table(proteinResource.descriptor['path'],
                  schema=proteinResource.descriptor['schema'])
    if (0 < row_start):
        logging.info("Skipping rows until # %s" % (row_start))
    row_count = 0
    proteinId = None
    data = None
    PROTEIN_FIELDS = getOntologyMappingFields(
        type='protein', ontology_version=ontology_version)
    try:
        for keyed_row in table.iter(keyed=True):
            row_count += 1
            if row_count < row_start:
                logging.debug("Skipping Row # %s" % (row_count))
                continue
            if row_stop is not None and row_count > row_stop:
                logging.info("Stopping at Row# %s" % (row_count))
                break
            logging.debug("Reading Row# %s" % (row_count))

            row = readKeyedTableRow(keyed_row=keyed_row,
                                    elastic_mappings=PROTEIN_FIELDS)

            # Get the unique identifier for this protein
            proteinId = row['proteinId']
            protein_guid = generateGuid(
                datapackage.descriptor['name'] + '_protein_' +
                datasetId + ':' + proteinId)
            try:
                res = es.get(index=index, doc_type='protein', id=protein_guid)
                # Reuse existing protein document
                data = res['_source']
            except elasticsearch.exceptions.NotFoundError as exc:
                # Build a new ES Protein document
                data = {
                    '_dataset': datasetId,
                    'guid': protein_guid,
                    'proteinId': proteinId,
                    'spectralCount': []
                }
                if row['productName'] is not None:
                    data['productName'] = row['productName']
                if row['molecularWeight'] is not None:
                    data['molecularWeight'] = row['molecularWeight']
                if row['enzymeCommId'] is not None:
                    data['enzymeCommId'] = row['enzymeCommId']
                if row['uniprotId'] is not None:
                    data['uniprotId'] = row['uniprotId']
                if row['otherIdentifiedProteins'] is not None:
                    data['otherIdentifiedProteins'] = row['otherIdentifiedProteins']

                # NCBI
                ncbiTaxon = None
                if 'ncbi:id' in row:
                    ncbiTaxon = {'id': row['ncbi:id'], 'name': None}
                    if 'ncbi:name' in row:
                        ncbiTaxon['name'] = row['ncbi:name']
                if ncbiTaxon is not None:
                    data['ncbiTaxon'] = ncbiTaxon

                # Kegg
                kegg_pathway = None
                pathway = row.get('kegg:path', None)
                if pathway is not None:
                    kegg_pathway = []
                    for idx, path in enumerate(pathway):
                        kegg_pathway.append({'value': path, 'index': idx})
                data['kegg'] = {
                    'id': row.get('kegg:id', None),
                    'description': row.get('kegg:desc', None),
                    'pathway': kegg_pathway
                }

                # PFams
                if 'pfams:id' in row:
                    data['pfams'] = {
                        'id': data.get('pfams:id', None),
                        'name': data.get('pfams:name', None)
                    }
                # END of initial protein data setup

            # Handle all the unique row data for a certain protein

            # FilterSize
            filterSize = {}
            minimumFilterSize = row.get('filterSize:minimum', None)
            maximumFilterSize = row.get('filterSize:maximum', None)
            filterSizeLabel = ''
            if minimumFilterSize is not None:
                filterSize['minimum'] = minimumFilterSize
                filterSizeLabel += str(minimumFilterSize)
            if maximumFilterSize is not None:
                filterSize['maximum'] = maximumFilterSize
                if filterSizeLabel != '':
                    filterSizeLabel += ' - ' + str(maximumFilterSize)
                else:
                    filterSizeLabel += str(maximumFilterSize)
            if filterSizeLabel != '':
                filterSize['label'] = filterSizeLabel
            data['filterSize'] = filterSize

            # Cruise
            cruise = {
                'value': row.get('spectralCount:cruise', None),
            }
            if cruise['value'] in datasetCruises:
                cruise['uri'] = datasetCruises[cruise['value']]['uri']
            # To-do
            # 1. Lookup the cruise URI in the datapackage

            # Spectral Counts
            # fix ISO DateTime
            observationDateTime = None
            if 'spectralCount:dateTime' in row and row['spectralCount:dateTime'] is not None:
                observationDateTime = dateutil.parser.parse(row['spectralCount:dateTime'])
                observationDateTime = observationDateTime.strftime(SPECTRAL_COUNT_DATE_TIME_FORMAT)
            elif 'spectralCount:date' in row and row['spectralCount:date'] is not None:
                time = row.get('spectralCount:time', None)
                if (time is None):
                    time = '00:00:00'
                observationDateTime = dateutil.parser.parse(row['spectralCount:date'] + 'T' + time)
                observationDateTime = observationDateTime.strftime(SPECTRAL_COUNT_DATE_TIME_FORMAT)

            spectralCount = {
                'sampleId': row.get('spectralCount:sampleId', None),
                'count': row.get('spectralCount:count', None),
                'cruise': cruise,
                'station': row.get('spectralCount:station', None),
                'depth': row.get('spectralCount:depth', None),
                'dateTime': observationDateTime,
            }
            if (spectralCount['depth'] is not None):
                if 'min' not in dataset_depth_stats:
                    dataset_depth_stats['min'] = spectralCount['depth']
                    dataset_depth_stats['max'] = spectralCount['depth']
                else:
                    if spectralCount['depth'] < dataset_depth_stats['min']:
                        dataset_depth_stats['min'] = spectralCount['depth']
                    if spectralCount['depth'] > dataset_depth_stats['max']:
                        dataset_depth_stats['max'] = spectralCount['depth']
            if (row['spectralCount:coordinate:lat'] is not None and
                    row['spectralCount:coordinate:lon'] is not None):
                spectralCount['coordinate'] = {
                    'lat': row['spectralCount:coordinate:lat'],
                    'lon': row['spectralCount:coordinate:lon']
                }
            data['spectralCount'].append(spectralCount)

            res = self.load(data=data, type='protein', id=data['guid'])
            logging.info(res['result'])
        # end of for loop of protein rows
    except Exception as e:
        logging.exception("Error with row[%s]: %s" % (row_count, keyed_row))
        raise e
def test_iter():
    table = Table(DATA_MIN, schema=SCHEMA_MIN)
    expect = [['one', 1], ['two', 2]]
    actual = list(table.iter())
    assert actual == expect
def test_iter_web_csv():
    table = Table(BASE_URL % 'data/data_infer.csv', schema=SCHEMA_CSV)
    expect = [[1, 39, 'Paul'], [2, 23, 'Jimmy'], [3, 36, 'Jane'], [4, 28, 'Judy']]
    actual = list(table.iter())
    assert actual == expect
def test_iter_missing_cols_stream_closed():
    table = Table('data/data_missing_cols.csv', schema=SCHEMA_MIN)
    with pytest.raises(exceptions.CastError) as excinfo:
        for _ in table.iter():
            pass
    assert table._Table__stream.closed
def test_iter_keyed():
    table = Table(DATA_MIN, schema=SCHEMA_MIN)
    expect = [{'key': 'one', 'value': 1}, {'key': 'two', 'value': 2}]
    actual = list(table.iter(keyed=True))
    assert actual == expect
from tableschema import Table

table = Table("template.csv", schema="schema.json")
table.schema.valid  # True
for row in table.iter():
    print(row)
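# A minimal sketch extending the example above (same hypothetical
# "template.csv"/"schema.json" pair): iterate keyed rows and collect cast
# errors with an exc_handler instead of failing fast, as in the tests above.
errors = []

def handler(exc, row_number, row_data, error_data):
    errors.append((exc, row_number, row_data, error_data))

for keyed_row in table.iter(keyed=True, exc_handler=handler):
    print(keyed_row)
print('rows with cast errors:', len(errors))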