def check_headers(self, cells, sample):
    """Check header cells, inferring a field for each or reporting extras.

    Iterates over a snapshot of ``cells`` so that offending cells can be
    removed from the live list while looping.
    """
    errors = []
    for cell in copy(cells):
        # Cells that already carry a field need no work.
        if 'field' in cell:
            continue
        if not self.__infer_fields:
            # No field and inference disabled: flag and drop the column.
            errors.append(Error('extra-header', cell))
            cells.remove(cell)
            continue
        # Build a one-column sample (None where the row is too short)
        # and infer a field definition for this header.
        index = cell['column-number'] - 1
        column_sample = [
            [row[index] if len(row) > index else None]
            for row in sample
        ]
        schema = Schema()
        schema.infer(column_sample, headers=[cell.get('header')])
        cell['field'] = schema.fields[0]
    return errors
def test_infer_schema():
    """`infer` CLI prints a schema whose field types match the data."""
    runner = CliRunner()
    result = runner.invoke(cli.infer, ['data/data_infer.csv'])
    # The command prints a dict repr; parse it back into a mapping.
    descriptor = ast.literal_eval(result.output)
    model = Schema(descriptor)
    expected_types = {'id': 'integer', 'age': 'integer', 'name': 'string'}
    for field_name, field_type in expected_types.items():
        assert model.get_field(field_name).type == field_type
def test_infer_schema_utf8():
    """UTF8 encoded data containing non-ascii characters."""
    runner = CliRunner()
    result = runner.invoke(cli.infer, ['data/data_infer_utf8.csv'])
    # The command prints a dict repr; parse it back into a mapping.
    descriptor = ast.literal_eval(result.output)
    model = Schema(descriptor)
    expected_types = {'id': 'integer', 'age': 'integer', 'name': 'string'}
    for field_name, field_type in expected_types.items():
        assert model.get_field(field_name).type == field_type
def test_infer():
    """Schema.infer guesses integer/string types from sample rows."""
    rows = [
        ['id', 'age', 'name'],
        ['1', '39', 'Paul'],
        ['2', '23', 'Jimmy'],
        ['3', '36', 'Jane'],
        ['4', 'N/A', 'Judy'],
    ]
    schema = Schema()
    schema.infer(rows)
    expected_fields = [
        {'format': 'default', 'name': 'id', 'type': 'integer'},
        {'format': 'default', 'name': 'age', 'type': 'integer'},
        {'format': 'default', 'name': 'name', 'type': 'string'},
    ]
    assert schema.descriptor == {
        'fields': expected_fields,
        'missingValues': [''],
    }
def test_cast_row_wrong_type_multiple_errors():
    """Casting a row with two bad values collects both cast errors."""
    schema = Schema(DESCRIPTOR_MAX)
    bad_row = ['string', 'notdecimal', '10.6', 'string', 'string']
    with pytest.raises(exceptions.CastError) as excinfo:
        schema.cast_row(bad_row)
    assert len(excinfo.value.errors) == 2
def test_cast_row_too_long():
    """A row with more values than schema fields fails to cast."""
    schema = Schema(DESCRIPTOR_MAX)
    too_long = ['string', '10.0', '1', 'string', 'string', 'string']
    with pytest.raises(exceptions.CastError):
        schema.cast_row(too_long)
def test_fields_have_public_backreference_to_schema():
    """Every field exposes its owning schema via ``field.schema``."""
    schema = Schema('data/schema_valid_full.json')
    for field_name in ('first_name', 'last_name'):
        assert schema.get_field(field_name).schema == schema
import json from frictionless import Package from BorderScreama.frictionless_capture import ( get_generated_sql, load_schema, to_package_descriptor, ) from tableschema import Schema articles = Schema( 'https://raw.githubusercontent.com/frictionlessdata/datapackage-pipelines-sql-driver/master/data/articles.json' ) package = Package( to_package_descriptor([{ 'name': 'articles', 'schema': articles.descriptor }, 'https://raw.githubusercontent.com/frictionlessdata/datapackage-pipelines-sql-driver/master/data/comments.json' ])) assert get_generated_sql(package).strip() == '''\ CREATE TABLE articles ( id INTEGER NOT NULL, parent INTEGER, name TEXT, current BOOLEAN, rating NUMERIC, created_year DATE, created_date DATE, created_time TIME, created_datetime DATETIME, stats JSONB,
def test_cast_row():
    """A valid source row is cast to native Python values."""
    schema = Schema(DESCRIPTOR_MAX)
    row = ['string', '10.0', '1', 'string', 'string']
    expected = ['string', Decimal(10.0), 1, 'string', 'string']
    assert schema.cast_row(row) == expected
def test_init_invalid_in_strict_mode():
    """Constructing an invalid schema with strict=True raises immediately."""
    # The exception object is never inspected, so the previous
    # ``as exception`` alias was dead weight and has been dropped.
    with pytest.raises(exceptions.TableSchemaException):
        Schema('data/schema_invalid_multiple_errors.json', strict=True)
def test_descriptor_path(apply_defaults):
    """A schema loaded from a local path matches the JSON file contents."""
    path = 'data/schema_valid_simple.json'
    loaded = Schema(path).descriptor
    with io.open(path, encoding='utf-8') as file:
        expected = apply_defaults(json.load(file))
    assert loaded == expected
def test_get_field():
    """get_field returns the named field, or None when absent."""
    schema = Schema(DESCRIPTOR_MIN)
    for name in ('id', 'height'):
        assert schema.get_field(name).name == name
    assert schema.get_field('undefined') is None
def test_has_field():
    """has_field reports presence of declared fields only."""
    schema = Schema(DESCRIPTOR_MIN)
    assert all(schema.has_field(name) for name in ('id', 'height'))
    assert not schema.has_field('undefined')
def test_fields():
    """The fields property yields Field objects in declaration order."""
    field_names = [field.name for field in Schema(DESCRIPTOR_MIN).fields]
    assert field_names == ['id', 'height']
def test_cast_row_wrong_type():
    """A non-castable value raises CastError."""
    schema = Schema(DESCRIPTOR_MAX)
    bad_row = ['string', 'notdecimal', '10.6', 'string', 'string']
    with pytest.raises(exceptions.CastError):
        schema.cast_row(bad_row)
def test_schema_instance(apply_defaults):
    """A Table accepts an already-built Schema instance."""
    instance = Schema(SCHEMA_MIN)
    table = Table(DATA_MIN, schema=instance)
    assert table.schema.descriptor == apply_defaults(SCHEMA_MIN)
def getSchema(self):
    """Infer a schema from ``self.data`` and return the result of ``infer``.

    NOTE(review): this returns whatever ``Schema.infer`` returns (the
    inferred descriptor), not the Schema instance itself — confirm that
    is what callers expect.
    """
    return Schema().infer(self.data)
def test_headers():
    """The headers property lists field names in order."""
    headers = Schema(DESCRIPTOR_MIN).headers
    assert headers == ['id', 'height']
class TableExtractor(AbstractExtractor):
    """ Extracts tables from HTML as structured content and plain text. """

    def __init__(self):
        # Cells of the <tr> currently being built, as (tag, text) pairs
        # where tag is 'th' or 'td'.
        self.__current_table_row = []
        # Text accumulated for the element currently being parsed.
        self.__current_text = ''
        # Parser state flags for the table region we are inside.
        self.__is_table = False
        self.__is_table_head = False
        self.__is_table_body = False
        # Dict under construction for the current table
        # ({'type', 'index', 'head', 'body', ...}).
        self.__table_content = None
        # Saved (row, text, head-flag, body-flag, content) tuples for
        # nested tables.
        self.__table_stack = []
        # 1-based counter used to label tables and build references.
        self.__table_index = 1
        # Anchor (<a>) tracking state.
        self.__is_anchor = False
        self.__anchor_text = ''
        self.__anchor_url = None
        # Used to infer field types for extracted table bodies.
        self.schema = Schema()

    def extract(self, el, ev, structured_content: List[Dict[str, Any]],
                text_list: List[str], nlp=None):
        """Handle one (element, event) pair from a streaming HTML parse.

        ``ev`` is 'start' or 'end'. Completed tables are appended to
        ``structured_content``; row text is appended to ``text_list``.
        """
        if el.tag == 'table':
            if ev == 'start':
                if self.__is_table:
                    # Nested table: leave a placeholder reference like
                    # "{table:N}" in the enclosing cell's text, record the
                    # reference on the outer table, and push the outer
                    # table's state so it can be resumed at this table's
                    # end.
                    ref = 'table:{}'.format(self.__table_index)
                    self.__current_text += f'{{{ref}}} '
                    self.__table_content.setdefault('references',
                                                    []).append(ref)
                    self.__table_stack.append(
                        (self.__current_table_row, self.__current_text,
                         self.__is_table_head, self.__is_table_body,
                         self.__table_content))
                    self.__current_table_row = []
                    self.__current_text = ''
                self.__is_table = True
                self.__is_table_head = False
                self.__is_table_body = False
                self.__table_content = {
                    'type': 'table',
                    'index': self.__table_index,
                    'head': [],
                    'body': []
                }
                self.__table_index += 1
            elif ev == 'end':
                table = self.__table_content
                if table['body']:
                    if table['head']:
                        # Explicit header rows were captured; infer field
                        # types from the body using them.
                        # NOTE(review): table['head'] is a list of row
                        # lists — confirm Schema.infer accepts that shape
                        # for ``headers``.
                        headers = table['head']
                        fields = self.schema.infer(
                            table['body'], headers=headers)['fields']
                    else:
                        # No header row seen: synthesize names name1..nameN
                        # from the first body row's width.
                        head = table['body'][0]
                        headers = [
                            'name%d' % (i + 1) for i in range(len(head))
                        ]
                        fields = self.schema.infer(
                            table['body'], headers=headers)['fields']
                        if len(table['body']) > 1:
                            # If the first row's values don't match the
                            # inferred column types, treat it as a header
                            # row: promote it to 'head' and rename fields
                            # after it.
                            dtypes = [field['type'] for field in fields]
                            if any([
                                typ != guess_type(val)
                                for typ, val in zip(dtypes, head)
                            ]):
                                table['head'] = [head]
                                table['body'] = table['body'][1:]
                                for field, name in zip(fields, head):
                                    field['name'] = name
                    table['fields'] = fields
                    structured_content.append(table)
                if len(self.__table_stack):
                    # Resume the enclosing table's saved state.
                    (self.__current_table_row, self.__current_text,
                     self.__is_table_head, self.__is_table_body,
                     self.__table_content) = self.__table_stack.pop()
                else:
                    # Outermost table closed: reset all table state.
                    # NOTE(review): __table_index is reset to 1 here, so
                    # later sibling tables reuse indices/references —
                    # confirm this is intended.
                    self.__is_table_body = False
                    self.__is_table_head = False
                    self.__is_table = False
                    self.__current_text = ''
                    self.__current_table_row = []
                    self.__table_content = None
                    self.__table_index = 1
        elif self.__is_table:
            # noinspection SpellCheckingInspection
            if el.tag == 'thead' and ev == 'start':
                self.__is_table_head = True
                self.__is_table_body = False
            elif el.tag == 'tbody' and ev == 'start':
                self.__is_table_head = False
                self.__is_table_body = True
            elif el.tag == 'tr' and ev == 'end':
                if self.__is_current_table_row_not_empty():
                    values = [v for _, v in self.__current_table_row]
                    # NOTE(review): r'\t' joins with the two characters
                    # backslash+t, not a tab — confirm downstream expects
                    # that literal separator.
                    text_list.append(
                        strip_link_markers(r'\t'.join(values)))
                    if not self.__is_table_head and (
                            self.__is_table_body
                            or not self.__is_header_row()):
                        # Data row: once a body row is seen, all later
                        # rows are treated as body.
                        self.__table_content['body'].append(values)
                        self.__is_table_head = False
                        self.__is_table_body = True
                    else:
                        self.__table_content['head'].append(values)
                # Clear per-row accumulators for every closed <tr>.
                self.__current_text = ''
                self.__current_table_row = []
            elif el.tag == 'th':
                if ev == 'end':
                    self.__current_table_row.append(
                        ('th', clean_text(self.__current_text)))
                    self.__current_text = ''
            elif el.tag == 'td':
                if ev == 'end':
                    self.__current_table_row.append(
                        ('td', clean_text(self.__current_text)))
                    self.__current_text = ''
            elif el.tag == 'a':
                # Anchors are handled here, i.e. while inside a table —
                # the open/close markers they insert are stripped from row
                # text by strip_link_markers above.
                if ev == 'start':
                    anchor_url = el.get('href')
                    if anchor_url:
                        self.__is_anchor = True
                        self.__current_text += LINK_OPEN_MARKER
                        self.__anchor_url = el.get('href')
                elif ev == 'end' and self.__is_anchor:
                    self.__is_anchor = False
                    if self.__anchor_text.strip():
                        self.__current_text += LINK_CLOSE_MARKER
                        if self.__anchor_url and self.__anchor_text:
                            structured_content.append({
                                'type': 'link',
                                'url': self.__anchor_url,
                                'text': self.__anchor_text
                            })
                    else:
                        # Empty link text: remove the dangling open marker.
                        n = self.__current_text.rfind(LINK_OPEN_MARKER)
                        self.__current_text = self.__current_text[:n] + ' '
                    self.__anchor_url = None
                    self.__anchor_text = ''
        # Accumulate element text/tail regardless of tag, mirroring it
        # into the anchor buffer while inside an <a>.
        if ev == 'start' and el.text:
            self.__current_text += el.text
            if self.__is_anchor:
                self.__anchor_text += el.text
        elif ev == 'end' and el.tail:
            self.__current_text += el.tail
            if self.__is_anchor:
                self.__anchor_text += el.tail

    def __is_current_table_row_not_empty(self) -> bool:
        # True when at least one cell in the pending row has text.
        return any(v for _, v in self.__current_table_row)

    def __is_header_row(self) -> bool:
        # True when every cell in the pending row came from a <th>.
        return all(k == 'th' for k, _ in self.__current_table_row)
def test_primary_key():
    """primary_key is empty when undeclared, else the declared key list."""
    assert Schema(DESCRIPTOR_MIN).primary_key == []
    assert Schema(DESCRIPTOR_MAX).primary_key == ['id']
def test_descriptor(apply_defaults):
    """The descriptor property equals the source with defaults applied."""
    for source in (DESCRIPTOR_MIN, DESCRIPTOR_MAX):
        assert Schema(source).descriptor == apply_defaults(source)
def test_foreign_keys():
    """foreign_keys is empty when undeclared, else the declared list."""
    assert Schema(DESCRIPTOR_MIN).foreign_keys == []
    assert Schema(DESCRIPTOR_MAX).foreign_keys == DESCRIPTOR_MAX['foreignKeys']
def test_descriptor_url(apply_defaults):
    """A schema loaded from a URL matches the remote JSON contents."""
    url = BASE_URL % 'data/schema_valid_simple.json'
    loaded = Schema(url).descriptor
    expected = apply_defaults(requests.get(url).json())
    assert loaded == expected
def test_save(tmpdir, apply_defaults):
    """save() writes the descriptor (with defaults applied) as JSON."""
    target = str(tmpdir.join('schema.json'))
    Schema(DESCRIPTOR_MIN).save(target)
    with io.open(target, encoding='utf-8') as file:
        written = json.load(file)
    assert written == apply_defaults(DESCRIPTOR_MIN)
def opener():
    """Open the resource's tabular stream, with retries.

    Closure over enclosing-scope names (``__resource``, ``__url``,
    ``http_headers``, ``_resource``, ``_columns``, ``_ignore_missing``,
    ``_url``, helpers). Returns a 5-tuple
    (schema-or-None, headers, column-count, stream, close-callable);
    on permanent failure with ``_ignore_missing`` set it returns empty
    placeholders instead.
    """
    _params = dict(headers=1)
    format = __resource.get("format")
    if format == "txt":
        # datapackage-pipelines processing requires having a header row
        # for txt format we add a single "data" column
        _params["headers"] = ["data"]
        _params["custom_parsers"] = {"txt": TXTParser}
        _params["allow_html"] = True
    else:
        if format is None:
            # Let tabulator sniff the format from the URL.
            _, format = tabulator.helpers.detect_scheme_and_format(__url)
        if format in tabulator.config.SUPPORTED_COMPRESSION:
            # Compressed container: let tabulator re-detect the inner
            # format after decompression.
            format = None
        else:
            try:
                parser_cls = tabulator.helpers.import_attribute(
                    tabulator.config.PARSERS[format])
            except KeyError:
                logging.error("Unknown format %r", format)
                raise
            # Forward only the options this parser understands.
            _params.update(
                dict(x for x in __resource.items()
                     if x[0] in parser_cls.options))
        # Forward the generic tabulator stream options from the resource.
        _params.update(
            dict(
                x for x in __resource.items()
                if x[0] in {
                    'headers', 'scheme', 'encoding', 'sample_size',
                    'allow_html', 'force_strings', 'force_parse',
                    'skip_rows', 'compression'
                }))
        if isinstance(_params.get('skip_rows'), int):
            # Backwards compatibility: an int N means "skip rows 1..N".
            _params['skip_rows'] = list(
                range(1, _params.get('skip_rows') + 1))
        if format is not None:
            _params['format'] = format
        if http_headers:
            # Attach custom HTTP headers via a dedicated session.
            http_session = requests.Session()
            http_session.headers = http_headers
            _params['http_session'] = http_session
    # Constant columns are appended to every row by add_constants.
    constants = _resource.get('constants', {})
    constant_headers = list(constants.keys())
    constant_values = [constants.get(k) for k in constant_headers]
    _stream = tabulator.Stream(__url, **_params,
                               post_parse=[
                                   suffix_remover(format),
                                   add_constants(
                                       constant_headers, constant_values,
                                       _columns)
                               ])
    # Open with up to 3 retries and exponential backoff (2s, 4s, 8s).
    retry = 0
    backoff = 2
    while True:
        try:
            _stream.open()
            _headers = dedupe(_stream.headers)
            __columns = len(_headers)
            _headers = dedupe(_headers + constant_headers)
            _schema = __resource.get('schema')
            if _schema is not None:
                _schema = Schema(_schema)
            # ``_stream.close`` is returned unbound so the caller decides
            # when to close.
            return _schema, _headers, __columns, _stream, _stream.close
        except tabulator.exceptions.TabulatorException as e:
            # NOTE(review): logs ``_url`` while the stream opened
            # ``__url`` — confirm both refer to the same resource.
            logging.warning(
                "Error while opening resource from url %s: %r", _url, e)
            _stream.close()
            retry += 1
            if retry <= 3:
                logging.warning("Retrying after %d seconds (%d/3)",
                                backoff, retry)
                time.sleep(backoff)
                backoff *= 2
                continue
            else:
                if not _ignore_missing:
                    raise
                # Best-effort mode: return empty placeholders.
                # NOTE(review): first element is {} here but a
                # Schema/None on success — confirm callers handle both.
                return {}, [], 0, [], lambda: None
def test_infer():
    """Schema.infer: default confidence vs. 0.8 vs. a custom guesser."""
    data = [
        ['id', 'age', 'name'],
        ['1', '39', 'Paul'],
        ['2', '23', 'Jimmy'],
        ['3', '36', 'Jane'],
        ['4', 'N/A', 'Judy'],
    ]

    # Full confidence: the lone 'N/A' does not demote 'age' from integer.
    schema = Schema()
    schema.infer(data)
    assert schema.descriptor == {
        'fields': [
            {'format': 'default', 'name': 'id', 'type': 'integer'},
            {'format': 'default', 'name': 'age', 'type': 'integer'},
            {'format': 'default', 'name': 'name', 'type': 'string'}],
        'missingValues': ['']}

    # Lowered confidence: the non-numeric 'N/A' demotes 'age' to string.
    # (The identical data literal was previously duplicated; reuse it.)
    schema = Schema()
    schema.infer(data, confidence=0.8)
    assert schema.descriptor == {
        'fields': [
            {'format': 'default', 'name': 'id', 'type': 'integer'},
            {'format': 'default', 'name': 'age', 'type': 'string'},
            {'format': 'default', 'name': 'name', 'type': 'string'}],
        'missingValues': ['']}

    # A custom guesser class forces every field to string.
    class AllStrings():
        def cast(self, value):
            return [('string', 'default', 0)]

    data = [
        ['id', 'age', 'name'],
        ['1', '39', 'Paul'],
        ['2', '23', 'Jimmy'],
        ['3', '36', 'Jane'],
        ['4', '100', 'Judy'],
    ]
    schema = Schema()
    schema.infer(data, confidence=0.8, guesser_cls=AllStrings)
    # The previous fields-only assertion was fully subsumed by this
    # whole-descriptor assertion, so only the stronger check remains.
    assert schema.descriptor == {
        'fields': [
            {'format': 'default', 'name': 'id', 'type': 'string'},
            {'format': 'default', 'name': 'age', 'type': 'string'},
            {'format': 'default', 'name': 'name', 'type': 'string'}],
        'missingValues': ['']}
def test_cast_row_null_values():
    """Missing-value tokens in nullable fields are cast to None."""
    schema = Schema(DESCRIPTOR_MAX)
    row = ['string', '', '-', 'string', 'null']
    expected = ['string', None, None, 'string', None]
    assert schema.cast_row(row) == expected
def test_add_remove_field():
    """remove_field returns the Field that was previously added."""
    schema = Schema()
    schema.add_field({'name': 'name'})
    removed = schema.remove_field('name')
    assert removed.name == 'name'
def _convert_schema(schema: t.Union[Schema, str, dict]):
    """Coerce a Schema instance, path/URL string, or descriptor dict
    into a fresh Schema object."""
    if isinstance(schema, Schema):
        # Rebuild from the descriptor so the caller gets its own copy.
        return Schema(schema.descriptor)
    return Schema(schema)
def get_all_schema(schemas_dir=SCHEMAS_DIR) -> List[Schema]:
    """Load every schema found under ``schemas_dir``."""
    paths = get_all_schema_path(schemas_dir)
    return [Schema(path) for path in paths]
def test_schema_field_date_format_issue_177():
    """Regression for issue 177: strptime-style date formats are accepted."""
    descriptor = {
        'fields': [
            {'name': 'myfield', 'type': 'date', 'format': '%d/%m/%y'},
        ],
    }
    assert Schema(descriptor)
def get_schema(schema_filename: str) -> Schema:
    """Build a Schema from the given schema file name."""
    return Schema(schema_filename)
def test_schema_field_time_format_issue_177():
    """Regression for issue 177: strptime-style time formats are accepted."""
    descriptor = {
        'fields': [
            {'name': 'myfield', 'type': 'time', 'format': '%H:%M:%S'},
        ],
    }
    assert Schema(descriptor)
def test_init():
    """Schema can be built from descriptors and from local JSON files."""
    sources = (
        DESCRIPTOR_MIN,
        DESCRIPTOR_MAX,
        'data/schema_valid_full.json',
        'data/schema_valid_simple.json',
    )
    for source in sources:
        assert Schema(source)