Exemplo n.º 1
0
    def check_headers(self, cells, sample):
        """Resolve header cells that do not have a field yet.

        Cells that already carry a ``'field'`` key are left untouched.  For
        every other cell, either a field is inferred from the matching column
        of ``sample`` (when field inference is enabled on the instance) or the
        cell is removed from ``cells`` and reported as an ``extra-header``
        error.

        Returns the list of ``Error`` objects created (empty when none).
        """
        extra_header_errors = []

        # Walk a copy because ``cells`` itself may shrink inside the loop.
        for cell in copy(cells):
            if 'field' in cell:
                continue

            if not self.__infer_fields:
                # Inference is off: report the column and drop it.
                extra_header_errors.append(Error('extra-header', cell))
                cells.remove(cell)
                continue

            # One single-value row per sample row; rows that are too short
            # contribute ``None`` for this column.
            column_sample = [
                [row[cell['column-number'] - 1]
                 if len(row) >= cell['column-number'] else None]
                for row in sample
            ]
            inferred = Schema()
            inferred.infer(column_sample, headers=[cell.get('header')])
            cell['field'] = inferred.fields[0]

        return extra_header_errors
Exemplo n.º 2
0
def test_infer_schema():
    """The ``infer`` CLI command outputs a descriptor with the expected types."""
    cli_result = CliRunner().invoke(cli.infer, ['data/data_infer.csv'])
    # The command output is the textual form of a dict; parse it back.
    descriptor = ast.literal_eval(cli_result.output)
    model = Schema(descriptor)
    expected_types = [('id', 'integer'), ('age', 'integer'), ('name', 'string')]
    for field_name, expected in expected_types:
        assert model.get_field(field_name).type == expected
Exemplo n.º 3
0
def test_infer_schema_utf8():
    """Infer via the CLI from UTF8 encoded data containing non-ascii characters."""
    cli_result = CliRunner().invoke(cli.infer, ['data/data_infer_utf8.csv'])
    # The command output is the textual form of a dict; parse it back.
    descriptor = ast.literal_eval(cli_result.output)
    model = Schema(descriptor)
    expected_types = [('id', 'integer'), ('age', 'integer'), ('name', 'string')]
    for field_name, expected in expected_types:
        assert model.get_field(field_name).type == expected
Exemplo n.º 4
0
def test_infer():
    """Inferring from a header row plus four data rows gives the expected descriptor."""
    rows = [
        ['id', 'age', 'name'],
        ['1', '39', 'Paul'],
        ['2', '23', 'Jimmy'],
        ['3', '36', 'Jane'],
        ['4', 'N/A', 'Judy'],
    ]
    inferred = Schema()
    inferred.infer(rows)

    expected_fields = [
        {'format': 'default', 'name': 'id', 'type': 'integer'},
        {'format': 'default', 'name': 'age', 'type': 'integer'},
        {'format': 'default', 'name': 'name', 'type': 'string'},
    ]
    assert inferred.descriptor == {
        'fields': expected_fields,
        'missingValues': [''],
    }
Exemplo n.º 5
0
def test_cast_row_wrong_type_multiple_errors():
    """``cast_row`` raises ``CastError`` carrying exactly two errors for this row."""
    max_schema = Schema(DESCRIPTOR_MAX)
    row = ['string', 'notdecimal', '10.6', 'string', 'string']
    with pytest.raises(exceptions.CastError) as raised:
        max_schema.cast_row(row)
    error_count = len(raised.value.errors)
    assert error_count == 2
Exemplo n.º 6
0
def test_cast_row_too_long():
    """``cast_row`` raises ``CastError`` for the six-value row used here."""
    max_schema = Schema(DESCRIPTOR_MAX)
    six_value_row = ['string', '10.0', '1', 'string', 'string', 'string']
    with pytest.raises(exceptions.CastError):
        max_schema.cast_row(six_value_row)
Exemplo n.º 7
0
def test_fields_have_public_backreference_to_schema():
    """Fields returned by ``get_field`` expose their owning schema as ``.schema``."""
    owner = Schema('data/schema_valid_full.json')
    for field_name in ('first_name', 'last_name'):
        assert owner.get_field(field_name).schema == owner
Exemplo n.º 8
0
import json
from frictionless import Package
from BorderScreama.frictionless_capture import (
    get_generated_sql,
    load_schema,
    to_package_descriptor,
)
from tableschema import Schema

# Build a Schema from the remote "articles" descriptor URL.
articles = Schema(
    'https://raw.githubusercontent.com/frictionlessdata/datapackage-pipelines-sql-driver/master/data/articles.json'
)
# Assemble a Package from two resource specs: an explicit name/schema mapping
# for "articles" and a bare URL string for the "comments" descriptor.
# NOTE(review): how to_package_descriptor treats the bare URL entry is not
# visible in this snippet - confirm against its implementation.
package = Package(
    to_package_descriptor([{
        'name': 'articles',
        'schema': articles.descriptor
    }, 'https://raw.githubusercontent.com/frictionlessdata/datapackage-pipelines-sql-driver/master/data/comments.json'
                           ]))

assert get_generated_sql(package).strip() == '''\
CREATE TABLE articles (
	id INTEGER NOT NULL, 
	parent INTEGER, 
	name TEXT, 
	current BOOLEAN, 
	rating NUMERIC, 
	created_year DATE, 
	created_date DATE, 
	created_time TIME, 
	created_datetime DATETIME, 
	stats JSONB, 
Exemplo n.º 9
0
def test_cast_row():
    """``cast_row`` converts the string row into the expected typed values."""
    raw_row = ['string', '10.0', '1', 'string', 'string']
    expected_row = ['string', Decimal(10.0), 1, 'string', 'string']
    casted_row = Schema(DESCRIPTOR_MAX).cast_row(raw_row)
    assert casted_row == expected_row
Exemplo n.º 10
0
def test_init_invalid_in_strict_mode():
    """Constructing ``Schema`` with ``strict=True`` from this descriptor raises."""
    descriptor_path = 'data/schema_invalid_multiple_errors.json'
    with pytest.raises(exceptions.TableSchemaException) as excinfo:
        Schema(descriptor_path, strict=True)
Exemplo n.º 11
0
def test_descriptor_path(apply_defaults):
    """A schema loaded from a path has the file's JSON (with defaults) as descriptor."""
    descriptor_path = 'data/schema_valid_simple.json'
    loaded = Schema(descriptor_path).descriptor
    with io.open(descriptor_path, encoding='utf-8') as handle:
        raw_descriptor = json.load(handle)
    assert loaded == apply_defaults(raw_descriptor)
Exemplo n.º 12
0
def test_get_field():
    """``get_field`` returns the named field, or ``None`` for an unknown name."""
    min_schema = Schema(DESCRIPTOR_MIN)
    for known_name in ('id', 'height'):
        assert min_schema.get_field(known_name).name == known_name
    assert min_schema.get_field('undefined') is None
Exemplo n.º 13
0
def test_has_field():
    """``has_field`` is truthy for 'id' and 'height' and falsy for 'undefined'."""
    min_schema = Schema(DESCRIPTOR_MIN)
    for present_name in ('id', 'height'):
        assert min_schema.has_field(present_name)
    assert not min_schema.has_field('undefined')
Exemplo n.º 14
0
def test_cast_row_wrong_type_multiple_errors():
    """The ``CastError`` raised by ``cast_row`` for this row holds two errors."""
    bad_row = ['string', 'notdecimal', '10.6', 'string', 'string']
    max_schema = Schema(DESCRIPTOR_MAX)
    with pytest.raises(exceptions.CastError) as caught:
        max_schema.cast_row(bad_row)
    collected = caught.value.errors
    assert len(collected) == 2
Exemplo n.º 15
0
def test_fields():
    """``Schema.fields`` yields the fields in order: 'id', then 'height'."""
    field_names = []
    for field in Schema(DESCRIPTOR_MIN).fields:
        field_names.append(field.name)
    assert ['id', 'height'] == field_names
Exemplo n.º 16
0
def test_cast_row_wrong_type():
    """``cast_row`` raises ``CastError`` for the row used here."""
    max_schema = Schema(DESCRIPTOR_MAX)
    bad_row = ['string', 'notdecimal', '10.6', 'string', 'string']
    with pytest.raises(exceptions.CastError):
        max_schema.cast_row(bad_row)
Exemplo n.º 17
0
def test_schema_instance(apply_defaults):
    """A ``Table`` given a ``Schema`` instance exposes the defaulted descriptor."""
    table = Table(DATA_MIN, schema=Schema(SCHEMA_MIN))
    assert table.schema.descriptor == apply_defaults(SCHEMA_MIN)
Exemplo n.º 18
0
 def getSchema(self):
     """Run ``infer`` on a fresh ``Schema`` with ``self.data`` and return its result."""
     return Schema().infer(self.data)
Exemplo n.º 19
0
def test_has_field():
    """Declared field names are reported by ``has_field``; 'undefined' is not."""
    min_schema = Schema(DESCRIPTOR_MIN)
    declared = ('id', 'height')
    for field_name in declared:
        assert min_schema.has_field(field_name)
    assert not min_schema.has_field('undefined')
Exemplo n.º 20
0
def test_headers():
    """``Schema.headers`` equals ``['id', 'height']`` for the minimal descriptor."""
    min_schema = Schema(DESCRIPTOR_MIN)
    expected_headers = ['id', 'height']
    assert min_schema.headers == expected_headers
Exemplo n.º 21
0
class TableExtractor(AbstractExtractor):
    """
    Extracts tables from HTML as structured content and plain text.

    ``extract`` is called with an element and an event (``'start'`` or
    ``'end'``) and keeps its parsing state on the instance between calls.
    Nested tables are handled by pushing the outer table's state onto a
    stack when an inner ``<table>`` starts and popping it when it ends.
    """
    def __init__(self):
        # Cells of the row currently being read, as (tag, text) tuples where
        # tag is 'th' or 'td'.
        self.__current_table_row = []
        # Text accumulated since the last cell/row boundary.
        self.__current_text = ''
        # True while inside a <table> element.
        self.__is_table = False
        # Which table section finished rows are currently attributed to.
        self.__is_table_head = False
        self.__is_table_body = False
        # Dict describing the table being built (None outside any table).
        self.__table_content = None
        # Saved state of enclosing tables while a nested table is processed.
        self.__table_stack = []
        # Index given to the next table that starts.
        self.__table_index = 1
        # Anchor tracking: inside an <a href=...>, its text and its URL.
        self.__is_anchor = False
        self.__anchor_text = ''
        self.__anchor_url = None
        # Schema instance reused for field inference on every table.
        self.schema = Schema()

    def extract(self,
                el,
                ev,
                structured_content: List[Dict[str, Any]],
                text_list: List[str],
                nlp=None):
        """
        Process one element event and update the table parsing state.

        :param el: element being visited; ``tag``, ``text``, ``tail`` and
            ``get('href')`` are read from it.
        :param ev: event name, ``'start'`` or ``'end'``.
        :param structured_content: output list; finished tables and links
            found inside tables are appended to it.
        :param text_list: output list; the text of each non-empty table row
            is appended to it.
        :param nlp: not used by this method.
        """
        if el.tag == 'table':
            if ev == 'start':
                if self.__is_table:
                    # A nested table starts: leave a '{table:N}' placeholder
                    # in the outer cell text, record the reference on the
                    # outer table and save the outer state on the stack.
                    ref = 'table:{}'.format(self.__table_index)
                    self.__current_text += f'{{{ref}}} '
                    self.__table_content.setdefault('references',
                                                    []).append(ref)
                    self.__table_stack.append(
                        (self.__current_table_row, self.__current_text,
                         self.__is_table_head, self.__is_table_body,
                         self.__table_content))
                # Start with a clean state for the new table.
                self.__current_table_row = []
                self.__current_text = ''
                self.__is_table = True
                self.__is_table_head = False
                self.__is_table_body = False
                self.__table_content = {
                    'type': 'table',
                    'index': self.__table_index,
                    'head': [],
                    'body': []
                }
                self.__table_index += 1

            elif ev == 'end':
                table = self.__table_content
                # Fields are only inferred for tables that have body rows.
                if table['body']:
                    if table['head']:
                        headers = table['head']
                        fields = self.schema.infer(table['body'],
                                                   headers=headers)['fields']
                    else:
                        # No explicit head: infer with generated column names
                        # ('name1', 'name2', ...) sized from the first row.
                        head = table['body'][0]
                        headers = [
                            'name%d' % (i + 1) for i in range(len(head))
                        ]
                        fields = self.schema.infer(table['body'],
                                                   headers=headers)['fields']
                        if len(table['body']) > 1:
                            # If any value of the first row has a guessed type
                            # different from its column's inferred type, treat
                            # that row as the header row.
                            dtypes = [field['type'] for field in fields]
                            if any([
                                    typ != guess_type(val)
                                    for typ, val in zip(dtypes, head)
                            ]):
                                table['head'] = [head]
                                table['body'] = table['body'][1:]
                                for field, name in zip(fields, head):
                                    field['name'] = name

                    table['fields'] = fields

                structured_content.append(table)
                if len(self.__table_stack):
                    # Leaving a nested table: restore the outer table's state.
                    (self.__current_table_row, self.__current_text,
                     self.__is_table_head, self.__is_table_body,
                     self.__table_content) = self.__table_stack.pop()
                else:
                    # Leaving the outermost table: reset the table state and
                    # restart the table numbering.
                    self.__is_table_body = False
                    self.__is_table_head = False
                    self.__is_table = False
                    self.__current_text = ''
                    self.__current_table_row = []
                    self.__table_content = None
                    self.__table_index = 1

        elif self.__is_table:
            # noinspection SpellCheckingInspection
            if el.tag == 'thead' and ev == 'start':
                self.__is_table_head = True
                self.__is_table_body = False

            elif el.tag == 'tbody' and ev == 'start':
                self.__is_table_head = False
                self.__is_table_body = True

            elif el.tag == 'tr' and ev == 'end':
                # Rows whose cells are all empty are dropped.
                if self.__is_current_table_row_not_empty():
                    values = [v for _, v in self.__current_table_row]
                    # NOTE(review): r'\t' is a literal backslash followed by
                    # 't', not a tab character - confirm this is intended.
                    text_list.append(strip_link_markers(r'\t'.join(values)))
                    # A row goes to the body when not in the head section and
                    # either the body already started or the row is not made
                    # only of <th> cells; from then on rows count as body.
                    if not self.__is_table_head and (
                            self.__is_table_body
                            or not self.__is_header_row()):
                        self.__table_content['body'].append(values)
                        self.__is_table_head = False
                        self.__is_table_body = True

                    else:
                        self.__table_content['head'].append(values)

                self.__current_text = ''
                self.__current_table_row = []

            elif el.tag == 'th':
                # On 'end' store the cleaned cell text; the text buffer is
                # cleared on both 'start' and 'end'.
                if ev == 'end':
                    self.__current_table_row.append(
                        ('th', clean_text(self.__current_text)))

                self.__current_text = ''

            elif el.tag == 'td':
                # Same handling as 'th', tagged as a data cell.
                if ev == 'end':
                    self.__current_table_row.append(
                        ('td', clean_text(self.__current_text)))

                self.__current_text = ''

            elif el.tag == 'a':
                if ev == 'start':
                    # Only anchors with a non-empty href are tracked.
                    anchor_url = el.get('href')
                    if anchor_url:
                        self.__is_anchor = True
                        self.__current_text += LINK_OPEN_MARKER
                        self.__anchor_url = el.get('href')

                elif ev == 'end' and self.__is_anchor:
                    self.__is_anchor = False
                    if self.__anchor_text.strip():
                        # Anchor has visible text: close the marker and emit
                        # a link entry.
                        self.__current_text += LINK_CLOSE_MARKER
                        if self.__anchor_url and self.__anchor_text:
                            structured_content.append({
                                'type':
                                'link',
                                'url':
                                self.__anchor_url,
                                'text':
                                self.__anchor_text
                            })
                    else:
                        # Anchor text is blank: cut the text back to the last
                        # open marker and put a single space in its place.
                        n = self.__current_text.rfind(LINK_OPEN_MARKER)
                        self.__current_text = self.__current_text[:n] + ' '

                    self.__anchor_url = None
                    self.__anchor_text = ''

            # Accumulate text for any element inside a table: ``el.text`` on
            # 'start' and ``el.tail`` on 'end', mirrored into the anchor text
            # while inside a tracked anchor.
            if ev == 'start' and el.text:
                self.__current_text += el.text
                if self.__is_anchor:
                    self.__anchor_text += el.text

            elif ev == 'end' and el.tail:
                self.__current_text += el.tail
                if self.__is_anchor:
                    self.__anchor_text += el.tail

    def __is_current_table_row_not_empty(self) -> bool:
        """Return True when at least one cell of the current row has text."""
        return any(v for _, v in self.__current_table_row)

    def __is_header_row(self) -> bool:
        """Return True when every cell of the current row is a 'th' cell."""
        return all(k == 'th' for k, _ in self.__current_table_row)
Exemplo n.º 22
0
def test_primary_key():
    """``primary_key`` is empty for the minimal descriptor and ``['id']`` for the max one."""
    minimal_key = Schema(DESCRIPTOR_MIN).primary_key
    assert minimal_key == []
    maximal_key = Schema(DESCRIPTOR_MAX).primary_key
    assert maximal_key == ['id']
Exemplo n.º 23
0
def test_descriptor(apply_defaults):
    """``Schema.descriptor`` equals the input descriptor with defaults applied."""
    for source_descriptor in (DESCRIPTOR_MIN, DESCRIPTOR_MAX):
        built = Schema(source_descriptor).descriptor
        assert built == apply_defaults(source_descriptor)
Exemplo n.º 24
0
def test_foreign_keys():
    """``foreign_keys`` is empty for the minimal descriptor and mirrors the max one."""
    minimal_keys = Schema(DESCRIPTOR_MIN).foreign_keys
    assert minimal_keys == []
    maximal_keys = Schema(DESCRIPTOR_MAX).foreign_keys
    assert maximal_keys == DESCRIPTOR_MAX['foreignKeys']
Exemplo n.º 25
0
def test_descriptor_url(apply_defaults):
    """A schema loaded from a URL matches the JSON fetched from that URL, with defaults."""
    remote_location = BASE_URL % 'data/schema_valid_simple.json'
    loaded = Schema(remote_location).descriptor
    fetched = requests.get(remote_location).json()
    assert loaded == apply_defaults(fetched)
Exemplo n.º 26
0
def test_save(tmpdir, apply_defaults):
    """``Schema.save`` writes JSON that equals the descriptor with defaults applied."""
    target = str(tmpdir.join('schema.json'))
    Schema(DESCRIPTOR_MIN).save(target)
    with io.open(target, encoding='utf-8') as handle:
        saved = json.load(handle)
    assert saved == apply_defaults(DESCRIPTOR_MIN)
        def opener():
            """Open a tabulator stream for the resource, retrying on failure.

            Builds the ``tabulator.Stream`` parameters from the resource
            mapping, opens the stream and returns a 5-tuple
            ``(schema, headers, column_count, stream, close_callable)``.
            After three failed retries the last error is re-raised unless
            ``_ignore_missing`` is truthy, in which case an empty placeholder
            tuple is returned.

            NOTE(review): this closure reads ``__resource``/``_resource``,
            ``__url``/``_url``, ``_columns``, ``http_headers`` and
            ``_ignore_missing`` from an enclosing scope that is not visible
            here - confirm that both spellings of each name really exist.
            """
            _params = dict(headers=1)
            format = __resource.get("format")
            if format == "txt":
                # datapackage-pipelines processing requires having a header row
                # for txt format we add a single "data" column
                _params["headers"] = ["data"]
                _params["custom_parsers"] = {"txt": TXTParser}
                _params["allow_html"] = True
            else:
                # No explicit format: let tabulator detect it from the URL.
                if format is None:
                    _, format = tabulator.helpers.detect_scheme_and_format(
                        __url)
                # A detected compression format is not passed on as 'format'.
                if format in tabulator.config.SUPPORTED_COMPRESSION:
                    format = None
                else:
                    try:
                        parser_cls = tabulator.helpers.import_attribute(
                            tabulator.config.PARSERS[format])
                    except KeyError:
                        logging.error("Unknown format %r", format)
                        raise
                    # Forward resource entries that are options of the parser.
                    _params.update(
                        dict(x for x in __resource.items()
                             if x[0] in parser_cls.options))
                # Forward the generic stream options present on the resource.
                _params.update(
                    dict(
                        x for x in __resource.items() if x[0] in {
                            'headers', 'scheme', 'encoding', 'sample_size',
                            'allow_html', 'force_strings', 'force_parse',
                            'skip_rows', 'compression'
                        }))
                # An integer skip_rows N is expanded to the list [1, ..., N].
                if isinstance(_params.get('skip_rows'),
                              int):  # Backwards compatibility
                    _params['skip_rows'] = list(
                        range(1,
                              _params.get('skip_rows') + 1))

            if format is not None:
                _params['format'] = format

            # Custom HTTP headers are applied through a dedicated session.
            if http_headers:
                http_session = requests.Session()
                http_session.headers = http_headers
                _params['http_session'] = http_session

            # Constant columns: their names extend the headers and their
            # values are handed to the add_constants post-parse step.
            constants = _resource.get('constants', {})
            constant_headers = list(constants.keys())
            constant_values = [constants.get(k) for k in constant_headers]
            _stream = tabulator.Stream(__url,
                                       **_params,
                                       post_parse=[
                                           suffix_remover(format),
                                           add_constants(
                                               constant_headers,
                                               constant_values, _columns)
                                       ])
            retry = 0
            # Seconds to wait before the next attempt; doubled on each retry.
            backoff = 2
            while True:
                try:
                    _stream.open()
                    _headers = dedupe(_stream.headers)
                    # Column count of the source itself, before constants.
                    __columns = len(_headers)
                    _headers = dedupe(_headers + constant_headers)
                    _schema = __resource.get('schema')
                    if _schema is not None:
                        _schema = Schema(_schema)
                    return _schema, _headers, __columns, _stream, _stream.close
                except tabulator.exceptions.TabulatorException as e:
                    # NOTE(review): the log uses ``_url`` while the stream was
                    # created from ``__url`` - confirm they are the same value.
                    logging.warning(
                        "Error while opening resource from url %s: %r", _url,
                        e)
                    _stream.close()
                    retry += 1
                    if retry <= 3:
                        logging.warning("Retrying after %d seconds (%d/3)",
                                        backoff, retry)
                        time.sleep(backoff)
                        backoff *= 2
                        continue
                    else:
                        if not _ignore_missing:
                            raise
                        # Placeholder result with a no-op close callable.
                        return {}, [], 0, [], lambda: None
Exemplo n.º 28
0
def test_infer():
    """Check ``Schema.infer`` with defaults, a confidence value and a custom guesser."""

    def sample_rows(last_age):
        # Build a fresh table for every scenario so no list object is shared.
        return [
            ['id', 'age', 'name'],
            ['1', '39', 'Paul'],
            ['2', '23', 'Jimmy'],
            ['3', '36', 'Jane'],
            ['4', last_age, 'Judy'],
        ]

    def fields_of(id_type, age_type, name_type):
        # Expected field descriptors for the three columns, in order.
        return [
            {'format': 'default', 'name': 'id', 'type': id_type},
            {'format': 'default', 'name': 'age', 'type': age_type},
            {'format': 'default', 'name': 'name', 'type': name_type},
        ]

    # Scenario 1: default inference; 'age' is expected to come out as integer.
    default_schema = Schema()
    default_schema.infer(sample_rows('N/A'))
    assert default_schema.descriptor == {
        'fields': fields_of('integer', 'integer', 'string'),
        'missingValues': ['']}

    # Scenario 2: same rows with confidence=0.8; 'age' is expected as string.
    confident_schema = Schema()
    confident_schema.infer(sample_rows('N/A'), confidence=0.8)
    assert confident_schema.descriptor == {
        'fields': fields_of('integer', 'string', 'string'),
        'missingValues': ['']}

    class AllStrings():
        def cast(self, value):
            return [('string', 'default', 0)]

    # Scenario 3: a guesser that always answers 'string' for every value.
    guessed_schema = Schema()
    guessed_schema.infer(
        sample_rows('100'), confidence=0.8, guesser_cls=AllStrings)
    assert guessed_schema.descriptor['fields'] == fields_of(
        'string', 'string', 'string')
    assert guessed_schema.descriptor == {
        'fields': fields_of('string', 'string', 'string'),
        'missingValues': ['']}
Exemplo n.º 29
0
def test_cast_row_null_values():
    """``cast_row`` turns the '', '-' and 'null' entries of this row into ``None``."""
    raw_row = ['string', '', '-', 'string', 'null']
    expected_row = ['string', None, None, 'string', None]
    casted_row = Schema(DESCRIPTOR_MAX).cast_row(raw_row)
    assert casted_row == expected_row
Exemplo n.º 30
0
def test_add_remove_field():
    """``remove_field`` returns the field previously added under that name."""
    empty_schema = Schema()
    field_descriptor = {'name': 'name'}
    empty_schema.add_field(field_descriptor)
    removed = empty_schema.remove_field('name')
    assert removed.name == 'name'
def _convert_schema(schema: t.Union[Schema, str, dict]):
    """Return a new ``Schema`` built from *schema*.

    A ``Schema`` instance is rebuilt from its ``descriptor``; any other
    value is passed to the ``Schema`` constructor unchanged.
    """
    if isinstance(schema, Schema):
        return Schema(schema.descriptor)
    return Schema(schema)
Exemplo n.º 32
0
def test_add_remove_field():
    """Adding a field named 'name' and removing it gives back that field."""
    target_name = 'name'
    working_schema = Schema()
    working_schema.add_field({'name': target_name})
    removed_field = working_schema.remove_field(target_name)
    assert removed_field.name == target_name
Exemplo n.º 33
0
def test_cast_row_wrong_type():
    """Casting this row with the max descriptor raises ``CastError``."""
    uncastable_row = ['string', 'notdecimal', '10.6', 'string', 'string']
    max_schema = Schema(DESCRIPTOR_MAX)
    with pytest.raises(exceptions.CastError):
        max_schema.cast_row(uncastable_row)
Exemplo n.º 34
0
def test_cast_row_null_values():
    """The '', '-' and 'null' entries of this row are expected to cast to ``None``."""
    max_schema = Schema(DESCRIPTOR_MAX)
    with_nulls = ['string', '', '-', 'string', 'null']
    expected_row = ['string', None, None, 'string', None]
    assert max_schema.cast_row(with_nulls) == expected_row
Exemplo n.º 35
0
def test_get_field():
    """Known names resolve to a field with that name; 'undefined' gives ``None``."""
    min_schema = Schema(DESCRIPTOR_MIN)
    declared = ('id', 'height')
    for field_name in declared:
        found = min_schema.get_field(field_name)
        assert found.name == field_name
    assert min_schema.get_field('undefined') is None
Exemplo n.º 36
0
def get_all_schema(schemas_dir=SCHEMAS_DIR) -> List[Schema]:
    """Build one ``Schema`` per path returned by ``get_all_schema_path(schemas_dir)``."""
    loaded_schemas = []
    for schema_path in get_all_schema_path(schemas_dir):
        loaded_schemas.append(Schema(schema_path))
    return loaded_schemas
Exemplo n.º 37
0
def test_schema_field_date_format_issue_177():
    """A schema with a date field using format '%d/%m/%y' can be built and is truthy."""
    date_field = {'name': 'myfield', 'type': 'date', 'format': '%d/%m/%y'}
    built = Schema({'fields': [date_field]})
    assert built
def get_schema(schema_filename: str) -> Schema:
    """Construct and return a ``Schema`` from *schema_filename*."""
    loaded = Schema(schema_filename)
    return loaded
Exemplo n.º 39
0
def test_cast_row():
    """The string row is expected to cast to a decimal, an integer and strings."""
    max_schema = Schema(DESCRIPTOR_MAX)
    string_row = ['string', '10.0', '1', 'string', 'string']
    typed_row = ['string', Decimal(10.0), 1, 'string', 'string']
    assert max_schema.cast_row(string_row) == typed_row
Exemplo n.º 40
0
def test_schema_field_time_format_issue_177():
    """A schema with a time field using format '%H:%M:%S' can be built and is truthy."""
    time_field = {'name': 'myfield', 'type': 'time', 'format': '%H:%M:%S'}
    built = Schema({'fields': [time_field]})
    assert built
Exemplo n.º 41
0
def test_cast_row_too_long():
    """Casting the six-value row with the max descriptor raises ``CastError``."""
    overlong_row = ['string', '10.0', '1', 'string', 'string', 'string']
    max_schema = Schema(DESCRIPTOR_MAX)
    with pytest.raises(exceptions.CastError):
        max_schema.cast_row(overlong_row)
Exemplo n.º 42
0
def test_init():
    """``Schema`` built from each descriptor object or descriptor path is truthy."""
    sources = (
        DESCRIPTOR_MIN,
        DESCRIPTOR_MAX,
        'data/schema_valid_full.json',
        'data/schema_valid_simple.json',
    )
    for source in sources:
        assert Schema(source)