Exemplo n.º 1
0
def test_xlsx_parser_merged_cells_boolean():
    """Read the merged-cells boolean fixture without a header row.

    Only ``field1`` of the first row is expected to be ``True``; every other
    cell in the three rows is expected to be ``None``.
    """
    blank_row = {"field1": None, "field2": None}
    expected = [{"field1": True, "field2": None}, blank_row, blank_row]
    with Resource(
        "data/merged-cells-boolean.xls",
        layout=Layout(header=False),
    ) as resource:
        assert resource.read_rows() == expected
def test_ods_parser_write(tmpdir):
    """Write ``data/table.csv`` into an ODS file and read it back.

    Args:
        tmpdir: directory object whose ``join`` builds the ``table.ods`` path
            (presumably pytest's ``tmpdir`` fixture).
    """
    source = Resource("data/table.csv")
    # NOTE: ezodf writer creates more cells than we ask (remove limits)
    target = Resource(
        str(tmpdir.join("table.ods")),
        layout=Layout(limit_fields=2, limit_rows=2),
    )
    source.write(target)
    expected_rows = [
        {"id": 1, "name": "english"},
        {"id": 2, "name": "中国人"},
    ]
    with target:
        assert target.header == ["id", "name"]
        assert target.read_rows() == expected_rows
Exemplo n.º 3
0
def test_sql_parser_headers_false(database_url):
    """Read the SQL table named ``table`` with ``Layout(header=False)``.

    The header is still expected to be ``["id", "name"]``, and the expected
    rows start with ``{"id": None, "name": "name"}`` ahead of the two data
    rows.

    Args:
        database_url: source handed to ``Resource`` (presumably a fixture
            providing a database connection URL).
    """
    expected_rows = [
        {"id": None, "name": "name"},
        {"id": 1, "name": "english"},
        {"id": 2, "name": "中国人"},
    ]
    with Resource(
        database_url,
        dialect=SqlDialect(table="table"),
        layout=Layout(header=False),
    ) as resource:
        assert resource.header == ["id", "name"]
        assert resource.read_rows() == expected_rows
Exemplo n.º 4
0
def test_xls_parser_merged_cells_fill():
    """Read ``data/merged-cells.xls`` with ``fill_merged_cells=True``.

    With the header disabled, all three rows are expected to carry
    ``"data"`` in both fields.
    """
    expected = [{"field1": "data", "field2": "data"} for _ in range(3)]
    with Resource(
        "data/merged-cells.xls",
        dialect=ExcelDialect(fill_merged_cells=True),
        layout=Layout(header=False),
    ) as resource:
        assert resource.read_rows() == expected
def test_resource_layout_header_xlsx_multiline():
    """Build a header from rows 1-5 of ``data/multiline-headers.xlsx``.

    Merged cells are filled; the four resulting labels are checked first and
    then reused as the keys of the single expected data row.
    """
    expected_header = [
        "Region",
        "Caloric contribution (%)",
        "Cumulative impact of changes on cost of food basket from previous quarter",
        "Cumulative impact of changes on cost of food basket from baseline (%)",
    ]
    with Resource(
        "data/multiline-headers.xlsx",
        dialect=ExcelDialect(fill_merged_cells=True),
        layout=Layout(header_rows=[1, 2, 3, 4, 5]),
    ) as resource:
        header = resource.header
        assert header == expected_header
        # Pair each of the four labels with its cell value, in column order.
        expected_row = dict(zip(header, ["A", "B", "C", "D"]))
        assert resource.read_rows() == [expected_row]
def test_resource_layout_header_inline_keyed_headers_is_none():
    """Read inline keyed records with ``Layout(header=False)``.

    Labels are expected to be empty, the header to be
    ``["field1", "field2"]``, and the original keys to show up as the first
    data row.
    """
    records = [{"id": "1", "name": "english"}, {"id": "2", "name": "中国人"}]
    expected_rows = [
        {"field1": "id", "field2": "name"},
        {"field1": "1", "field2": "english"},
        {"field1": "2", "field2": "中国人"},
    ]
    with Resource(records, layout=Layout(header=False)) as resource:
        assert resource.labels == []
        assert resource.header == ["field1", "field2"]
        assert resource.read_rows() == expected_rows
def test_resource_layout_skip_rows_regex():
    """Skip rows using a literal value and a ``<regex>`` pattern.

    After skipping, the header is expected to be ``["name", "order"]`` and
    only the "John" and "Alex" rows are expected to remain.
    """
    table = [
        ["# comment"],
        ["name", "order"],
        ["# cat"],
        ["# dog"],
        ["John", 1],
        ["Alex", 2],
    ]
    skip_rules = ["# comment", r"<regex># (cat|dog)"]
    expected_rows = [
        {"name": "John", "order": 1},
        {"name": "Alex", "order": 2},
    ]
    with Resource(table, layout=Layout(skip_rows=skip_rules)) as resource:
        assert resource.header == ["name", "order"]
        assert resource.read_rows() == expected_rows
Exemplo n.º 8
0
def test_xlsx_parser_preserve_formatting():
    """Read ``data/preserve-formatting.xlsx`` with ``preserve_formatting=True``.

    Types are inferred as ``"any"``; the single expected row maps each column
    label to the string the test expects for that cell.
    """
    expected_row = {
        # general
        "empty": None,
        # numeric
        "0": "1001",
        "0.00": "1000.56",
        "0.0000": "1000.5577",
        "0.00000": "1000.55770",
        "0.0000#": "1000.5577",
        # temporal
        "m/d/yy": "5/20/40",
        "d-mmm": "20-May",
        "mm/dd/yy": "05/20/40",
        "mmddyy": "052040",
        "mmddyyam/pmdd": "052040AM20",
    }
    with Resource(
        "data/preserve-formatting.xlsx",
        dialect=ExcelDialect(preserve_formatting=True),
        layout=Layout(header_rows=[1]),
        infer_type="any",
    ) as resource:
        assert resource.read_rows() == [expected_row]
def test_resource_layout_skip_rows_preset():
    """Skip rows with the ``"<blank>"`` preset on inline data.

    Rows whose cells are all empty are expected to be dropped, while rows
    with an empty name but a numeric order are expected to be kept.
    """
    table = [
        ["name", "order"],
        ["", ""],
        [],
        ["Ray", 0],
        ["John", 1],
        ["Alex", 2],
        ["", 3],
        [None, 4],
        ["", None],
    ]
    expected_rows = [
        {"name": "Ray", "order": 0},
        {"name": "John", "order": 1},
        {"name": "Alex", "order": 2},
        {"name": None, "order": 3},
        {"name": None, "order": 4},
    ]
    with Resource(table, layout=Layout(skip_rows=["<blank>"])) as resource:
        assert resource.header == ["name", "order"]
        assert resource.read_rows() == expected_rows
Exemplo n.º 10
0
def test_validate_order_fields_issue_313():
    """Validate ``data/issue-313.xlsx`` against a five-column string schema.

    Fields 1-5 are picked, the detector runs with ``schema_sync=True``, and
    the resulting report is expected to be valid.
    """
    column_names = ("Column_1", "Column_2", "Column_3", "Column_4", "Column_5")
    fields = [{"name": name, "type": "string"} for name in column_names]
    # Only the second column carries a "required" constraint.
    fields[1]["constraints"] = {"required": True}
    resource = Resource(
        "data/issue-313.xlsx",
        layout=Layout(pick_fields=[1, 2, 3, 4, 5]),
        schema={"fields": fields},
        detector=Detector(schema_sync=True),
    )
    report = resource.validate()
    assert report.valid
Exemplo n.º 11
0
def test_validate_layout_number():
    """Validate ``data/matrix.csv`` using row 2 as the header row."""
    report = validate("data/matrix.csv", layout=Layout(header_rows=[2]))
    header = report.task.resource.header
    assert header == ["11", "12", "13", "14"]
    assert report.valid
Exemplo n.º 12
0
def test_package_resources_respect_layout_set_after_creation_issue_503():
    """A layout assigned to a package resource after creation is honoured.

    With ``limit_rows=1`` set afterwards, only the first row is expected.
    """
    table = Resource(path="data/table.csv")
    package = Package(resources=[table])
    resource = package.get_resource("table")
    resource.layout = Layout(limit_rows=1)
    rows = resource.read_rows()
    assert rows == [{"id": 1, "name": "english"}]
    assert resource.header == ["id", "name"]
Exemplo n.º 13
0
def test_validate_offset_rows():
    """Validate ``data/matrix.csv`` with ``offset_rows=3``; one row is expected."""
    report = validate("data/matrix.csv", layout=Layout(offset_rows=3))
    task = report.task
    assert task.resource.header == ["f1", "f2", "f3", "f4"]
    assert task.resource.stats["rows"] == 1
    assert task.valid
Exemplo n.º 14
0
def test_validate_layout_structure_errors_with_limit_rows():
    """Validate ``data/structure-errors.csv`` limited to three rows.

    The only expected error is a ``blank-row`` at row position 4.
    """
    report = validate("data/structure-errors.csv", layout=Layout(limit_rows=3))
    errors = report.flatten(["rowPosition", "fieldPosition", "code"])
    assert errors == [[4, None, "blank-row"]]
Exemplo n.º 15
0
def test_validate_layout_limit_and_offset_rows():
    """Validate ``data/matrix.csv`` with ``limit_rows=2`` and ``offset_rows=1``."""
    report = validate(
        "data/matrix.csv",
        layout=Layout(limit_rows=2, offset_rows=1),
    )
    task = report.task
    assert task.resource.header == ["f1", "f2", "f3", "f4"]
    assert task.resource.stats["rows"] == 2
    assert task.valid
Exemplo n.º 16
0
def test_validate_layout_skip_rows_and_fields():
    """Validate ``data/matrix.csv`` while skipping both rows and fields.

    Rows and fields are each skipped using one integer and one string
    selector; fields ``f2``/``f3`` and two rows are expected to remain.
    """
    report = validate(
        "data/matrix.csv",
        layout=Layout(skip_rows=[2, "41"], skip_fields=[1, "f4"]),
    )
    task = report.task
    assert task.resource.header == ["f2", "f3"]
    assert task.resource.stats["rows"] == 2
    assert task.valid
Exemplo n.º 17
0
def test_validate_layout_skip_rows_blank():
    """Validate ``data/blank-rows.csv`` with the ``"<blank>"`` skip preset."""
    report = validate(
        "data/blank-rows.csv",
        layout=Layout(skip_rows=["<blank>"]),
    )
    task = report.task
    assert task.resource.header == ["id", "name", "age"]
    assert task.resource.stats["rows"] == 2
    assert task.valid
Exemplo n.º 18
0
def test_validate_layout_skip_rows_regex():
    """Validate ``data/matrix.csv`` skipping rows via ``<regex>[14]1``."""
    report = validate(
        "data/matrix.csv",
        layout=Layout(skip_rows=["<regex>[14]1"]),
    )
    task = report.task
    assert task.resource.header == ["f1", "f2", "f3", "f4"]
    assert task.resource.stats["rows"] == 2
    assert task.valid
Exemplo n.º 19
0
def test_validate_layout_pick_rows():
    """Validate ``data/matrix.csv`` keeping only rows picked by ``[1, 3, "31"]``."""
    report = validate(
        "data/matrix.csv",
        layout=Layout(pick_rows=[1, 3, "31"]),
    )
    task = report.task
    assert task.resource.header == ["f1", "f2", "f3", "f4"]
    assert task.resource.stats["rows"] == 2
    assert task.valid
Exemplo n.º 20
0
def load_resource_via_api(resource_dict, ckan_api_key, ckan_site_url):
    """Push a tabular resource to the CKAN datastore in fixed-size chunks.

    The file at ``resource_dict['path']`` is read 16384 rows at a time. Each
    chunk is POSTed to ``/api/3/action/datastore_upsert`` with method
    ``insert``, and a status update is sent through ``aircan_status_update``
    after every chunk, at completion, and on failure.

    Args:
        resource_dict: mapping with at least ``'path'`` and
            ``'ckan_resource_id'``.
        ckan_api_key: value sent in the ``Authorization`` header.
        ckan_site_url: base URL the datastore endpoint is joined onto.

    Returns:
        ``{'success': True}`` once a read returns no more records, or
        ``{"success": False}`` if any exception was raised along the way.
    """
    logging.info("Loading resource via API lib")
    try:
        offset_rows = 0
        row_chunk = 16384

        # Keep pushing chunks until a read comes back empty.
        while True:
            # The offset is only passed once at least one chunk was pushed.
            layout_options = {'limit_rows': row_chunk}
            if offset_rows != 0:
                layout_options['offset_rows'] = offset_rows
            layout = Layout(**layout_options)

            with Resource(resource_dict['path'], layout=layout) as resource:
                records = [
                    row.to_dict(json=True) for row in resource.row_stream
                ]

                if not records:
                    # Nothing left to read: report completion and stop.
                    status_dict = {
                        'res_id':
                        resource_dict['ckan_resource_id'],
                        'state':
                        'complete',
                        'message':
                        'Successfully pushed {0} entries to "{1}"'.format(
                            offset_rows, resource_dict['ckan_resource_id'])
                    }
                    aircan_status_update(ckan_site_url, ckan_api_key,
                                         status_dict)
                    return {'success': True}

                offset_rows += len(records)
                payload = {
                    'resource_id': resource_dict['ckan_resource_id'],
                    'force': True,
                    'records': records,
                    'method': 'insert'
                }
                url = urljoin(ckan_site_url,
                              '/api/3/action/datastore_upsert')
                request_headers = {
                    'Content-Type': 'application/json',
                    'Authorization': ckan_api_key
                }
                response = requests.post(
                    url,
                    data=json.dumps(payload, cls=DatastoreEncoder),
                    headers=request_headers)
                response.raise_for_status()
                # Anything other than a plain 200 is treated as a failure.
                if response.status_code != 200:
                    raise requests.HTTPError(
                        'Failed to make request on CKAN API.')

                status_dict = {
                    'res_id':
                    resource_dict['ckan_resource_id'],
                    'state':
                    'complete',
                    'message':
                    'Pushed {0} entries of records.'.format(offset_rows)
                }
                aircan_status_update(ckan_site_url, ckan_api_key,
                                     status_dict)
    except Exception as err:
        # Any failure is reported as an error status rather than raised.
        status_dict = {
            'res_id': resource_dict['ckan_resource_id'],
            'state': 'error',
            'message': 'Failed to push data into datastore DB.',
            'error': str(err)
        }
        aircan_status_update(ckan_site_url, ckan_api_key, status_dict)
        return {"success": False}
Exemplo n.º 21
0
def test_validate_layout_list_of_numbers_and_headers_join():
    """Validate ``data/matrix.csv`` joining header rows 2-4 with ``"."``."""
    report = validate(
        "data/matrix.csv",
        layout=Layout(header_rows=[2, 3, 4], header_join="."),
    )
    expected_header = ["11.21.31", "12.22.32", "13.23.33", "14.24.34"]
    assert report.task.resource.header == expected_header
    assert report.valid
Exemplo n.º 22
0
def test_validate_layout_list_of_numbers():
    """Validate ``data/matrix.csv`` using rows 2-4 as a multi-row header.

    With no ``header_join`` given, the expected labels are space-separated.
    """
    report = validate(
        "data/matrix.csv",
        layout=Layout(header_rows=[2, 3, 4]),
    )
    expected_header = ["11 21 31", "12 22 32", "13 23 33", "14 24 34"]
    assert report.task.resource.header == expected_header
    assert report.valid
Exemplo n.º 23
0
def test_resource_stats_rows_significant():
    """Check the row count reported for ``data/table-1MB.csv``.

    The file is read without a header row and ``stats["rows"]`` is expected
    to be 10000 after all rows have been read.
    """
    layout = Layout(header=False)
    with Resource("data/table-1MB.csv", layout=layout) as resource:
        # Read every row before checking the stats (presumably the counter is
        # only final after a full read). The rows themselves are not needed,
        # so they are not printed to the test output.
        resource.read_rows()
        assert resource.stats["rows"] == 10000
Exemplo n.º 24
0
def test_validate_layout_pick_fields_regex():
    """Validate ``data/matrix.csv`` picking fields via ``<regex>f[23]``."""
    report = validate(
        "data/matrix.csv",
        layout=Layout(pick_fields=["<regex>f[23]"]),
    )
    task = report.task
    assert task.resource.header == ["f2", "f3"]
    assert task.resource.stats["rows"] == 4
    assert task.valid
Exemplo n.º 25
0
def test_validate_layout_offset_fields():
    """Validate ``data/matrix.csv`` with ``offset_fields=3``; only ``f4`` is expected."""
    report = validate("data/matrix.csv", layout=Layout(offset_fields=3))
    task = report.task
    assert task.resource.header == ["f4"]
    assert task.resource.stats["rows"] == 4
    assert task.valid
Exemplo n.º 26
0
def test_resource_layout_respect_set_after_creation_issue_503():
    """A layout assigned to a resource after creation is honoured.

    With ``limit_rows=1`` set afterwards, only the first row is expected.
    """
    resource = Resource(path="data/table.csv")
    resource.layout = Layout(limit_rows=1)
    rows = resource.read_rows()
    assert rows == [{"id": 1, "name": "english"}]
    assert resource.header == ["id", "name"]
Exemplo n.º 27
0
def test_describe_resource_schema_check_type_boolean_string_tie():
    """Describe a headerless two-row inline table with one named field.

    The cells are ``"f"`` and ``"stringish"``; the field named ``"field"`` is
    expected to be detected as ``"string"``.
    """
    resource = describe(
        [["f"], ["stringish"]],
        layout=Layout(header=False),
        detector=Detector(field_names=["field"]),
    )
    field = resource.schema.get_field("field")
    assert field.type == "string"