Exemplo n.º 1
0
def test_parse_url_invalid_req_cols(input_df):
    """Check that ``dns.parse_url`` rejects an unsupported ``req_cols`` entry.

    ``input_df`` (presumably a pytest fixture) supplies the ``url`` column
    that is passed to ``dns.parse_url``. Requesting the unknown column
    ``"test"`` is expected to raise ``ValueError``.
    """
    # The message check is done through ``match`` because any statement
    # placed after the raising call inside the ``with`` body never executes.
    # Only the fixed prefix is matched: how the implementation renders the
    # list of allowed columns is not visible from this test -- TODO confirm
    # the exact text before pinning the full message.
    with pytest.raises(ValueError, match="Given req_cols must be subset of"):
        dns.parse_url(input_df["url"], req_cols={"test"})
Exemplo n.º 2
0
def test_parse_url(input_df):
    """Check the ``domain`` and ``suffix`` columns returned by ``dns.parse_url``.

    The URLs come from ``input_df["url"]`` and only the ``domain`` and
    ``suffix`` columns are requested via ``req_cols``. Each expected column
    is compared against the column of the same name in the result; any
    other columns in the result are not inspected.
    """
    domains = [
        "google",
        "gmail",
        "github",
        "pydata",
        "worldbank",
        "waiterrant",
        "cnn",
        "cnn",
        "cnn",
        "news",
        "news",
        "news",
        "sbcglobal",
        "akamaitechnologies",
    ]
    suffixes = [
        "com",
        "com",
        "com",
        "org",
        "org.kg",
        "blogspot.com",
        "com.ac",
        "ac",
        "com",
        "uk",
        "co.uk",
        "co.uk",
        "net",
        "com",
    ]
    expected = DataFrame({"domain": domains, "suffix": suffixes})

    requested = {"domain", "suffix"}
    actual = dns.parse_url(input_df["url"], req_cols=requested)

    # Compare column by column rather than frame to frame.
    for name in expected.columns:
        wanted = expected[name]
        got = actual[name]
        assert wanted.equals(got)
Exemplo n.º 3
0
def test2_parse_url(input_df):
    """Check the full result of ``dns.parse_url`` when no ``req_cols`` is given.

    The URLs come from ``input_df["url"]``. The whole returned frame is
    compared against an expected frame with the columns ``hostname``,
    ``subdomain``, ``domain`` and ``suffix``, in that order.
    """
    hostnames = [
        "www.google.com",
        "gmail.com",
        "github.com",
        "pandas.pydata.org",
        "www.worldbank.org.kg",
        "waiterrant.blogspot.com",
        "forums.news.cnn.com.ac",
        "forums.news.cnn.ac",
        "b.cnn.com",
        "a.news.uk",
        "a.news.co.uk",
        "a.news.co.uk",
        "107-193-100-2.lightspeed.cicril.sbcglobal.net",
        "a23-44-13-2.deploy.static.akamaitechnologies.com",
    ]
    subdomains = [
        "www",
        "",
        "",
        "pandas",
        "www",
        "",
        "forums.news",
        "forums.news",
        "b",
        "a",
        "a",
        "a",
        "107-193-100-2.lightspeed.cicril",
        "a23-44-13-2.deploy.static",
    ]
    domains = [
        "google",
        "gmail",
        "github",
        "pydata",
        "worldbank",
        "waiterrant",
        "cnn",
        "cnn",
        "cnn",
        "news",
        "news",
        "news",
        "sbcglobal",
        "akamaitechnologies",
    ]
    suffixes = [
        "com",
        "com",
        "com",
        "org",
        "org.kg",
        "blogspot.com",
        "com.ac",
        "ac",
        "com",
        "uk",
        "co.uk",
        "co.uk",
        "net",
        "com",
    ]
    # Key order fixes the column order of the expected frame.
    expected = DataFrame(
        {
            "hostname": hostnames,
            "subdomain": subdomains,
            "domain": domains,
            "suffix": suffixes,
        }
    )

    actual = dns.parse_url(input_df["url"])

    assert expected.equals(actual)