Пример #1
0
def load_names():
    """Compare pandas and Legate on the table of names used to rename banks.

    The pipe-delimited file at ``col_names_path`` is read once with pandas
    and once with Legate, using the same two column names, the same
    categorical dtypes and skipping the first row in both cases.

    Returns
    -------
    None
        Nothing is returned; the function only asserts that ``equals``
        accepts the Legate result against the pandas result.
    """
    schema = OrderedDict([("seller_name", "category"), ("new", "category")])
    column_names = ["seller_name", "new"]

    # Both readers receive exactly the same keyword arguments.
    reader_options = dict(
        names=column_names,
        delimiter="|",
        dtype=schema,
        skiprows=1,
    )

    out_pd = pd.read_csv(col_names_path, **reader_options)
    out_lp = lp.read_csv(col_names_path, **reader_options)

    assert equals(out_lp, out_pd)
Пример #2
0
def load_acquisition_csv():
    """Compare pandas and Legate on the acquisition data file.

    The pipe-delimited file at ``acq_data_path`` is read with pandas and with
    Legate using the same column names, dtypes and ``index_col=False``.
    ``orig_date`` and ``first_pay_date`` are parsed as dates by both readers;
    only the pandas call is given an explicit ``"%m/%Y"`` parser.

    Returns
    -------
    None
        Nothing is returned; the function only asserts that ``equals``
        accepts the Legate result against the pandas result.
    """
    # Column name -> dtype, listed in the order the columns are named.
    schema = OrderedDict([
        ("loan_id", "int64"),
        ("orig_channel", "category"),
        ("seller_name", "category"),
        ("orig_interest_rate", "float64"),
        ("orig_upb", "int64"),
        ("orig_loan_term", "int64"),
        ("orig_date", "str"),
        ("first_pay_date", "str"),
        ("orig_ltv", "float64"),
        ("orig_cltv", "float64"),
        ("num_borrowers", "float64"),
        ("dti", "float64"),
        ("borrower_credit_score", "float64"),
        ("first_home_buyer", "category"),
        ("loan_purpose", "category"),
        ("property_type", "category"),
        ("num_units", "int64"),
        ("occupancy_status", "category"),
        ("property_state", "category"),
        ("zip", "int64"),
        ("mortgage_insurance_percent", "float64"),
        ("product_type", "category"),
        ("coborrow_credit_score", "float64"),
        ("mortgage_insurance_type", "float64"),
        ("relocation_mortgage_indicator", "category"),
        ("quarter", "int32"),
    ])

    # The column names are exactly the schema keys, in the same order.
    column_names = list(schema)
    date_columns = ["orig_date", "first_pay_date"]

    def parse_month_year(text):
        return datetime.strptime(text, "%m/%Y")

    # Options passed unchanged to both readers.
    shared_options = dict(
        names=column_names,
        delimiter="|",
        index_col=False,
        dtype=schema,
    )

    out_pd = pd.read_csv(
        acq_data_path,
        **shared_options,
        parse_dates=list(date_columns),
        date_parser=parse_month_year,
    )

    # The Legate call gets no ``date_parser`` argument.
    out_lp = lp.read_csv(
        acq_data_path,
        **shared_options,
        parse_dates=list(date_columns),
    )

    assert equals(out_lp, out_pd)
Пример #3
0
# The datetime fixture under its plain, ``.gz`` and ``.bz2`` file names.
paths1 = [
    os.path.join(base, file_name)
    for file_name in (
        "read_csv_datetime.csv",
        "read_csv_datetime.csv.gz",
        "read_csv_datetime.csv.bz2",
    )
]
# The category fixture under the same three file-name variants.
paths2 = [
    os.path.join(base, file_name)
    for file_name in (
        "read_csv_category.csv",
        "read_csv_category.csv.gz",
        "read_csv_category.csv.bz2",
    )
]

names = ["a", "b"]
dtypes = OrderedDict([("a", "int64"), ("b", str)])

# Visit every (paths1, paths2) pairing, with paths2 varying fastest.
for path1 in paths1:
    for path2 in paths2:
        print(f"{path1} {path2}")

        # Reference result: read each file with pandas, then stack the two
        # frames under a fresh index.
        df = pd.concat(
            [
                pd.read_csv(
                    single_path,
                    names=names,
                    dtype=dtypes,
                    index_col=False,
                )
                for single_path in (path1, path2)
            ],
            ignore_index=True,
        )

        # Legate is handed both paths in a single call.
        ldf = lp.read_csv(
            [path1, path2],
            names=names,
            dtype=dtypes,
            index_col=False,
        )

        assert ldf.equals(df)
Пример #4
0
def load_performance_csv():
    """Compare pandas and Legate on the performance data file.

    The pipe-delimited file at ``perf_data_path`` is read with pandas and
    with Legate using the same column names, dtypes and ``index_col=False``.
    Six columns are parsed as dates by both readers; only the pandas call is
    given an explicit parser, which tries ``"%m/%d/%Y"`` and then ``"%m/%Y"``.

    Returns
    -------
    None
        Nothing is returned; the function only asserts that ``equals``
        accepts the Legate result against the pandas result.
    """
    # Column name -> dtype, listed in the order the columns are named.
    schema = OrderedDict([
        ("loan_id", "int64"),
        ("monthly_reporting_period", "str"),
        ("servicer", "category"),
        ("interest_rate", "float64"),
        ("current_actual_upb", "float64"),
        ("loan_age", "float64"),
        ("remaining_months_to_legal_maturity", "float64"),
        ("adj_remaining_months_to_maturity", "float64"),
        ("maturity_date", "str"),
        ("msa", "float64"),
        ("current_loan_delinquency_status", "int32"),
        ("mod_flag", "category"),
        ("zero_balance_code", "category"),
        ("zero_balance_effective_date", "str"),
        ("last_paid_installment_date", "str"),
        ("foreclosed_after", "str"),
        ("disposition_date", "str"),
        ("foreclosure_costs", "float64"),
        ("prop_preservation_and_repair_costs", "float64"),
        ("asset_recovery_costs", "float64"),
        ("misc_holding_expenses", "float64"),
        ("holding_taxes", "float64"),
        ("net_sale_proceeds", "float64"),
        ("credit_enhancement_proceeds", "float64"),
        ("repurchase_make_whole_proceeds", "float64"),
        ("other_foreclosure_proceeds", "float64"),
        ("non_interest_bearing_upb", "float64"),
        ("principal_forgiveness_upb", "float64"),
        ("repurchase_make_whole_proceeds_flag", "category"),
        ("foreclosure_principal_write_off_amount", "float64"),
        ("servicing_activity_indicator", "category"),
    ])

    # The column names are exactly the schema keys, in the same order.
    column_names = list(schema)
    date_columns = [
        "monthly_reporting_period",
        "maturity_date",
        "zero_balance_effective_date",
        "last_paid_installment_date",
        "foreclosed_after",
        "disposition_date",
    ]

    def parse_date(value):
        if isinstance(value, str):
            try:
                return datetime.strptime(value, "%m/%d/%Y")
            except ValueError:
                # Fall back to the month/year form.
                return datetime.strptime(value, "%m/%Y")
        # Anything that is not a string becomes a missing timestamp.
        return pd.NaT

    # Options passed unchanged to both readers.
    shared_options = dict(
        names=column_names,
        delimiter="|",
        index_col=False,
        dtype=schema,
    )

    out_pd = pd.read_csv(
        perf_data_path,
        **shared_options,
        parse_dates=list(date_columns),
        date_parser=parse_date,
    )

    # The Legate call gets no ``date_parser`` argument.
    out_lp = lp.read_csv(
        perf_data_path,
        **shared_options,
        parse_dates=list(date_columns),
    )

    assert equals(out_lp, out_pd)
Пример #5
0
import pandas as pd

from legate import pandas as lp

path = os.path.join(
    os.path.dirname(__file__),
    "files",
    "read_csv_index.csv",
)
names = ["__lvl1__", "__lvl2__", "a", "b"]
dtypes = OrderedDict(
    [
        ("__lvl1__", "int64"),
        ("__lvl2__", "float64"),
        ("a", "int64"),
        ("b", "float64"),
    ]
)

# Index columns chosen by position; pandas is read first, then Legate.
df, ldf = (
    reader.read_csv(path, names=names, dtype=dtypes, index_col=[1, 0])
    for reader in (pd, lp)
)
assert ldf.equals(df)

# Index columns chosen by name.
df, ldf = (
    reader.read_csv(
        path,
        names=names,
        dtype=dtypes,
        index_col=["__lvl1__", "__lvl2__"],
    )
    for reader in (pd, lp)
)
assert ldf.equals(df)

# Fixture path for the statements that follow.
path = os.path.join(
    os.path.dirname(__file__),
    "files",
    "read_csv_index_and_header.csv",
)
Пример #6
0
# limitations under the License.
#

import os
from collections import OrderedDict

import pandas as pd

from legate import pandas as lp

path = os.path.join(
    os.path.dirname(__file__),
    "files",
    "read_csv_datetime.csv",
)
names = ["a", "b"]
dtypes = OrderedDict([("a", "int64"), ("b", "str")])

# Read the same file with identical options: pandas first, then Legate.
df, ldf = (
    reader.read_csv(path, names=names, dtype=dtypes, index_col=False)
    for reader in (pd, lp)
)

assert ldf.equals(df)
Пример #7
0
# limitations under the License.
#

import os
from collections import OrderedDict

import pandas as pd

from legate import pandas as lp

path = os.path.join(
    os.path.dirname(__file__),
    "files",
    "read_csv_header.csv",
)
dtypes = OrderedDict([("a", "int64"), ("b", "float64")])

# No options beyond the path: pandas first, then Legate.
df, ldf = (reader.read_csv(path) for reader in (pd, lp))
assert ldf.equals(df)

# Same file again, this time with explicit dtypes.
df, ldf = (reader.read_csv(path, dtype=dtypes) for reader in (pd, lp))
assert ldf.equals(df)

try:
    path_another = os.path.join(os.path.dirname(__file__), "files",
                                "read_csv_header_another.csv")
    ldf = lp.read_csv([path, path_another], verify_header=True)
    raise ValueError("test failed")
except ValueError:
Пример #8
0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import os
from collections import OrderedDict

import pandas as pd

from legate import pandas as lp

path = os.path.join(
    os.path.dirname(__file__),
    "files",
    "read_csv.csv",
)
names = ["a", "b"]
dtypes = OrderedDict([("a", "int64"), ("b", "float64")])

# Whole file: pandas first, then Legate, with identical options.
df, ldf = (
    reader.read_csv(path, names=names, dtype=dtypes)
    for reader in (pd, lp)
)
assert ldf.equals(df)

# Only the first row.
df, ldf = (
    reader.read_csv(path, names=names, dtype=dtypes, nrows=1)
    for reader in (pd, lp)
)
assert ldf.equals(df)

# Custom spellings for true, false and missing values.
true_values = ["this is true", "this is also True"]
false_values = ["this is false", "this is also FALSE"]
na_values = ["this is null", "this is NA"]

df, ldf = (
    reader.read_csv(
        path,
        names=names,
        dtype=dtypes,
        true_values=true_values,
        false_values=false_values,
        na_values=na_values,
    )
    for reader in (pd, lp)
)
assert ldf.equals(df)

# pandas reference for the same options limited to three rows.
df = pd.read_csv(
    path,
    nrows=3,
    names=names,
    dtype=dtypes,
    true_values=true_values,
    false_values=false_values,
    na_values=na_values,
)
Пример #10
0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import os

import pandas as pd

from legate import pandas as lp

path = os.path.join(
    os.path.dirname(__file__),
    "files",
    "read_csv_datetime.csv",
)

# Date column selected by position; pandas is read first, then Legate.
df, ldf = (reader.read_csv(path, parse_dates=[1]) for reader in (pd, lp))
assert ldf.equals(df)

# Date column selected by name, with explicit column names.
df, ldf = (
    reader.read_csv(path, names=["a", "b"], parse_dates=["b"])
    for reader in (pd, lp)
)
assert ldf.equals(df)
Пример #11
0
# limitations under the License.
#

import os

import pandas as pd

from legate import pandas as lp

names = ["a", "b"]
paths = [
    os.path.join(os.path.dirname(__file__), "files", file_name)
    for file_name in ("read_csv_quotes.csv", "read_csv_colons.csv")
]
# One quote character per fixture, matched to ``paths`` by position.
quotechars = ['"', ":"]

for path, quotechar in zip(paths, quotechars):
    # pandas only supports ``skipfooter`` with its Python engine, so the
    # engine is set explicitly for the reference read.
    df = pd.read_csv(
        path,
        engine="python",
        names=names,
        dtype="string",
        quotechar=quotechar,
        skipfooter=1,
    )
    ldf = lp.read_csv(
        path,
        names=names,
        dtype="string",
        quotechar=quotechar,
        skipfooter=1,
    )
    assert ldf.equals(df)