Example #1
def get_modin():
    files = get_files(get_data_dir())
    dfs = [
        pdm.read_parquet(file_name, columns=['Year', 'ArrDelay'])
        for file_name in files
    ]
    df = pdm.concat(dfs)
    return df
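Example #1 leans on an import alias and two helpers that aren't shown. A minimal sketch of the assumed pieces (get_data_dir, get_files, and the pdm alias are inferred, not confirmed by the source):

import os

import modin.pandas as pdm  # assumption: pdm looks like an alias for modin.pandas


def get_data_dir():
    # hypothetical: directory holding the parquet files to read
    return os.environ.get("DATA_DIR", "./data")


def get_files(data_dir):
    # hypothetical: enumerate the parquet files to concatenate
    return [
        os.path.join(data_dir, name)
        for name in sorted(os.listdir(data_dir))
        if name.endswith(".parquet")
    ]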
Example #2
def test_from_parquet():
    setup_parquet_file(SMALL_ROW_SIZE)

    pandas_df = pandas.read_parquet(TEST_PARQUET_FILENAME)
    ray_df = pd.read_parquet(TEST_PARQUET_FILENAME)
    assert ray_df_equals_pandas(ray_df, pandas_df)

    teardown_parquet_file()
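Examples #2 and #3 depend on test helpers that aren't shown (modin_df_equals_pandas in Example #3 would be the analogous comparison). A sketch of plausible implementations, with TEST_PARQUET_FILENAME and SMALL_ROW_SIZE as assumed constants:

import os

import pandas

TEST_PARQUET_FILENAME = "test_parquet.parquet"  # assumed constant
SMALL_ROW_SIZE = 64  # assumed constant


def setup_parquet_file(row_size):
    # write a small frame with a known column for the tests to read back
    pandas.DataFrame({"col1": range(row_size)}).to_parquet(TEST_PARQUET_FILENAME)


def teardown_parquet_file():
    if os.path.exists(TEST_PARQUET_FILENAME):
        os.remove(TEST_PARQUET_FILENAME)


def ray_df_equals_pandas(ray_df, pandas_df):
    # materialize the modin frame and compare; _to_pandas is modin's converter
    return ray_df._to_pandas().equals(pandas_df)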
Example #3
def test_from_parquet_with_columns():
    setup_parquet_file(SMALL_ROW_SIZE)

    pandas_df = pandas.read_parquet(TEST_PARQUET_FILENAME, columns=["col1"])
    modin_df = pd.read_parquet(TEST_PARQUET_FILENAME, columns=["col1"])
    assert modin_df_equals_pandas(modin_df, pandas_df)

    teardown_parquet_file()
Example #4
def main():
    logger.info('Starting...')

    logger.info(
        f'Parquet Stored Size: {PARQUET_FILE_PATH.stat().st_size / 1024 ** 3:.3f} GB'
    )

    df = pd.read_parquet(PARQUET_FILE_PATH)
    logger.info(
        f'In memory Size: {df.memory_usage(deep=True).sum() / 1024 ** 3:.3f} GB'
    )

    logger.info('Finished!')
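Example #4 references a logger, a pathlib-style PARQUET_FILE_PATH, and pd without showing them; a sketch of the assumed preamble (the path is hypothetical, and pd may be either pandas or modin.pandas):

import logging
from pathlib import Path

import pandas as pd  # assumption; the snippet reads the same with modin.pandas

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

PARQUET_FILE_PATH = Path("data/input.parquet")  # hypothetical path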
Example #5
def load_customer(data_folder):
    data_path = data_folder + "/customer.pq"
    df = pd.read_parquet(
        data_path,
    )
    return df
Example #6
def test_from_parquet(make_parquet_file):
    make_parquet_file(SMALL_ROW_SIZE)

    pandas_df = pandas.read_parquet(TEST_PARQUET_FILENAME)
    modin_df = pd.read_parquet(TEST_PARQUET_FILENAME)
    df_equals(modin_df, pandas_df)
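Examples #6, #11, and #14 receive a make_parquet_file pytest fixture. A sketch consistent with the keyword arguments those tests pass (directory and partitioned_columns), reusing TEST_PARQUET_FILENAME from the sketch after Example #2; the real fixture in modin's test suite may differ:

import os
import shutil

import pandas
import pytest


@pytest.fixture
def make_parquet_file():
    created = []

    def _make(row_size, directory=False, partitioned_columns=None):
        df = pandas.DataFrame({"col1": range(row_size), "col2": range(row_size)})
        if partitioned_columns:
            # hive-partitioned dataset: a directory keyed by the given columns
            df.to_parquet(TEST_PARQUET_FILENAME, partition_cols=partitioned_columns)
        elif directory:
            # a directory holding a single parquet file
            os.makedirs(TEST_PARQUET_FILENAME, exist_ok=True)
            df.to_parquet(os.path.join(TEST_PARQUET_FILENAME, "part-0.parquet"))
        else:
            df.to_parquet(TEST_PARQUET_FILENAME)
        created.append(TEST_PARQUET_FILENAME)

    yield _make
    # remove files or partition directories created during the test
    for path in created:
        if os.path.isdir(path):
            shutil.rmtree(path, ignore_errors=True)
        elif os.path.exists(path):
            os.remove(path)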
Example #7
def load_part(data_folder):
    data_path = data_folder + "/part.pq"
    df = pd.read_parquet(
        data_path,
    )
    return df
Example #8
def load_orders(data_folder):
    data_path = data_folder + "/orders.pq"
    df = pd.read_parquet(
        data_path,
    )
    return df
Example #9
def load_supplier(data_folder):
    data_path = data_folder + "/supplier.pq"
    df = pd.read_parquet(
        data_path,
    )
    return df
Example #10
def load_lineitem(data_folder):
    data_path = data_folder + "/lineitem.pq"
    df = pd.read_parquet(
        data_path,
    )
    return df
Example #11
def test_from_parquet_partitioned_columns_with_columns(make_parquet_file):
    make_parquet_file(SMALL_ROW_SIZE, partitioned_columns=["col1"])

    pandas_df = pandas.read_parquet(TEST_PARQUET_FILENAME, columns=["col1"])
    modin_df = pd.read_parquet(TEST_PARQUET_FILENAME, columns=["col1"])
    df_equals(modin_df, pandas_df)
Example #12
def load_region(data_folder):
    data_path = data_folder + "/region.pq"
    df = pd.read_parquet(
        data_path,
    )
    return df
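The load_* helpers in Examples #5, #7-#10, and #12 differ only in the table name, so they collapse naturally into one parameterized function; a sketch using the same pd.read_parquet call (load_table is a hypothetical name):

def load_table(data_folder, table_name):
    # generalization of the load_customer/load_part/... helpers above
    return pd.read_parquet(data_folder + "/" + table_name + ".pq")

# usage: load_table(data_folder, "region") replaces load_region(data_folder)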
Example #13
    df_equals(modin_df, pandas_df)


def test_from_parquet_pandas_index():
    # Ensure modin can read parquet files written by pandas with a non-RangeIndex object
    pandas_df = pandas.DataFrame({
        "idx": np.random.randint(0, 100_000, size=2000),
        "A": np.random.randint(0, 100_000, size=2000),
        "B": ["a", "b"] * 1000,
        "C": ["c"] * 2000,
    })
    pandas_df.set_index("idx").to_parquet("tmp.parquet")
    # read the same parquet using modin.pandas
    df_equals(pd.read_parquet("tmp.parquet"),
              pandas.read_parquet("tmp.parquet"))

    pandas_df.set_index(["idx", "A"]).to_parquet("tmp.parquet")
    df_equals(pd.read_parquet("tmp.parquet"),
              pandas.read_parquet("tmp.parquet"))


def test_from_parquet_hdfs():
    path = "modin/pandas/test/data/hdfs.parquet"
    pandas_df = pandas.read_parquet(path)
    modin_df = pd.read_parquet(path)
    df_equals(modin_df, pandas_df)


def test_from_json():
Example #14
def test_from_parquet_partition(make_parquet_file):
    make_parquet_file(SMALL_ROW_SIZE, directory=True)

    pandas_df = pandas.read_parquet(TEST_PARQUET_FILENAME)
    modin_df = pd.read_parquet(TEST_PARQUET_FILENAME)
    df_equals(modin_df, pandas_df)
Example #15
def test_from_parquet_hdfs():
    path = "modin/pandas/test/data/hdfs.parquet"
    pandas_df = pandas.read_parquet(path)
    modin_df = pd.read_parquet(path)
    df_equals(modin_df, pandas_df)
Example #16

def test_from_parquet_pandas_index():
    # Ensure modin can read parquet files written by pandas with a non-RangeIndex object
    pandas_df = pandas.DataFrame({
        "idx": np.random.randint(0, 100_000, size=2000),
        "A": np.random.randint(0, 100_000, size=2000),
        "B": ["a", "b"] * 1000,
        "C": ["c"] * 2000,
    })
    filepath = "tmp.parquet"
    pandas_df.set_index("idx").to_parquet(filepath)
    # read the same parquet using modin.pandas
    df_equals(pd.read_parquet(filepath), pandas.read_parquet(filepath))

    pandas_df.set_index(["idx", "A"]).to_parquet(filepath)
    df_equals(pd.read_parquet(filepath), pandas.read_parquet(filepath))
    os.remove(filepath)


def test_from_parquet_pandas_index_partitioned():
    # Ensure modin can read parquet files written by pandas with a non-RangeIndex object
    pandas_df = pandas.DataFrame({
        "idx": np.random.randint(0, 100_000, size=2000),
        "A": np.random.randint(0, 10, size=2000),
        "B": ["a", "b"] * 1000,
        "C": ["c"] * 2000,
Example #17
    df_equals(modin_df, pandas_df)


def test_from_parquet_pandas_index():
    # Ensure modin can read parquet files written by pandas with a non-RangeIndex object
    pandas_df = pandas.DataFrame(
        {
            "idx": np.random.randint(0, 100_000, size=2000),
            "A": np.random.randint(0, 100_000, size=2000),
            "B": ["a", "b"] * 1000,
            "C": ["c"] * 2000,
        }
    )
    pandas_df.set_index("idx").to_parquet("tmp.parquet")
    # read the same parquet using modin.pandas
    df_equals(pd.read_parquet("tmp.parquet"), pandas.read_parquet("tmp.parquet"))

    pandas_df.set_index(["idx", "A"]).to_parquet("tmp.parquet")
    df_equals(pd.read_parquet("tmp.parquet"), pandas.read_parquet("tmp.parquet"))


def test_from_parquet_hdfs():
    path = "modin/pandas/test/data/hdfs.parquet"
    pandas_df = pandas.read_parquet(path)
    modin_df = pd.read_parquet(path)
    df_equals(modin_df, pandas_df)


def test_from_json():
    setup_json_file(SMALL_ROW_SIZE)
Example #18
import dash
import dash_html_components as html

import holoviews as hv
from holoviews.plotting.plotly.dash import to_dash

if __name__ == "__main__":
    from distributed import Client
    client = Client()

    import modin.pandas as pd

    df = pd.read_parquet('output_file.parquet')

    # Load dataset

    dataset = hv.Dataset(df)

    scatter = hv.Scatter(dataset,
                         kdims=["passenger_count"],
                         vdims=["total_amount"])
    # hist = hv.operation.histogram(
    #     dataset, dimension="petal_width", normed=False
    # )

    app = dash.Dash(__name__)
    components = to_dash(app, [
        scatter,
    ])

    app.layout = html.Div(components.children)
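The script above assembles the layout but never starts the server; with the Dash 1.x API it imports (dash_html_components), the __main__ block would typically end with:

    app.run_server(debug=True)  # not in the original snippet; assumed entry point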
Example #19
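Example #19 begins mid-script: parser, logging, and the model/tokenizer imports are assumed. A sketch of the implied preamble (argument names inferred from the args.* accesses; pd is presumably modin.pandas, given the _to_pandas() call at the end):

import argparse
import logging

import modin.pandas as pd  # inferred from the _to_pandas() call below
from gensim.models.doc2vec import Doc2Vec
from nltk.tokenize import RegexpTokenizer
from sklearn.preprocessing import MultiLabelBinarizer

logging.basicConfig(level=logging.INFO)

parser = argparse.ArgumentParser()
parser.add_argument("d2v_model", help="Path to a trained Doc2Vec model")
parser.add_argument("documents_path", help="Parquet file of documents to embed")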
# Optional arguments
parser.add_argument(
    "--train-test-split",
    action='store_true',
    help='Perform a train test split before saving the results')
parser.add_argument(
    "--test-size",
    type=float,
    help='Size of the test split if requested; defaults to 0.3',
    default=0.3)

args = parser.parse_args()

d2v_model = Doc2Vec.load(args.d2v_model)

documents = pd.read_parquet(args.documents_path, engine="pyarrow")
logging.info(f"Processing {documents.shape[0]} documents")

# Parsing categories
categories = documents.categories.apply(
    lambda cat: cat.split(";")).values.tolist()
categories_encoder = MultiLabelBinarizer()
categories_encoder.fit(categories)

# Transforming into embeddings
logging.info("Transforming into embeddings")
tokenizer = RegexpTokenizer(r'\w+')
y = categories_encoder.transform(categories)
X = documents.apply(
    lambda row: d2v_model.infer_vector(
        [word.lower() for word in tokenizer.tokenize(row['document'])]
    ),
    axis=1,
)._to_pandas()