def get_modin():
    # Read only the 'Year' and 'ArrDelay' columns from every parquet file in
    # the data directory and concatenate them into one DataFrame
    # (pdm is presumably modin.pandas imported under an alias).
    files = get_files(get_data_dir())
    dfs = [
        pdm.read_parquet(file_name, columns=['Year', 'ArrDelay'])
        for file_name in files
    ]
    df = pdm.concat(dfs)
    return df
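# get_files() and get_data_dir() are not defined in the snippet above. The
# helpers below are an assumed, minimal version: locate a data directory
# (taken here from a hypothetical DATA_DIR environment variable) and list the
# parquet files inside it. The 'Year'/'ArrDelay' columns suggest the airline
# on-time performance dataset, but that is an inference, not stated in the source.
import os


def get_data_dir():
    return os.environ.get("DATA_DIR", "./data")


def get_files(data_dir):
    return sorted(
        os.path.join(data_dir, name)
        for name in os.listdir(data_dir)
        if name.endswith(".parquet")
    )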
def test_from_parquet():
    setup_parquet_file(SMALL_ROW_SIZE)
    pandas_df = pandas.read_parquet(TEST_PARQUET_FILENAME)
    ray_df = pd.read_parquet(TEST_PARQUET_FILENAME)
    assert ray_df_equals_pandas(ray_df, pandas_df)
    teardown_parquet_file()
def test_from_parquet_with_columns():
    setup_parquet_file(SMALL_ROW_SIZE)
    pandas_df = pandas.read_parquet(TEST_PARQUET_FILENAME, columns=["col1"])
    modin_df = pd.read_parquet(TEST_PARQUET_FILENAME, columns=["col1"])
    assert modin_df_equals_pandas(modin_df, pandas_df)
    teardown_parquet_file()
def main():
    logger.info('Starting...')
    logger.info(
        f'Parquet Stored Size: {PARQUET_FILE_PATH.stat().st_size / 1024 ** 3:.3f} GB'
    )
    df = pd.read_parquet(PARQUET_FILE_PATH)
    logger.info(
        f'In memory Size: {df.memory_usage(deep=True).sum() / 1024 ** 3:.3f} GB'
    )
    logger.info('Finished!')
def load_customer(data_folder):
    data_path = data_folder + "/customer.pq"
    df = pd.read_parquet(data_path)
    return df
def test_from_parquet(make_parquet_file):
    make_parquet_file(SMALL_ROW_SIZE)
    pandas_df = pandas.read_parquet(TEST_PARQUET_FILENAME)
    modin_df = pd.read_parquet(TEST_PARQUET_FILENAME)
    df_equals(modin_df, pandas_df)
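# Several tests in this section receive a `make_parquet_file` pytest fixture
# whose implementation is not shown here. The sketch below is an assumed,
# minimal version: it writes a small DataFrame to TEST_PARQUET_FILENAME and
# removes it afterwards. The column names and the handling of the `directory`
# and `partitioned_columns` arguments are guesses, not the real fixture.
import os
import shutil

import numpy as np
import pandas
import pytest

TEST_PARQUET_FILENAME = "test_parquet.parquet"  # assumed value
SMALL_ROW_SIZE = 64  # assumed value


@pytest.fixture
def make_parquet_file():
    def _make(row_size, directory=False, partitioned_columns=None):
        df = pandas.DataFrame({"col1": range(row_size), "col2": range(row_size)})
        if partitioned_columns:
            # partition_cols makes pandas write a partitioned directory dataset
            df.to_parquet(TEST_PARQUET_FILENAME, partition_cols=partitioned_columns)
        elif directory:
            # emulate a multi-file dataset by writing several files into one directory
            os.makedirs(TEST_PARQUET_FILENAME, exist_ok=True)
            for i, chunk in enumerate(np.array_split(df, 4)):
                chunk.to_parquet(os.path.join(TEST_PARQUET_FILENAME, f"part{i}.parquet"))
        else:
            df.to_parquet(TEST_PARQUET_FILENAME)

    yield _make

    # clean up whatever the test created
    if os.path.isdir(TEST_PARQUET_FILENAME):
        shutil.rmtree(TEST_PARQUET_FILENAME)
    elif os.path.exists(TEST_PARQUET_FILENAME):
        os.remove(TEST_PARQUET_FILENAME)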
def load_part(data_folder):
    data_path = data_folder + "/part.pq"
    df = pd.read_parquet(data_path)
    return df
def load_orders(data_folder):
    data_path = data_folder + "/orders.pq"
    df = pd.read_parquet(data_path)
    return df
def load_supplier(data_folder):
    data_path = data_folder + "/supplier.pq"
    df = pd.read_parquet(data_path)
    return df
def load_lineitem(data_folder):
    data_path = data_folder + "/lineitem.pq"
    df = pd.read_parquet(data_path)
    return df
def test_from_parquet_partitioned_columns_with_columns(make_parquet_file):
    make_parquet_file(SMALL_ROW_SIZE, partitioned_columns=["col1"])
    pandas_df = pandas.read_parquet(TEST_PARQUET_FILENAME, columns=["col1"])
    modin_df = pd.read_parquet(TEST_PARQUET_FILENAME, columns=["col1"])
    df_equals(modin_df, pandas_df)
def load_region(data_folder):
    data_path = data_folder + "/region.pq"
    df = pd.read_parquet(data_path)
    return df
def test_from_parquet_partition(make_parquet_file):
    make_parquet_file(SMALL_ROW_SIZE, directory=True)
    pandas_df = pandas.read_parquet(TEST_PARQUET_FILENAME)
    modin_df = pd.read_parquet(TEST_PARQUET_FILENAME)
    df_equals(modin_df, pandas_df)
def test_from_parquet_hdfs():
    path = "modin/pandas/test/data/hdfs.parquet"
    pandas_df = pandas.read_parquet(path)
    modin_df = pd.read_parquet(path)
    df_equals(modin_df, pandas_df)
def test_from_parquet_pandas_index():
    # Ensure modin can read parquet files written by pandas with a non-RangeIndex object
    pandas_df = pandas.DataFrame(
        {
            "idx": np.random.randint(0, 100_000, size=2000),
            "A": np.random.randint(0, 100_000, size=2000),
            "B": ["a", "b"] * 1000,
            "C": ["c"] * 2000,
        }
    )
    filepath = "tmp.parquet"

    pandas_df.set_index("idx").to_parquet(filepath)
    # read the same parquet using modin.pandas
    df_equals(pd.read_parquet(filepath), pandas.read_parquet(filepath))

    pandas_df.set_index(["idx", "A"]).to_parquet(filepath)
    df_equals(pd.read_parquet(filepath), pandas.read_parquet(filepath))
    os.remove(filepath)


def test_from_parquet_pandas_index_partitioned():
    # Ensure modin can read parquet files written by pandas with a non-RangeIndex object
    pandas_df = pandas.DataFrame(
        {
            "idx": np.random.randint(0, 100_000, size=2000),
            "A": np.random.randint(0, 10, size=2000),
            "B": ["a", "b"] * 1000,
            "C": ["c"] * 2000,
        }
    )
    # The original snippet is cut off here; the rest of this test is
    # reconstructed by analogy with the test above (an assumption): write a
    # dataset partitioned on the low-cardinality "A" column, compare the modin
    # and pandas readers, then remove the directory (requires `shutil`).
    filepath = "tmp_partitioned.parquet"
    pandas_df.set_index("idx").to_parquet(filepath, partition_cols=["A"])
    df_equals(pd.read_parquet(filepath), pandas.read_parquet(filepath))
    shutil.rmtree(filepath)
import dash
import dash_html_components as html
import holoviews as hv
from holoviews.plotting.plotly.dash import to_dash

if __name__ == "__main__":
    from distributed import Client

    client = Client()

    import modin.pandas as pd

    # Load dataset
    df = pd.read_parquet('output_file.parquet')
    dataset = hv.Dataset(df)
    scatter = hv.Scatter(dataset, kdims=["passenger_count"], vdims=["total_amount"])
    # hist = hv.operation.histogram(
    #     dataset, dimension="petal_width", normed=False
    # )

    app = dash.Dash(__name__)
    components = to_dash(app, [
        scatter,
    ])
    app.layout = html.Div(components.children)
    # Without this the script only builds the layout and never serves it.
    app.run_server(debug=True)
# Optional arguments
parser.add_argument(
    "--train-test-split",
    action='store_true',
    help='Perform a train/test split before saving the results')
parser.add_argument(
    "--test-size",
    type=float,
    help='Size of the test split if requested, defaults to 0.3',
    default=0.3)
args = parser.parse_args()

d2v_model = Doc2Vec.load(args.d2v_model)
documents = pd.read_parquet(args.documents_path, engine="pyarrow")
logging.info(f"Processing {documents.shape[0]} documents")

# Parsing categories
categories = documents.categories.apply(
    lambda cat: cat.split(";")).values.tolist()
categories_encoder = MultiLabelBinarizer()
categories_encoder.fit(categories)

# Transforming documents into embeddings
logging.info("Transforming into embeddings")
tokenizer = RegexpTokenizer(r'\w+')
y = categories_encoder.transform(categories)
X = documents.apply(lambda row: d2v_model.infer_vector(
    [word.lower() for word in tokenizer.tokenize(row['document'])]),
    axis=1)._to_pandas()
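# A possible continuation (assumption, not the original script): the
# --train-test-split and --test-size arguments parsed above suggest the
# embeddings are split with scikit-learn before the results are saved.
from sklearn.model_selection import train_test_split

if args.train_test_split:
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=args.test_size
    )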