def test_parse_dates_read_csv():
    pandas_df = pandas.read_csv("modin/pandas/test/data/test_time_parsing.csv")
    modin_df = pd.read_csv("modin/pandas/test/data/test_time_parsing.csv")
    df_equals(modin_df, pandas_df)

    pandas_df = pandas.read_csv(
        "modin/pandas/test/data/test_time_parsing.csv",
        names=[
            "timestamp",
            "symbol",
            "high",
            "low",
            "open",
            "close",
            "spread",
            "volume",
        ],
        header=0,
        index_col=0,
        encoding="utf-8",
    )
    modin_df = pd.read_csv(
        "modin/pandas/test/data/test_time_parsing.csv",
        names=[
            "timestamp",
            "symbol",
            "high",
            "low",
            "open",
            "close",
            "spread",
            "volume",
        ],
        header=0,
        index_col=0,
        encoding="utf-8",
    )
    df_equals(modin_df, pandas_df)

    pandas_df = pandas.read_csv(
        "modin/pandas/test/data/test_time_parsing.csv",
        names=[
            "timestamp",
            "symbol",
            "high",
            "low",
            "open",
            "close",
            "spread",
            "volume",
        ],
        header=0,
        index_col=0,
        parse_dates=["timestamp"],
        encoding="utf-8",
    )
    modin_df = pd.read_csv(
        "modin/pandas/test/data/test_time_parsing.csv",
        names=[
            "timestamp",
            "symbol",
            "high",
            "low",
            "open",
            "close",
            "spread",
            "volume",
        ],
        header=0,
        index_col=0,
        parse_dates=["timestamp"],
        encoding="utf-8",
    )
    df_equals(modin_df, pandas_df)

    pandas_df = pandas.read_csv(
        "modin/pandas/test/data/test_time_parsing.csv",
        names=[
            "timestamp",
            "symbol",
            "high",
            "low",
            "open",
            "close",
            "spread",
            "volume",
        ],
        header=0,
        index_col=2,
        parse_dates=["timestamp"],
        encoding="utf-8",
    )
    modin_df = pd.read_csv(
        "modin/pandas/test/data/test_time_parsing.csv",
        names=[
            "timestamp",
            "symbol",
            "high",
            "low",
            "open",
            "close",
            "spread",
            "volume",
        ],
        header=0,
        index_col=2,
        parse_dates=["timestamp"],
        encoding="utf-8",
    )
    df_equals(modin_df, pandas_df)
def test_from_csv_with_usecols(usecols):
    fname = "modin/pandas/test/data/test_usecols.csv"
    pandas_df = pandas.read_csv(fname, usecols=usecols)
    modin_df = pd.read_csv(fname, usecols=usecols)
    df_equals(modin_df, pandas_df)
# +
LABEL_COLUMN = "label"
if smoke_test:
    # Test dataset with only 10,000 records.
    FILE_URL = ("https://ray-ci-higgs.s3.us-west-2.amazonaws.com/simpleHIGGS"
                ".csv")
else:
    # Full dataset. This may take a couple of minutes to load.
    FILE_URL = ("https://archive.ics.uci.edu/ml/machine-learning-databases"
                "/00280/HIGGS.csv.gz")

colnames = [LABEL_COLUMN] + ["feature-%02d" % i for i in range(1, 29)]

# +
load_data_start_time = time.time()
df = pd.read_csv(FILE_URL, names=colnames)
load_data_end_time = time.time()
load_data_duration = load_data_end_time - load_data_start_time
print(f"Dataset loaded in {load_data_duration} seconds.")
# -

# Split data into training and validation.
df_train, df_validation = train_test_split(df)
print(df_train, df_validation)

# ## Distributed Training
#
# The ``train_xgboost`` function contains all the logic necessary for
# training using XGBoost-Ray.
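# A minimal sketch of what such a ``train_xgboost`` function might look like,
# assuming the standard ``xgboost_ray`` API (``RayDMatrix``, ``RayParams``,
# ``train``); the parameter names below are illustrative, not the original code.
from xgboost_ray import RayDMatrix, RayParams, train


def train_xgboost(config, train_df, test_df, target_column, ray_params):
    # Wrap the (Modin) dataframes so XGBoost-Ray can shard them across actors.
    train_set = RayDMatrix(train_df, target_column)
    test_set = RayDMatrix(test_df, target_column)
    evals_result = {}
    # Run distributed training; ``ray_params`` (a RayParams instance) controls
    # the number of actors and CPUs per actor.
    bst = train(
        params=config,
        dtrain=train_set,
        evals=[(test_set, "eval")],
        evals_result=evals_result,
        ray_params=ray_params,
        num_boost_round=100,
    )
    return bst, evals_result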
def test_from_csv_index_col(make_csv_file):
    make_csv_file()
    pandas_df = pandas.read_csv(TEST_CSV_FILENAME, index_col="col1")
    modin_df = pd.read_csv(TEST_CSV_FILENAME, index_col="col1")
    df_equals(modin_df, pandas_df)
from utils import time_logger

parser = argparse.ArgumentParser(description='groupby benchmark')
parser.add_argument('--path', dest='path', help='path to the csv data file')
parser.add_argument('--logfile', dest='logfile', help='path to the log file')
args = parser.parse_args()
file = args.path
file_size = os.path.getsize(file)

if not os.path.exists(os.path.split(args.logfile)[0]):
    os.makedirs(os.path.split(args.logfile)[0])

logging.basicConfig(filename=args.logfile, level=logging.INFO)

df = pd.read_csv(file)
blocks = df._block_partitions.flatten().tolist()
ray.wait(blocks, len(blocks))

with time_logger(
        "Groupby + sum aggregation on axis=0: {}; Size: {} bytes".format(
            file, file_size)):
    df_groupby = df.groupby('1')
    blocks = df_groupby.sum()._block_partitions.flatten().tolist()
    ray.wait(blocks, len(blocks))

with time_logger("Groupby mean on axis=0: {}; Size: {} bytes".format(
        file, file_size)):
    blocks = df_groupby.mean()._block_partitions.flatten().tolist()
    ray.wait(blocks, len(blocks))
("foreclosed_after", "datetime64"), ("disposition_date", "datetime64"), ("foreclosure_costs", "float64"), ("prop_preservation_and_repair_costs", "float64"), ("asset_recovery_costs", "float64"), ("misc_holding_expenses", "float64"), ("holding_taxes", "float64"), ("net_sale_proceeds", "float64"), ("credit_enhancement_proceeds", "float64"), ("repurchase_make_whole_proceeds", "float64"), ("other_foreclosure_proceeds", "float64"), ("non_interest_bearing_upb", "float64"), ("principal_forgiveness_upb", "float64"), ("repurchase_make_whole_proceeds_flag", "category"), ("foreclosure_principal_write_off_amount", "float64"), ("servicing_activity_indicator", "category"), ]) all_but_dates = { col: valtype for (col, valtype) in dtypes.items() if valtype != "datetime64" } dates_only = [ col for (col, valtype) in dtypes.items() if valtype == "datetime64" ] df = pd.read_csv("perf.txt", delimiter="|", names=cols, dtype=all_but_dates, parse_dates=dates_only) print(df["servicer"])
df.shape

"""Format: feather

It is common to store data in the feather (binary) format specifically for pandas.
It significantly improves the reading speed of datasets.

## Modin

A library that helps pandas read big files and boosts its speed. All syntax is the same as in pandas.
"""

!pip install modin[dask]

import modin.pandas as mpd

# Hardware Accelerator: GPU

start = time.time()
df = mpd.read_csv(link)
end = time.time()
print(end - start)

df.head()

!git clone https://github.com/rapidsai/rapidsai-csp-utils.git
!bash rapidsai-csp-utils/colab/rapids-colab.sh

import sys, os

dist_package_index = sys.path.index('/usr/local/lib/python3.6/dist-packages')
sys.path = sys.path[:dist_package_index] + ['/usr/local/lib/python3.6/site-packages'] + sys.path[dist_package_index:]
sys.path
exec(open('rapidsai-csp-utils/colab/update_modules.py').read(), globals())
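# A minimal sketch of the feather round-trip described above, assuming a
# dataframe ``df`` is already loaded; the file name "dataset.feather" is
# hypothetical and timings will vary by dataset.
import time

import modin.pandas as mpd

# Write once in the binary feather format.
df.to_feather("dataset.feather")

# Reading feather back is typically much faster than re-parsing a CSV.
start = time.time()
df_feather = mpd.read_feather("dataset.feather")
print(time.time() - start)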
if __name__ == "__main__":
    # import pandas as pd
    import modin.pandas as pd

    df1 = pd.read_csv("categories.json",
                      dtype={
                          "one": "int64",
                          "two": "category"
                      })
    df = pd.read_csv("categories.csv",
                     names=["one", "two"],
                     dtype={
                         "one": "int64",
                         "two": "category"
                     })
    print(df.dtypes.describe())
    print("type(df.dtypes[1]) = ", type(df.dtypes[1]))
    print("df.dtypes[1] = ", df.dtypes[1])
    print("type(df.dtypes[1].categories) = ", type(df.dtypes[1].categories))
    print("df.dtypes.categories = ", df.dtypes[1].categories)

    s = df["two"]
    # print("s.describe = ", s.describe())
    print("type(s.dtypes.categories) = ", type(s.dtypes.categories))
    print("s.dtypes.categories = ", s.dtypes.categories)
    print("type(s.dtypes) = ", type(s.dtypes))
    print("s.dtypes = ", s.dtypes)
    print(s)
# Set Ray as the compute engine before importing Modin
os.environ["MODIN_ENGINE"] = "ray"
import modin.pandas as pd

# Load daal accelerated sklearn patch and import packages from the patch
import daal4py.sklearn

daal4py.sklearn.patch_sklearn()

from sklearn.model_selection import train_test_split
import sklearn.linear_model as lm

# Read the data from the downloaded archive file
df = pd.read_csv('ipums_education2income_1970-2010.csv.gz', compression="gzip")

# ETL

# clean up unneeded features
keep_cols = [
    "YEAR",
    "DATANUM",
    "SERIAL",
    "CBSERIAL",
    "HHWT",
    "CPI99",
    "GQ",
    "PERNUM",
    "SEX",
    "AGE",
    "INCTOT",
import modin.pandas as pd
import numpy as np
from fastai.tabular import *
from utils import isVaildDate, purge_pat_files
from tqdm import tqdm, trange

root = Path('../test_A')
u_data = root / 'user'
u_data.ls()
log_data = root / 'imps_log'
log_data.ls()

ad_static = pd.read_csv('../data/ad_static.csv', low_memory=False, encoding='utf-8')

c_sz = 102400
col_names1 = ['广告请求id', '广告请求时间', '广告位id', '用户id', '曝光广告id', '广告请求时间_date']


def save_csv(row):
    date = str(row['广告请求时间_date'])
    pd.DataFrame(row).T.to_csv(f'../data/{date}_log.csv',
                               mode='a',
                               index=None,
                               encoding='utf-8',
                               header=False)


def invalid_date(df_row, field='广告请求时间'):
    """Decide whether the current row should be dropped; convert the field to a datetime first, then apply this check."""
    if not isVaildDate(str(df_row[field])):
        df_row[field] = np.nan
def test___repr__():
    frame_data = random_state.randint(RAND_LOW, RAND_HIGH, size=(1000, 100))
    pandas_df = pandas.DataFrame(frame_data)
    modin_df = pd.DataFrame(frame_data)
    assert repr(pandas_df) == repr(modin_df)

    frame_data = random_state.randint(RAND_LOW, RAND_HIGH, size=(1000, 99))
    pandas_df = pandas.DataFrame(frame_data)
    modin_df = pd.DataFrame(frame_data)
    assert repr(pandas_df) == repr(modin_df)

    frame_data = random_state.randint(RAND_LOW, RAND_HIGH, size=(1000, 101))
    pandas_df = pandas.DataFrame(frame_data)
    modin_df = pd.DataFrame(frame_data)
    assert repr(pandas_df) == repr(modin_df)

    frame_data = random_state.randint(RAND_LOW, RAND_HIGH, size=(1000, 102))
    pandas_df = pandas.DataFrame(frame_data)
    modin_df = pd.DataFrame(frame_data)
    assert repr(pandas_df) == repr(modin_df)

    # The __repr__ method has a different code path depending on whether the
    # number of rows is > 60, and another depending on whether the number of
    # columns is > 20. The cases above already cover cols > 20 and rows > 60;
    # the cases that follow exercise the other three combinations.

    # rows <= 60, cols > 20
    frame_data = random_state.randint(RAND_LOW, RAND_HIGH, size=(10, 100))
    pandas_df = pandas.DataFrame(frame_data)
    modin_df = pd.DataFrame(frame_data)
    assert repr(pandas_df) == repr(modin_df)

    # rows <= 60, cols <= 20
    frame_data = random_state.randint(RAND_LOW, RAND_HIGH, size=(10, 10))
    pandas_df = pandas.DataFrame(frame_data)
    modin_df = pd.DataFrame(frame_data)
    assert repr(pandas_df) == repr(modin_df)

    # rows > 60, cols <= 20
    frame_data = random_state.randint(RAND_LOW, RAND_HIGH, size=(100, 10))
    pandas_df = pandas.DataFrame(frame_data)
    modin_df = pd.DataFrame(frame_data)
    assert repr(pandas_df) == repr(modin_df)

    # Empty
    pandas_df = pandas.DataFrame(columns=["col{}".format(i) for i in range(100)])
    modin_df = pd.DataFrame(columns=["col{}".format(i) for i in range(100)])
    assert repr(pandas_df) == repr(modin_df)

    # From Issue #1705
    string_data = """"time","device_id","lat","lng","accuracy","activity_1","activity_1_conf","activity_2","activity_2_conf","activity_3","activity_3_conf"
"2016-08-26 09:00:00.206",2,60.186805,24.821049,33.6080017089844,"STILL",75,"IN_VEHICLE",5,"ON_BICYCLE",5
"2016-08-26 09:00:05.428",5,60.192928,24.767222,5,"WALKING",62,"ON_BICYCLE",29,"RUNNING",6
"2016-08-26 09:00:05.818",1,60.166382,24.700443,3,"WALKING",75,"IN_VEHICLE",5,"ON_BICYCLE",5
"2016-08-26 09:00:15.816",1,60.166254,24.700671,3,"WALKING",75,"IN_VEHICLE",5,"ON_BICYCLE",5
"2016-08-26 09:00:16.413",5,60.193055,24.767427,5,"WALKING",85,"ON_BICYCLE",15,"UNKNOWN",0
"2016-08-26 09:00:20.578",3,60.152996,24.745216,3.90000009536743,"STILL",69,"IN_VEHICLE",31,"UNKNOWN",0"""
    pandas_df = pandas.read_csv(io.StringIO(string_data))
    modin_df = pd.read_csv(io.StringIO(string_data))
    assert repr(pandas_df) == repr(modin_df)
# import pandas as pd
import ray

ray.init(huge_pages=True, plasma_directory="/mnt/hugepages")
import modin.pandas as pd

dtypes = {
    'object_id': 'int32',
    'mjd': 'float32',
    'passband': 'int32',
    'flux': 'float32',
    'flux_err': 'float32',
    'detected': 'int32'
}

PATH = '/localdisk/benchmark_datasets/plasticc'
GPU_MEMORY = 16
TEST_ROWS = 453653104
OVERHEAD = 1.2
SKIP_ROWS = int((1 - GPU_MEMORY / (32.0 * OVERHEAD)) * TEST_ROWS)

test = pd.read_csv(
    '%s/test_set.csv' % PATH,
    # skiprows=range(1, 1+SKIP_ROWS),
    dtype=dtypes)
help="path to the right csv data " "file") parser.add_argument("--logfile", dest="logfile", help="path to the log file") args = parser.parse_args() file_left = args.left file_size_left = os.path.getsize(file_left) file_right = args.right file_size_right = os.path.getsize(file_right) if not os.path.exists(os.path.split(args.logfile)[0]): os.makedirs(os.path.split(args.logfile)[0]) logging.basicConfig(filename=args.logfile, level=logging.INFO) df_left = pd.read_csv(file_left) df_right = pd.read_csv(file_right) blocks = df_left._block_partitions.flatten().tolist() ray.wait(blocks, len(blocks)) blocks = df_right._block_partitions.flatten().tolist() ray.wait(blocks, len(blocks)) with time_logger("Inner Join: {} & {}; Left Size: {} bytes; Right Size: {} " "bytes".format(file_left, file_right, file_size_left, file_size_right)): result = df_left.join(df_right, how="inner", lsuffix="left_") ray.wait(result._block_partitions.flatten().tolist()) with time_logger("Outer Join: {} & {}; Left Size: {} bytes; Right Size: {} " "bytes".format(file_left, file_right, file_size_left,
from domainapi import domain
from functions import processing

warnings.filterwarnings("ignore")

version = datetime.now()
version = str(version).replace('-', '').replace(' ', '')[:10]

# variables
subs_geelong = ['Belmont', 'Grovedale']
subs_melb = ['Yarraville']
get_data = False
compute_features = False
static_pkl = 'data/2019120723_data.pkl'
feature_ranking = pd.read_csv('input/feature_ranking.csv')

if get_data:
    # function signature:
    # format(propertyTypes, minBedrooms, minBathrooms, minCarspaces, minPrice,
    #        maxPrice, minLandArea, state, region, area, suburb,
    #        includeSurroundingSuburbs)
    # don't use zeros here yet; needs to be fixed
    df = domain().listing_results(["House"], 2, 1, 1, 500000, 550000, 10,
                                  "VIC", "", "", "", False)
    print("Main frame shape: ", df.shape)
    df.to_csv('data/{version}_data.csv'.format(version=version),
              sep='\t', encoding='utf-8')
    df.to_pickle('data/{version}_data.pkl'.format(version=version))

    # dup_df_2 = df[df['id'].duplicated() == True]
    # dup_df_2 = dup_df_2.sort_values(by=['id'])
    # r, c = dup_df_2.shape
    # if r > 0:
    #     print("duplicates in df")
#!/usr/bin/env python
import matplotlib

matplotlib.use("PS")

import modin.pandas as pd
import warnings

# current version of seaborn generates a bunch of warnings that we'll ignore
warnings.filterwarnings("ignore")
import seaborn as sns
import matplotlib.pyplot as plt

sns.set(style="white", color_codes=True)

iris = pd.read_csv("Iris.csv")  # the iris dataset is now a Pandas DataFrame
iris.head()
iris["Species"].value_counts()

iris.plot(kind="scatter", x="SepalLengthCm", y="SepalWidthCm")
sns.jointplot(x="SepalLengthCm", y="SepalWidthCm", data=iris, size=5)
sns.FacetGrid(iris, hue="Species", size=5).map(
    plt.scatter, "SepalLengthCm", "SepalWidthCm"
).add_legend()
sns.boxplot(x="Species", y="PetalLengthCm", data=iris)
ax = sns.boxplot(x="Species", y="PetalLengthCm", data=iris)
ax = sns.stripplot(
    x="Species", y="PetalLengthCm", data=iris, jitter=True, edgecolor="gray"
)
sns.violinplot(x="Species", y="PetalLengthCm", data=iris, size=6)
sns.FacetGrid(iris, hue="Species", size=6).map(
    sns.kdeplot, "PetalLengthCm"
).add_legend()
iris.drop("Id", axis=1).boxplot(by="Species", figsize=(12, 6))

from pandas.plotting import andrews_curves
sns.set_style("darkgrid") import warnings def ignore_warn(*args, **kwargs): pass warnings.warn = ignore_warn # ignore annoying warning (from sklearn and seaborn) from scipy import stats from scipy.stats import norm, skew # for some statistics pd.set_option( "display.float_format", lambda x: "{:.3f}".format(x)) # Limiting floats output to 3 decimal points train = pd.read_csv("train.csv") test = pd.read_csv("test.csv") train.head(5) test.head(5) print("The train data size before dropping Id feature is : {} ".format( train.shape)) print("The test data size before dropping Id feature is : {} ".format( test.shape)) train_ID = train["Id"] test_ID = test["Id"] train.drop("Id", axis=1, inplace=True) test.drop("Id", axis=1, inplace=True) print("\nThe train data size after dropping Id feature is : {} ".format( train.shape)) print("The test data size after dropping Id feature is : {} ".format( test.shape))
import matplotlib

matplotlib.use("PS")

import numpy as np  # linear algebra
import modin.pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

warnings.filterwarnings("ignore")

data = pd.read_csv("column_2C_weka.csv")
print(plt.style.available)  # look at available plot styles
plt.style.use("ggplot")
data.head()
data.info()
data.describe()

color_list = [
    "red" if i == "Abnormal" else "green" for i in data.loc[:, "class"]
]
pd.plotting.scatter_matrix(
    data.loc[:, data.columns != "class"],
    c=color_list,
    figsize=[15, 15],
    diagonal="hist",
    alpha=0.5,
    s=200,
    marker="*",
    edgecolor="black",
)
plt.show()

sns.countplot(x="class", data=data)
import modin.pandas as pd
import numpy as np
import hashlib
import os

features_src = '../data/nuclear_features.csv'
labels_src = '../data/case_stage_files.csv'

features = pd.read_csv(features_src, memory_map=True)
print(features.head())

lab = pd.read_csv(labels_src)
print(lab.head())

stages = lab['stage_str'].values
cases = lab['case_id'].values
cases_uid = np.array(
    [hashlib.md5(x.encode()).hexdigest() for x in cases]
)

nuclei_cases = features['case_id'].values
nucleus_stage = np.zeros_like(nuclei_cases)
for cid in np.unique(cases_uid):
    cidx = nuclei_cases == cid
    st = np.squeeze(stages[cases_uid == cid])[0]
    print(cid, st)
    nucleus_stage[cidx] = st

print(nucleus_stage.shape)
features['stage_str'] = nucleus_stage
exec(open("./helpers.py").read()) src_grp = os.environ['SRC_GRP_LOCAL'] ver = modin.__version__ git = modin.__git_revision__ task = "groupby" data_name = os.path.basename(src_grp) solution = "modin" fun = ".groupby" cache = "TRUE" print("loading dataset %s" % data_name) if os.path.isfile(data_name): x = pd.read_csv(data_name) else: x = pd.read_csv(src_grp) print("grouping...") # "Groupby with lists of columns not yet supported." question = "sum v1 by id1" #1 gc.collect() t_start = timeit.default_timer() ans = x.groupby(['id1']).agg({'v1': 'sum'}) print(ans.shape) t = timeit.default_timer() - t_start m = memory_usage() t_start = timeit.default_timer() chk = [ans['v1'].sum()]
def main():
    import os
    import torch
    import numpy as np
    from sklearn.model_selection import train_test_split
    from argparse import ArgumentParser

    parser = ArgumentParser()
    parser.add_argument(
        "--vocab-path",
        nargs="+",
        dest="vocab_path_list",
        help=
        "path to the vocab(s) to use to featurize the smiles data. if more than one vocab path is given, the vocabs "
        "are merged and the result is used as the vocab to featurize with",
    )
    parser.add_argument("--smiles-path", help="path to csv of smiles strings")
    parser.add_argument("--smiles-col",
                        help="column name that contains smiles strings",
                        default=None)
    parser.add_argument(
        "--smiles-sep",
        help=
        "delimiter used to separate smiles strings, default is set to pandas default for csv",
        default=",",
    )
    parser.add_argument(
        "--add-bos",
        help="add the begin of string character to smiles data",
        action="store_true",
    )
    parser.add_argument(
        "--add-eos",
        help="add the end of string character to smiles data",
        action="store_true",
    )
    parser.add_argument(
        "--n-jobs",
        type=int,
        help="number of processes to use for parallel computations",
    )
    parser.add_argument(
        "--test-size",
        type=float,
        default=0.2,
        help=
        "if specified, saves the data into a separate train/val/test split, where "
        "the test set will be test-size %% of the full data, val is then selected from the remaining train data "
        "using val-size %% of the train data",
    )
    parser.add_argument(
        "--val-size",
        type=float,
        default=0.1,
        help="%% of the training data to hold out as validation or dev set",
    )
    parser.add_argument("--split-dataset", action="store_true")
    parser.add_argument(
        "--output-dir",
        help="path to output directory to store vocab and numpy arrays")
    args = parser.parse_args()

    # read the smiles strings from the csv path, modin uses multiprocessing to do this more quickly
    import modin.pandas as pd

    if args.smiles_col is None:
        smiles_df = pd.read_csv(args.smiles_path,
                                header=None,
                                sep=args.smiles_sep)
        smiles_list = smiles_df[0].values
    else:
        smiles_df = pd.read_csv(args.smiles_path, sep=args.smiles_sep)
        smiles_list = smiles_df[args.smiles_col].values

    # if the output directory does not exist, create it
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    # extract the vocab
    print("reading vocab...")
    if len(args.vocab_path_list) > 1:
        print("more than one vocab was specified...merging vocabs...")
        vocab = merge_vocab(args.vocab_path_list)
    else:
        vocab = torch.load(args.vocab_path_list[0])

    # compute the integer representation of the smiles data
    print("extracting dataset...")
    data = compute_string_to_int(
        smiles_list,
        vocab,
        n_jobs=args.n_jobs,
        add_bos=args.add_bos,
        add_eos=args.add_eos,
    )
    np.save(args.output_dir + "/full_data.npy", data)

    if args.split_dataset:
        # compute the splits for train/test using the full data
        train_data, test_data = train_test_split(data,
                                                 test_size=args.test_size)
        # compute the splits for train/val using the remaining data
        train_data, val_data = train_test_split(train_data,
                                                test_size=args.val_size)
        np.save(args.output_dir + "/train.npy", train_data)
        np.save(args.output_dir + "/val.npy", val_data)
        np.save(args.output_dir + "/test.npy", test_data)
'''
# Errors Raised:
# 0 - good
# 1 - Out of range of Bangalore
# 2 - Less than 3 satellites in view
# 3 - speed between two pings > 90kmph
# 4 - distance covered is non zero in zero time
# 5 - Only 1 or less reading from the device
# 6 - gps is more than 20 km away from a busstop
# 7 - latitude is not between -90 and 90
'''

# In[8]:

# bus stop data
busstops = pd.read_csv('../data/busstop_lat_long.csv')


def calc_dist(row):
    return d.distance((row['LAT'], row['LONGITUDE']),
                      (row['prev_LAT'], row['prev_LONGITUDE'])).km


def error1(df):
    y = (df['LAT'] < 12) | (df['LAT'] > 14) | (df['LONGITUDE'] < 77) | (df['LONGITUDE'] > 78)
    df.Usable = df['Usable'].mask(y, 1)
    return df


def error2(df):
    X = [inputX for _ in range(len(model.input))]
    # make prediction
    return model.predict(X, verbose=0)


EPOCHS = 50
lookback = 64  # looking-back window of 64 timesteps, where each timestep is 15 minutes
look_ahead = 2  # how far into the future are we trying to predict?
predict_cell = "4CLTE"  # The cell of interest

"""
We need to sample out some data
"""
df = pd.read_csv(
    "/home/aggelos/Dropbox/Diplomatiki/MObility/mobility_dataset.csv",
    index_col=0)

times = sorted(df.index.values)
last_5pct = times[-int(0.2 * len(times))]
last_20pct = times[-int(0.4 * len(times))]

test_df = df[(df.index >= last_5pct)]
validation_df = df[((df.index >= last_20pct) & (df.index < last_5pct))]
train_df = df[(df.index < last_20pct)]

"""We need to create our series"""
train_x, train_y = preprocess_df(train_df, predict_cell, look_ahead, lookback)
val_x, val_y = preprocess_df(validation_df, predict_cell, look_ahead, lookback)
test_x, test_y = preprocess_df(test_df, predict_cell, look_ahead, lookback)

mlp_model, mlp_history = mlp(train_x, train_y)
import pandas as pd
import modin.pandas as mpd
import time
import os

# os.environ["MODIN_ENGINE"] = "ray"  # Modin will use Ray

if __name__ == '__main__':
    name = 'us-counties.csv'

    s = time.time()
    df = pd.read_csv(name)
    e = time.time()
    print("Pandas read_csv Time = {}".format(e - s))

    s = time.time()
    mdf = mpd.read_csv(name)
    e = time.time()
    print("Modin read_csv Time = {}".format(e - s))
def test_loc_multi_index():
    modin_df = pd.read_csv("modin/pandas/test/data/blah.csv",
                           header=[0, 1, 2, 3],
                           index_col=0)
    pandas_df = pandas.read_csv("modin/pandas/test/data/blah.csv",
                                header=[0, 1, 2, 3],
                                index_col=0)

    df_equals(modin_df.loc[1], pandas_df.loc[1])
    df_equals(modin_df.loc[1, "Presidents"], pandas_df.loc[1, "Presidents"])
    df_equals(
        modin_df.loc[1, ("Presidents", "Pure mentions")],
        pandas_df.loc[1, ("Presidents", "Pure mentions")],
    )
    assert (modin_df.loc[1, ("Presidents", "Pure mentions", "IND", "all")] ==
            pandas_df.loc[1, ("Presidents", "Pure mentions", "IND", "all")])
    df_equals(modin_df.loc[(1, 2), "Presidents"],
              pandas_df.loc[(1, 2), "Presidents"])

    tuples = [
        ("bar", "one"),
        ("bar", "two"),
        ("bar", "three"),
        ("bar", "four"),
        ("baz", "one"),
        ("baz", "two"),
        ("baz", "three"),
        ("baz", "four"),
        ("foo", "one"),
        ("foo", "two"),
        ("foo", "three"),
        ("foo", "four"),
        ("qux", "one"),
        ("qux", "two"),
        ("qux", "three"),
        ("qux", "four"),
    ]

    modin_index = pd.MultiIndex.from_tuples(tuples, names=["first", "second"])
    pandas_index = pandas.MultiIndex.from_tuples(tuples,
                                                 names=["first", "second"])
    frame_data = np.random.randint(0, 100, size=(16, 100))
    modin_df = pd.DataFrame(
        frame_data,
        index=modin_index,
        columns=["col{}".format(i) for i in range(100)],
    )
    pandas_df = pandas.DataFrame(
        frame_data,
        index=pandas_index,
        columns=["col{}".format(i) for i in range(100)],
    )
    df_equals(modin_df.loc["bar", "col1"], pandas_df.loc["bar", "col1"])
    assert modin_df.loc[("bar", "one"), "col1"] == pandas_df.loc[("bar", "one"),
                                                                 "col1"]
    df_equals(
        modin_df.loc["bar", ("col1", "col2")],
        pandas_df.loc["bar", ("col1", "col2")],
    )

    # From issue #1456
    transposed_modin = modin_df.T
    transposed_pandas = pandas_df.T
    df_equals(
        transposed_modin.loc[transposed_modin.index[:-2], :],
        transposed_pandas.loc[transposed_pandas.index[:-2], :],
    )

    # From issue #1610
    df_equals(modin_df.loc[modin_df.index], pandas_df.loc[pandas_df.index])
    df_equals(modin_df.loc[modin_df.index[:7]], pandas_df.loc[pandas_df.index[:7]])
# "ray" for ray backend engine = 'ray' os.environ["MODIN_ENGINE"] = engine import modin.pandas as mpd if BENCH_READ: mpd_read_res = bench_func(mpd.read_csv, 'big_csv.csv', header=0) print(f'Modin read time: {sum(mpd_read_res) / len(mpd_read_res)}') # mpd.read_csv('big_csv.csv', header=0) pd_read_res = bench_func(pd.read_csv, 'big_csv.csv', header=0) print(f'Pandas read time: {sum(pd_read_res) / len(mpd_read_res)}') # pd.read_csv('big_csv.csv', header=0) if BENCH_APPLY: df_mpd = mpd.read_csv('abcnews-date-text.csv', header=0) df_mpd = mpd.concat([df_mpd] * 10) log_n = int(log10(len(df_mpd))) n_range = np.logspace(2, log_n, log_n - 1) md_results = perf_bench( setup_f=lambda n: df_mpd.iloc[:n].headline_text, kernels_f=[ # modin functions are lazy. Get first item of result to force computation lambda df: df.apply(mean_word_len)[0], ], n_range=n_range, ) # concatenate with results form pandarallel testing
def test_from_csv_newlines_in_quotes():
    pandas_df = pandas.read_csv("modin/pandas/test/data/newlines.csv")
    modin_df = pd.read_csv("modin/pandas/test/data/newlines.csv")
    df_equals(modin_df, pandas_df)
import sys
import csv

# from pandarallel import pandarallel
import swifter
import gc
# import dask.dataframe as dd
# from dask.base import compute
# import dask.multiprocessing
# from functools import partial
# dask.config.set(scheduler='processes')
from multiprocesspandas import applyparallel
import modin.pandas as pd
from distributed import Client

Input = sys.argv[1]
Output = sys.argv[2]

# FamousDNS = pd.read_csv("FamousDNS_addr.csv")
dict_from_csv = pd.read_csv('FamousDNS_addr.csv', index_col=1, squeeze=True).to_dict()
# priv_pub_ip2asn = pd.read_csv("ip2asn-combined.csv")
# ipv4 = pd.read_csv("ip2asn-v4.csv")
# ipv6 = pd.read_csv("ip2asn-v6.csv")
# ,parse_dates=['timestamp']
# Read the csv so we take everything from there and skip the question of the ASNs of the well-known resolvers.
# prb_id,timestamp,resultset.result.rt,dst_addr,subid,country_code,asn_v4
# fieldnames = ['prb_id','timestamp','resultset.result.rt','dst_addr','country_code','asn_v4','ASN_dest,Type']


def myfunc(self):
    with open(Output, "a", newline='') as out:
        writer = csv.writer(out)
        if self[3] in dict_from_csv.keys():
            # pos = np.where(FamousDNS["ip"] == self[3])
            # as_pub = FamousDNS.iloc[pos[0][0], 0]
            ASN_dest = dict_from_csv[self[3]]
def test_from_csv_with_args(kwargs):
    file_name = "modin/pandas/test/data/issue_621.csv"
    pandas_df = pandas.read_csv(file_name, **kwargs)
    modin_df = pd.read_csv(file_name, **kwargs)
    df_equals(modin_df, pandas_df)
src_x = os.environ['SRC_X_LOCAL']
src_y = os.environ['SRC_Y_LOCAL']

ver = ""  # pd.__version__
git = ""
task = "join"
question = "inner join"
l = [os.path.basename(src_x), os.path.basename(src_y)]
data_name = '-'.join(l)
solution = "modin"
fun = "merge"
cache = "TRUE"

print("loading datasets...")

x = pd.read_csv(os.path.basename(src_x))
y = pd.read_csv(os.path.basename(src_y))

print("joining...")

# NotImplementedError: To contribute to Pandas on Ray, please visit github.com/modin-project/modin

gc.collect()
t_start = timeit.default_timer()
ans = x.merge(y, how='inner', on='KEY')
print(ans.shape)
t = timeit.default_timer() - t_start
m = memory_usage()
t_start = timeit.default_timer()
chk = [ans['X2'].sum(), ans['Y2'].sum()]
chkt = timeit.default_timer() - t_start

write_log(task=task,
import matplotlib

matplotlib.use("PS")

import numpy as np  # linear algebra
import modin.pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns  # data visualization library
import matplotlib.pyplot as plt
import time

data = pd.read_csv("data.csv")
data.head()  # head shows only the first 5 rows

col = data.columns
print(col)

y = data.diagnosis  # M or B
drop_list = ["Unnamed: 32", "id", "diagnosis"]
x = data.drop(drop_list, axis=1)
x.head()

ax = sns.countplot(y, label="Count")  # M = 212, B = 357

x.describe()

data_dia = y
data = x
data_n_2 = (data - data.mean()) / (data.std())  # standardization
data = pd.concat([y, data_n_2.iloc[:, 0:10]], axis=1)
data = pd.melt(data,
               id_vars="diagnosis",
               var_name="features",
               value_name="value")
plt.figure(figsize=(10, 10))
sns.violinplot(x="features",
               y="value",
               hue="diagnosis",
import modin.pandas as pd

exec(open("./helpers.py").read())

ver = modin.__version__
git = modin.__git_revision__
task = "groupby"
solution = "modin"
fun = ".groupby"
cache = "TRUE"

data_name = os.environ['SRC_GRP_LOCAL']
src_grp = os.path.join("data", data_name + ".csv")
print("loading dataset %s" % data_name)

x = pd.read_csv(src_grp)

print("grouping...")

# "Groupby with lists of columns not yet supported."

question = "sum v1 by id1"  # 1
gc.collect()
t_start = timeit.default_timer()
ans = x.groupby(['id1']).agg({'v1': 'sum'})
print(ans.shape)
t = timeit.default_timer() - t_start
m = memory_usage()
t_start = timeit.default_timer()
chk = [ans['v1'].sum()]
chkt = timeit.default_timer() - t_start

write_log(task=task,
          data=data_name,
          in_rows=x.shape[0],
          question=question,
          out_rows=ans.shape[0],
          out_cols=ans.shape[1],
          solution=solution,
          version=ver,
          git=git,
          fun=fun,
          run=1,
          time_sec=t,
          mem_gb=m,
          cache=cache,
          chk=make_chk(chk),
          chk_time_sec=chkt)