Example #1
def test_parse_dates_read_csv():
    pandas_df = pandas.read_csv("modin/pandas/test/data/test_time_parsing.csv")
    modin_df = pd.read_csv("modin/pandas/test/data/test_time_parsing.csv")
    df_equals(modin_df, pandas_df)

    pandas_df = pandas.read_csv(
        "modin/pandas/test/data/test_time_parsing.csv",
        names=[
            "timestamp",
            "symbol",
            "high",
            "low",
            "open",
            "close",
            "spread",
            "volume",
        ],
        header=0,
        index_col=0,
        encoding="utf-8",
    )
    modin_df = pd.read_csv(
        "modin/pandas/test/data/test_time_parsing.csv",
        names=[
            "timestamp",
            "symbol",
            "high",
            "low",
            "open",
            "close",
            "spread",
            "volume",
        ],
        header=0,
        index_col=0,
        encoding="utf-8",
    )
    df_equals(modin_df, pandas_df)

    pandas_df = pandas.read_csv(
        "modin/pandas/test/data/test_time_parsing.csv",
        names=[
            "timestamp",
            "symbol",
            "high",
            "low",
            "open",
            "close",
            "spread",
            "volume",
        ],
        header=0,
        index_col=0,
        parse_dates=["timestamp"],
        encoding="utf-8",
    )
    modin_df = pd.read_csv(
        "modin/pandas/test/data/test_time_parsing.csv",
        names=[
            "timestamp",
            "symbol",
            "high",
            "low",
            "open",
            "close",
            "spread",
            "volume",
        ],
        header=0,
        index_col=0,
        parse_dates=["timestamp"],
        encoding="utf-8",
    )
    df_equals(modin_df, pandas_df)

    pandas_df = pandas.read_csv(
        "modin/pandas/test/data/test_time_parsing.csv",
        names=[
            "timestamp",
            "symbol",
            "high",
            "low",
            "open",
            "close",
            "spread",
            "volume",
        ],
        header=0,
        index_col=2,
        parse_dates=["timestamp"],
        encoding="utf-8",
    )
    modin_df = pd.read_csv(
        "modin/pandas/test/data/test_time_parsing.csv",
        names=[
            "timestamp",
            "symbol",
            "high",
            "low",
            "open",
            "close",
            "spread",
            "volume",
        ],
        header=0,
        index_col=2,
        parse_dates=["timestamp"],
        encoding="utf-8",
    )
    df_equals(modin_df, pandas_df)
Example #2
def test_from_csv_with_usecols(usecols):
    fname = "modin/pandas/test/data/test_usecols.csv"
    pandas_df = pandas.read_csv(fname, usecols=usecols)
    modin_df = pd.read_csv(fname, usecols=usecols)
    df_equals(modin_df, pandas_df)
Example #3
# +
LABEL_COLUMN = "label"
if smoke_test:
    # Test dataset with only 10,000 records.
    FILE_URL = "https://ray-ci-higgs.s3.us-west-2.amazonaws.com/simpleHIGGS.csv"
else:
    # Full dataset. This may take a couple of minutes to load.
    FILE_URL = ("https://archive.ics.uci.edu/ml/machine-learning-databases"
                "/00280/HIGGS.csv.gz")

colnames = [LABEL_COLUMN] + ["feature-%02d" % i for i in range(1, 29)]

# +
load_data_start_time = time.time()

df = pd.read_csv(FILE_URL, names=colnames)

load_data_end_time = time.time()
load_data_duration = load_data_end_time - load_data_start_time
print(f"Dataset loaded in {load_data_duration} seconds.")
# -

# Split data into training and validation.

df_train, df_validation = train_test_split(df)
print(df_train, df_validation)

# ## Distributed Training
#
# The ``train_xgboost`` function contains all the logic necessary for
# training using XGBoost-Ray.
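
# ``train_xgboost`` itself is not included in this excerpt. Below is a rough,
# hedged sketch of what such a function might look like with XGBoost-Ray; the
# objective, metrics, and actor counts are illustrative assumptions rather
# than the original configuration.

from xgboost_ray import RayDMatrix, RayParams, train


def train_xgboost(train_df, eval_df, target_column, ray_params=None):
    # Wrap the (Modin) DataFrames so XGBoost-Ray can shard them across actors.
    train_set = RayDMatrix(train_df, label=target_column)
    eval_set = RayDMatrix(eval_df, label=target_column)

    evals_result = {}
    booster = train(
        {"objective": "binary:logistic", "eval_metric": ["logloss", "error"]},
        train_set,
        evals=[(eval_set, "eval")],
        evals_result=evals_result,
        ray_params=ray_params or RayParams(num_actors=2, cpus_per_actor=1),
    )
    return booster, evals_result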
Example #4
def test_from_csv_index_col(make_csv_file):
    make_csv_file()

    pandas_df = pandas.read_csv(TEST_CSV_FILENAME, index_col="col1")
    modin_df = pd.read_csv(TEST_CSV_FILENAME, index_col="col1")
    df_equals(modin_df, pandas_df)
Example #5
import argparse
import logging
import os

import ray
import modin.pandas as pd

from utils import time_logger

parser = argparse.ArgumentParser(description='groupby benchmark')
parser.add_argument('--path', dest='path', help='path to the csv data file')
parser.add_argument('--logfile', dest='logfile', help='path to the log file')
args = parser.parse_args()
file = args.path
file_size = os.path.getsize(file)

if not os.path.exists(os.path.split(args.logfile)[0]):
    os.makedirs(os.path.split(args.logfile)[0])

logging.basicConfig(filename=args.logfile, level=logging.INFO)

df = pd.read_csv(file)
blocks = df._block_partitions.flatten().tolist()
ray.wait(blocks, len(blocks))

with time_logger(
        "Groupby + sum aggregation on axis=0: {}; Size: {} bytes".format(
            file, file_size)):
    df_groupby = df.groupby('1')
    blocks = df_groupby.sum()._block_partitions.flatten().tolist()
    ray.wait(blocks, len(blocks))

with time_logger("Groupby mean on axis=0: {}; Size: {} bytes".format(
        file, file_size)):
    blocks = df_groupby.mean()._block_partitions.flatten().tolist()
    ray.wait(blocks, len(blocks))
Example #6
        ("foreclosed_after", "datetime64"),
        ("disposition_date", "datetime64"),
        ("foreclosure_costs", "float64"),
        ("prop_preservation_and_repair_costs", "float64"),
        ("asset_recovery_costs", "float64"),
        ("misc_holding_expenses", "float64"),
        ("holding_taxes", "float64"),
        ("net_sale_proceeds", "float64"),
        ("credit_enhancement_proceeds", "float64"),
        ("repurchase_make_whole_proceeds", "float64"),
        ("other_foreclosure_proceeds", "float64"),
        ("non_interest_bearing_upb", "float64"),
        ("principal_forgiveness_upb", "float64"),
        ("repurchase_make_whole_proceeds_flag", "category"),
        ("foreclosure_principal_write_off_amount", "float64"),
        ("servicing_activity_indicator", "category"),
    ])
    all_but_dates = {
        col: valtype
        for (col, valtype) in dtypes.items() if valtype != "datetime64"
    }
    dates_only = [
        col for (col, valtype) in dtypes.items() if valtype == "datetime64"
    ]
    df = pd.read_csv("perf.txt",
                     delimiter="|",
                     names=cols,
                     dtype=all_but_dates,
                     parse_dates=dates_only)
    print(df["servicer"])
df.shape
Example #7

"""Format: feather
It is common to store data in feather (binary) format specifically for pandas. It significantly improves reading speed of datasets.

## Modin

A library which helps pandas to read big files and boosts its speed. All syntax is same as pandas.
"""

!pip install modin[dask]

import modin.pandas as mpd    #Hardware Accelerator: GPU
start = time.time()
df = mpd.read_csv(link)
end = time.time()
print(end-start)

df.head()
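
# The feather format mentioned at the top of this example is not otherwise
# demonstrated here. A minimal, hedged sketch of the round trip (pyarrow must
# be installed; "data.feather" is a placeholder path, and to_feather requires
# a default RangeIndex):
df.reset_index(drop=True).to_feather("data.feather")
df_feather = mpd.read_feather("data.feather")
df_feather.head()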

!git clone https://github.com/rapidsai/rapidsai-csp-utils.git
!bash rapidsai-csp-utils/colab/rapids-colab.sh

import sys, os

dist_package_index = sys.path.index('/usr/local/lib/python3.6/dist-packages')
sys.path = sys.path[:dist_package_index] + ['/usr/local/lib/python3.6/site-packages'] + sys.path[dist_package_index:]
sys.path
exec(open('rapidsai-csp-utils/colab/update_modules.py').read(), globals())
Example #8
if __name__ == "__main__":
    #import pandas as pd
    import modin.pandas as pd
    df1 = pd.read_csv("categories.json",
                      dtype={
                          "one": "int64",
                          "two": "category"
                      })

    df = pd.read_csv("categories.csv",
                     names=["one", "two"],
                     dtype={
                         "one": "int64",
                         "two": "category"
                     })

    print(df.dtypes.describe())
    print("type(df.dtypes[1]) = ", type(df.dtypes[1]))
    print("df.dtypes[1] = ", df.dtypes[1])
    print("type(df.dtypes[1].categories) = ", type(df.dtypes[1].categories))
    print("df.dtypes.categories = ", df.dtypes[1].categories)
    s = df["two"]
    #print("s.describe = ", s.describe())
    print("type(s.dtypes.categories) = ", type(s.dtypes.categories))
    print("s.dtypes.categories = ", s.dtypes.categories)
    print("type(s.dtypes) = ", type(s.dtypes))
    print("s.dtypes = ", s.dtypes)
    print(s)
Example #9
# Set Ray as the compute engine, then import Modin
import os

os.environ["MODIN_ENGINE"] = "ray"

import modin.pandas as pd

# Load daal accelerated sklearn patch and import packages from the patch
import daal4py.sklearn

daal4py.sklearn.patch_sklearn()

from sklearn.model_selection import train_test_split
import sklearn.linear_model as lm

# Read the data from the downloaded archive file
df = pd.read_csv('ipums_education2income_1970-2010.csv.gz', compression="gzip")

# ETL
# clean up unneeded features
keep_cols = [
    "YEAR",
    "DATANUM",
    "SERIAL",
    "CBSERIAL",
    "HHWT",
    "CPI99",
    "GQ",
    "PERNUM",
    "SEX",
    "AGE",
    "INCTOT",
Example #10
import modin.pandas as pd
from fastai.tabular import *
from utils import isVaildDate, purge_pat_files
from tqdm import tqdm, trange

root = Path('../test_A')
u_data = root / 'user'
u_data.ls()
log_data = root / 'imps_log'
log_data.ls()

ad_static = pd.read_csv('../data/ad_static.csv',
                        low_memory=False,
                        encoding='utf-8')
c_sz = 102400
col_names1 = ['广告请求id', '广告请求时间', '广告位id', '用户id', '曝光广告id', '广告请求时间_date']


def save_csv(row):
    date = str(row['广告请求时间_date'])
    pd.DataFrame(row).T.to_csv(f'../data/{date}_log.csv',
                               mode='a',
                               index=None,
                               encoding='utf-8',
                               header=False)


def invalid_date(df_row, field='广告请求时间'):
    """是否删除当前行,首先转为时间格式之后再行本操作"""
    if not isVaildDate(str(df_row[field])):
        df_row[field] = np.nan
Example #11
def test___repr__():
    frame_data = random_state.randint(RAND_LOW, RAND_HIGH, size=(1000, 100))
    pandas_df = pandas.DataFrame(frame_data)
    modin_df = pd.DataFrame(frame_data)
    assert repr(pandas_df) == repr(modin_df)

    frame_data = random_state.randint(RAND_LOW, RAND_HIGH, size=(1000, 99))
    pandas_df = pandas.DataFrame(frame_data)
    modin_df = pd.DataFrame(frame_data)
    assert repr(pandas_df) == repr(modin_df)

    frame_data = random_state.randint(RAND_LOW, RAND_HIGH, size=(1000, 101))
    pandas_df = pandas.DataFrame(frame_data)
    modin_df = pd.DataFrame(frame_data)
    assert repr(pandas_df) == repr(modin_df)

    frame_data = random_state.randint(RAND_LOW, RAND_HIGH, size=(1000, 102))
    pandas_df = pandas.DataFrame(frame_data)
    modin_df = pd.DataFrame(frame_data)
    assert repr(pandas_df) == repr(modin_df)

    # The __repr__ method takes a different code path depending on whether
    # the number of rows is > 60, and another depending on whether the
    # number of columns is > 20. The cases above already cover cols > 20
    # and rows > 60; the cases that follow exercise the other three
    # combinations.
    # rows <= 60, cols > 20
    frame_data = random_state.randint(RAND_LOW, RAND_HIGH, size=(10, 100))
    pandas_df = pandas.DataFrame(frame_data)
    modin_df = pd.DataFrame(frame_data)

    assert repr(pandas_df) == repr(modin_df)

    # rows <= 60, cols <= 20
    frame_data = random_state.randint(RAND_LOW, RAND_HIGH, size=(10, 10))
    pandas_df = pandas.DataFrame(frame_data)
    modin_df = pd.DataFrame(frame_data)

    assert repr(pandas_df) == repr(modin_df)

    # rows > 60, cols <= 20
    frame_data = random_state.randint(RAND_LOW, RAND_HIGH, size=(100, 10))
    pandas_df = pandas.DataFrame(frame_data)
    modin_df = pd.DataFrame(frame_data)

    assert repr(pandas_df) == repr(modin_df)

    # Empty
    pandas_df = pandas.DataFrame(columns=["col{}".format(i) for i in range(100)])
    modin_df = pd.DataFrame(columns=["col{}".format(i) for i in range(100)])

    assert repr(pandas_df) == repr(modin_df)

    # From Issue #1705
    string_data = """"time","device_id","lat","lng","accuracy","activity_1","activity_1_conf","activity_2","activity_2_conf","activity_3","activity_3_conf"
"2016-08-26 09:00:00.206",2,60.186805,24.821049,33.6080017089844,"STILL",75,"IN_VEHICLE",5,"ON_BICYCLE",5
"2016-08-26 09:00:05.428",5,60.192928,24.767222,5,"WALKING",62,"ON_BICYCLE",29,"RUNNING",6
"2016-08-26 09:00:05.818",1,60.166382,24.700443,3,"WALKING",75,"IN_VEHICLE",5,"ON_BICYCLE",5
"2016-08-26 09:00:15.816",1,60.166254,24.700671,3,"WALKING",75,"IN_VEHICLE",5,"ON_BICYCLE",5
"2016-08-26 09:00:16.413",5,60.193055,24.767427,5,"WALKING",85,"ON_BICYCLE",15,"UNKNOWN",0
"2016-08-26 09:00:20.578",3,60.152996,24.745216,3.90000009536743,"STILL",69,"IN_VEHICLE",31,"UNKNOWN",0"""
    pandas_df = pandas.read_csv(io.StringIO(string_data))
    modin_df = pd.read_csv(io.StringIO(string_data))
    assert repr(pandas_df) == repr(modin_df)
Example #12
#import pandas as pd
import ray
ray.init(huge_pages=True, plasma_directory="/mnt/hugepages")
import modin.pandas as pd

dtypes = {
    'object_id': 'int32',
    'mjd': 'float32',
    'passband': 'int32',
    'flux': 'float32',
    'flux_err': 'float32',
    'detected': 'int32'
}

PATH = '/localdisk/benchmark_datasets/plasticc'
GPU_MEMORY = 16
TEST_ROWS = 453653104
OVERHEAD = 1.2
SKIP_ROWS = int((1 - GPU_MEMORY / (32.0 * OVERHEAD)) * TEST_ROWS)

test = pd.read_csv(
    '%s/test_set.csv' % PATH,
    # skiprows=range(1, 1+SKIP_ROWS),
    dtype=dtypes)
Example #13
                    help="path to the right csv data "
                    "file")
parser.add_argument("--logfile", dest="logfile", help="path to the log file")
args = parser.parse_args()
file_left = args.left
file_size_left = os.path.getsize(file_left)

file_right = args.right
file_size_right = os.path.getsize(file_right)

if not os.path.exists(os.path.split(args.logfile)[0]):
    os.makedirs(os.path.split(args.logfile)[0])

logging.basicConfig(filename=args.logfile, level=logging.INFO)

df_left = pd.read_csv(file_left)
df_right = pd.read_csv(file_right)

blocks = df_left._block_partitions.flatten().tolist()
ray.wait(blocks, len(blocks))
blocks = df_right._block_partitions.flatten().tolist()
ray.wait(blocks, len(blocks))

with time_logger("Inner Join: {} & {}; Left Size: {} bytes; Right Size: {} "
                 "bytes".format(file_left, file_right, file_size_left,
                                file_size_right)):
    result = df_left.join(df_right, how="inner", lsuffix="left_")
    ray.wait(result._block_partitions.flatten().tolist())

with time_logger("Outer Join: {} & {}; Left Size: {} bytes; Right Size: {} "
                 "bytes".format(file_left, file_right, file_size_left,
Example #14
import warnings
from datetime import datetime

import modin.pandas as pd

from domainapi import domain
from functions import processing

warnings.filterwarnings("ignore")


version = datetime.now()
version = str(version).replace('-','').replace(' ','')[:10]

#variables
subs_geelong = ['Belmont', 'Grovedale']
subs_melb = ['Yarraville']
get_data = False
compute_features = False
static_pkl = 'data/2019120723_data.pkl'
feature_ranking = pd.read_csv('input/feature_ranking.csv')

if get_data:
    #function format(propertyTypes,minBedrooms,minBathrooms,minCarspaces,minPrice,maxPrice,minLandArea,state,region,area,suburb,includeSurroundingSuburbs)
    #dont use zeros here yet needs to be fixed
    df = domain().listing_results(["House"],2,1,1,500000,550000,10,"VIC","","","",False)

    print("Main frame shape: ",df.shape)

    df.to_csv('data/{version}_data.csv'.format(version=version), sep='\t', encoding='utf-8')
    df.to_pickle('data/{version}_data.pkl'.format(version=version))
    #dup_df_2 = df[df['id'].duplicated() == True]
    #dup_df_2 = dup_df_2.sort_values(by=['id'])
    #r, c = dup_df_2.shape
    #if r > 0:
    #    print("duplicates in df")
Example #15
#!/usr/bin/env python
import matplotlib

matplotlib.use("PS")
import modin.pandas as pd
import warnings  # current version of seaborn generates a bunch of warnings that we'll ignore

warnings.filterwarnings("ignore")
import seaborn as sns
import matplotlib.pyplot as plt

sns.set(style="white", color_codes=True)
iris = pd.read_csv("Iris.csv")  # the iris dataset is now a Pandas DataFrame
iris.head()
iris["Species"].value_counts()
iris.plot(kind="scatter", x="SepalLengthCm", y="SepalWidthCm")
sns.jointplot(x="SepalLengthCm", y="SepalWidthCm", data=iris, size=5)
sns.FacetGrid(iris, hue="Species", size=5).map(
    plt.scatter, "SepalLengthCm", "SepalWidthCm"
).add_legend()
sns.boxplot(x="Species", y="PetalLengthCm", data=iris)
ax = sns.boxplot(x="Species", y="PetalLengthCm", data=iris)
ax = sns.stripplot(
    x="Species", y="PetalLengthCm", data=iris, jitter=True, edgecolor="gray"
)
sns.violinplot(x="Species", y="PetalLengthCm", data=iris, size=6)
sns.FacetGrid(iris, hue="Species", size=6).map(
    sns.kdeplot, "PetalLengthCm"
).add_legend()
iris.drop("Id", axis=1).boxplot(by="Species", figsize=(12, 6))
from pandas.plotting import andrews_curves
Example #16
File: kaggle4.py  Project: yyz940922/modin
sns.set_style("darkgrid")
import warnings


def ignore_warn(*args, **kwargs):
    pass


warnings.warn = ignore_warn  # ignore annoying warning (from sklearn and seaborn)
from scipy import stats
from scipy.stats import norm, skew  # for some statistics

pd.set_option(
    "display.float_format",
    lambda x: "{:.3f}".format(x))  # Limiting floats output to 3 decimal points
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
train.head(5)
test.head(5)
print("The train data size before dropping Id feature is : {} ".format(
    train.shape))
print("The test data size before dropping Id feature is : {} ".format(
    test.shape))
train_ID = train["Id"]
test_ID = test["Id"]
train.drop("Id", axis=1, inplace=True)
test.drop("Id", axis=1, inplace=True)
print("\nThe train data size after dropping Id feature is : {} ".format(
    train.shape))
print("The test data size after dropping Id feature is : {} ".format(
    test.shape))
Example #17
import matplotlib

matplotlib.use("PS")
import numpy as np  # linear algebra
import modin.pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

warnings.filterwarnings("ignore")
data = pd.read_csv("column_2C_weka.csv")
print(plt.style.available)  # look at available plot styles
plt.style.use("ggplot")
data.head()
data.info()
data.describe()
color_list = [
    "red" if i == "Abnormal" else "green" for i in data.loc[:, "class"]
]
pd.plotting.scatter_matrix(
    data.loc[:, data.columns != "class"],
    c=color_list,
    figsize=[15, 15],
    diagonal="hist",
    alpha=0.5,
    s=200,
    marker="*",
    edgecolor="black",
)
plt.show()
sns.countplot(x="class", data=data)
Example #18
import modin.pandas as pd
import numpy as np
import hashlib
import os

features_src = '../data/nuclear_features.csv'
labels_src = '../data/case_stage_files.csv'

features = pd.read_csv(features_src, memory_map=True)
print(features.head())

lab = pd.read_csv(labels_src)
print(lab.head())

stages = lab['stage_str'].values
cases = lab['case_id'].values
cases_uid = np.array(
    [hashlib.md5(x.encode()).hexdigest() for x in cases]
)
nuclei_cases = features['case_id'].values

nucleus_stage = np.zeros_like(nuclei_cases)
for cid in np.unique(cases_uid):
    cidx = nuclei_cases == cid
    st = np.squeeze(stages[cases_uid == cid])[0]
    print(cid, st)
    nucleus_stage[cidx] = st

print(nucleus_stage.shape)
features['stage_str'] = nucleus_stage
Example #19
exec(open("./helpers.py").read())

src_grp = os.environ['SRC_GRP_LOCAL']

ver = modin.__version__
git = modin.__git_revision__
task = "groupby"
data_name = os.path.basename(src_grp)
solution = "modin"
fun = ".groupby"
cache = "TRUE"

print("loading dataset %s" % data_name)

if os.path.isfile(data_name):
    x = pd.read_csv(data_name)
else:
    x = pd.read_csv(src_grp)

print("grouping...")

# "Groupby with lists of columns not yet supported."
question = "sum v1 by id1"  #1
gc.collect()
t_start = timeit.default_timer()
ans = x.groupby(['id1']).agg({'v1': 'sum'})
print(ans.shape)
t = timeit.default_timer() - t_start
m = memory_usage()
t_start = timeit.default_timer()
chk = [ans['v1'].sum()]
Example #20
def main():
    import os
    import torch
    import numpy as np
    from sklearn.model_selection import train_test_split
    from argparse import ArgumentParser

    parser = ArgumentParser()
    parser.add_argument(
        "--vocab-path",
        nargs="+",
        dest="vocab_path_list",
        help=
        "path to the vocab(s) to use to featurize the smiles data. if more than one vocab path is given, the vocabs"
        "are merged and the result is used as the vocab to featurize with",
    )
    parser.add_argument("--smiles-path", help="path to csv of smiles strings")
    parser.add_argument("--smiles-col",
                        help="column name that contains smiles strings",
                        default=None)
    parser.add_argument(
        "--smiles-sep",
        help=
        "delimiter used to seperate smiles strings, default is set to pandas default for csv",
        default=",",
    )
    parser.add_argument(
        "--add-bos",
        help="add the begin of string character to smiles data",
        action="store_true",
    )
    parser.add_argument(
        "--add-eos",
        help="add the end of string character to smiles data",
        action="store_true",
    )
    parser.add_argument(
        "--n-jobs",
        type=int,
        help="number of processes to use for parallel computations",
    )
    parser.add_argument(
        "--test-size",
        type=float,
        default=0.2,
        help=
        "if specified, saves the data into a seperate train/val/test split, where"
        "test set will be test-size %% of the full data, val is then selected from remaining train data"
        "using val-size %% of the train data",
    )
    parser.add_argument(
        "--val-size",
        type=float,
        default=0.1,
        help="%% of the training data to hold out as validation or dev set",
    )
    parser.add_argument("--split-dataset", action="store_true")
    parser.add_argument(
        "--output-dir",
        help="path to output directory to store vocab and numpy arrays")
    args = parser.parse_args()

    # read the smiles strings from the csv path; modin uses multiprocessing to do this more quickly
    import modin.pandas as pd

    if args.smiles_col is None:
        smiles_df = pd.read_csv(args.smiles_path,
                                header=None,
                                sep=args.smiles_sep)
        smiles_list = smiles_df[0].values

    else:
        smiles_df = pd.read_csv(args.smiles_path, sep=args.smiles_sep)
        smiles_list = smiles_df[args.smiles_col].values

    # if output directory does not exist, create it
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    # extract the vocab
    print("reading vocab...")

    if len(args.vocab_path_list) > 1:
        print("more than one vocab was specified...merging vocabs...")
        vocab = merge_vocab(args.vocab_path_list)

    else:
        vocab = torch.load(args.vocab_path_list[0])

    # compute the integer representation of the smiles data
    print("extracting dataset...")
    data = compute_string_to_int(
        smiles_list,
        vocab,
        n_jobs=args.n_jobs,
        add_bos=args.add_bos,
        add_eos=args.add_eos,
    )
    np.save(args.output_dir + "/full_data.npy", data)

    if args.split_dataset:
        # compute the splits for train/test using the full data
        train_data, test_data = train_test_split(data,
                                                 test_size=args.test_size)
        # compute the splits for train/val using the remaining data
        train_data, val_data = train_test_split(train_data,
                                                test_size=args.val_size)

        np.save(args.output_dir + "/train.npy", train_data)
        np.save(args.output_dir + "/val.npy", val_data)
        np.save(args.output_dir + "/test.npy", test_data)
Example #21
'''
# Errors Raised:
# 0 - good
# 1 - Out of range of Bangalore
# 2 - Less than 3 satellites in view
# 3 - speed between two pings > 90kmph
# 4 - distance covered is non zero in zero time
# 5 - Only 1 or less reading from the device
# 6 - gps is more than 20 km away from a busstop
# 7 - latitude is not between -90 and 90
'''

# In[8]:

#bus stop data
busstops = pd.read_csv('../data/busstop_lat_long.csv')


def calc_dist(row):
    return d.distance((row['LAT'], row['LONGITUDE']),
                      (row['prev_LAT'], row['prev_LONGITUDE'])).km


def error1(df):
    y = (df['LAT'] < 12) | (df['LAT'] > 14) | (df['LONGITUDE'] <
                                               77) | (df['LONGITUDE'] > 78)
    df.Usable = df['Usable'].mask(y, 1)
    return df
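

# A hedged sketch, not from the original file: the remaining error codes listed
# in the docstring above would presumably follow the same masking pattern as
# error1. For example, error 7 (latitude outside [-90, 90]) might look like
# this; the 'LAT' and 'Usable' columns are taken from the snippet above.
def error7(df):
    y = (df['LAT'] < -90) | (df['LAT'] > 90)
    df.Usable = df['Usable'].mask(y, 7)
    return df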


def error2(df):
Example #22
    X = [inputX for _ in range(len(model.input))]
    # make prediction
    return model.predict(X, verbose=0)


EPOCHS = 50

lookback = 64  # looking back window of 64 timesteps where each timestep
# is 15 minutes

look_ahead = 2  # how far into the future are we trying to predict?

predict_cell = "4CLTE"  # The cell of interest
""" We need to sample out some data """
df = pd.read_csv(
    "/home/aggelos/Dropbox/Diplomatiki/MObility/mobility_dataset.csv",
    index_col=0)

times = sorted(df.index.values)
last_5pct = times[-int(0.2 * len(times))]
last_20pct = times[-int(0.4 * len(times))]

test_df = df[(df.index >= last_5pct)]
validation_df = df[((df.index >= last_20pct) & (df.index < last_5pct))]
train_df = df[(df.index < last_20pct)]
"""We need to create our series"""
train_x, train_y = preprocess_df(train_df, predict_cell, look_ahead, lookback)
val_x, val_y = preprocess_df(validation_df, predict_cell, look_ahead, lookback)
test_x, test_y = preprocess_df(test_df, predict_cell, look_ahead, lookback)

mlp_model, mlp_history = mlp(train_x, train_y)
Example #23
import pandas as pd
import modin.pandas as mpd
import time
import os

# os.environ["MODIN_ENGINE"] = "ray"  # Modin will use Ray


if __name__ == '__main__':
    name = 'us-counties.csv'

    s = time.time()
    df = pd.read_csv(name)
    e = time.time()
    print("Pandas Concat Time = {}".format(e - s))


    s = time.time()
    mdf = mpd.read_csv(name)
    e = time.time()
    print("Modin Concat Time = {}".format(e - s))
Example #24
def test_loc_multi_index():
    modin_df = pd.read_csv("modin/pandas/test/data/blah.csv",
                           header=[0, 1, 2, 3],
                           index_col=0)
    pandas_df = pandas.read_csv("modin/pandas/test/data/blah.csv",
                                header=[0, 1, 2, 3],
                                index_col=0)

    df_equals(modin_df.loc[1], pandas_df.loc[1])
    df_equals(modin_df.loc[1, "Presidents"], pandas_df.loc[1, "Presidents"])
    df_equals(
        modin_df.loc[1, ("Presidents", "Pure mentions")],
        pandas_df.loc[1, ("Presidents", "Pure mentions")],
    )
    assert (modin_df.loc[1, ("Presidents", "Pure mentions", "IND",
                             "all")] == pandas_df.loc[1, ("Presidents",
                                                          "Pure mentions",
                                                          "IND", "all")])
    df_equals(modin_df.loc[(1, 2), "Presidents"], pandas_df.loc[(1, 2),
                                                                "Presidents"])

    tuples = [
        ("bar", "one"),
        ("bar", "two"),
        ("bar", "three"),
        ("bar", "four"),
        ("baz", "one"),
        ("baz", "two"),
        ("baz", "three"),
        ("baz", "four"),
        ("foo", "one"),
        ("foo", "two"),
        ("foo", "three"),
        ("foo", "four"),
        ("qux", "one"),
        ("qux", "two"),
        ("qux", "three"),
        ("qux", "four"),
    ]

    modin_index = pd.MultiIndex.from_tuples(tuples, names=["first", "second"])
    pandas_index = pandas.MultiIndex.from_tuples(tuples,
                                                 names=["first", "second"])
    frame_data = np.random.randint(0, 100, size=(16, 100))
    modin_df = pd.DataFrame(
        frame_data,
        index=modin_index,
        columns=["col{}".format(i) for i in range(100)],
    )
    pandas_df = pandas.DataFrame(
        frame_data,
        index=pandas_index,
        columns=["col{}".format(i) for i in range(100)],
    )
    df_equals(modin_df.loc["bar", "col1"], pandas_df.loc["bar", "col1"])
    assert modin_df.loc[("bar", "one"),
                        "col1"] == pandas_df.loc[("bar", "one"), "col1"]
    df_equals(
        modin_df.loc["bar", ("col1", "col2")],
        pandas_df.loc["bar", ("col1", "col2")],
    )

    # From issue #1456
    transposed_modin = modin_df.T
    transposed_pandas = pandas_df.T
    df_equals(
        transposed_modin.loc[transposed_modin.index[:-2], :],
        transposed_pandas.loc[transposed_pandas.index[:-2], :],
    )

    # From issue #1610
    df_equals(modin_df.loc[modin_df.index], pandas_df.loc[pandas_df.index])
    df_equals(modin_df.loc[modin_df.index[:7]],
              pandas_df.loc[pandas_df.index[:7]])
Example #25
    # "ray" for ray backend
    engine = 'ray'
    os.environ["MODIN_ENGINE"] = engine
    import modin.pandas as mpd

    if BENCH_READ:
        mpd_read_res = bench_func(mpd.read_csv, 'big_csv.csv', header=0)
        print(f'Modin read time: {sum(mpd_read_res) / len(mpd_read_res)}')
        # mpd.read_csv('big_csv.csv', header=0)

        pd_read_res = bench_func(pd.read_csv, 'big_csv.csv', header=0)
        print(f'Pandas read time: {sum(pd_read_res) / len(pd_read_res)}')
        # pd.read_csv('big_csv.csv', header=0)

    if BENCH_APPLY:
        df_mpd = mpd.read_csv('abcnews-date-text.csv', header=0)
        df_mpd = mpd.concat([df_mpd] * 10)

        log_n = int(log10(len(df_mpd)))
        n_range = np.logspace(2, log_n, log_n - 1)

        md_results = perf_bench(
            setup_f=lambda n: df_mpd.iloc[:n].headline_text,
            kernels_f=[
                # modin functions are lazy. Get first item of result to force computation
                lambda df: df.apply(mean_word_len)[0],
            ],
            n_range=n_range,
        )

        # concatenate with results from pandarallel testing
Example #26
def test_from_csv_newlines_in_quotes():
    pandas_df = pandas.read_csv("modin/pandas/test/data/newlines.csv")
    modin_df = pd.read_csv("modin/pandas/test/data/newlines.csv")
    df_equals(modin_df, pandas_df)
Example #27
#from pandarallel import pandarallel
import swifter
import gc
#import dask.dataframe as dd
#from dask.base import compute
#import dask.multiprocessing
#from functools import partial
#dask.config.set(scheduler='processes')
from multiprocesspandas import applyparallel
import modin.pandas as pd
from distributed import Client

Input = sys.argv[1]
Output = sys.argv[2]
#FamousDNS = pd.read_csv("FamousDNS_addr.csv")
dict_from_csv = pd.read_csv('FamousDNS_addr.csv', index_col=1, squeeze=True).to_dict()
#priv_pub_ip2asn = pd.read_csv("ip2asn-combined.csv")
#ipv4=pd.read_csv("ip2asn-v4.csv")
#ipv6=pd.read_csv("ip2asn-v6.csv") #,parse_dates=['timestamp']
# read the csv so we load everything from there and avoid the whole ASN question for the well-known DNS resolvers.
#prb_id,timestamp,resultset.result.rt,dst_addr,subid,country_code,asn_v4
#fieldnames = ['prb_id','timestamp','resultset.result.rt','dst_addr','country_code','asn_v4','ASN_dest,Type']


def myfunc(self):
	with open(Output,"a",newline='') as out:
		writer = csv.writer(out)
		if self[3] in dict_from_csv.keys():
			#pos=np.where(FamousDNS["ip"]==self[3])
			#as_pub=FamousDNS.iloc[pos[0][0],0]
			ASN_dest= dict_from_csv[self[3]]
Example #28
def test_from_csv_with_args(kwargs):
    file_name = "modin/pandas/test/data/issue_621.csv"
    pandas_df = pandas.read_csv(file_name, **kwargs)
    modin_df = pd.read_csv(file_name, **kwargs)
    df_equals(modin_df, pandas_df)
Example #29
src_x = os.environ['SRC_X_LOCAL']
src_y = os.environ['SRC_Y_LOCAL']

ver = ""  #pd.__version__
git = ""
task = "join"
question = "inner join"
l = [os.path.basename(src_x), os.path.basename(src_y)]
data_name = '-'.join(l)
solution = "modin"
fun = "merge"
cache = "TRUE"

print("loading datasets...")

x = pd.read_csv(os.path.basename(src_x))
y = pd.read_csv(os.path.basename(src_y))

print("joining...")

# NotImplementedError: To contribute to Pandas on Ray, please visit github.com/modin-project/modin
gc.collect()
t_start = timeit.default_timer()
ans = x.merge(y, how='inner', on='KEY')
print(ans.shape)
t = timeit.default_timer() - t_start
m = memory_usage()
t_start = timeit.default_timer()
chk = [ans['X2'].sum(), ans['Y2'].sum()]
chkt = timeit.default_timer() - t_start
write_log(task=task,
Example #30
import matplotlib

matplotlib.use("PS")
import numpy as np  # linear algebra
import modin.pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns  # data visualization library
import matplotlib.pyplot as plt
import time

data = pd.read_csv("data.csv")
data.head()  # head method show only first 5 rows
col = data.columns
print(col)
y = data.diagnosis  # M or B
list = ["Unnamed: 32", "id", "diagnosis"]
x = data.drop(list, axis=1)
x.head()
ax = sns.countplot(y, label="Count")  # M = 212, B = 357
x.describe()
data_dia = y
data = x
data_n_2 = (data - data.mean()) / (data.std())  # standardization
data = pd.concat([y, data_n_2.iloc[:, 0:10]], axis=1)
data = pd.melt(data,
               id_vars="diagnosis",
               var_name="features",
               value_name="value")
plt.figure(figsize=(10, 10))
sns.violinplot(x="features",
               y="value",
               hue="diagnosis",
Example #31
import modin.pandas as pd

exec(open("./helpers.py").read())

ver = modin.__version__
git = modin.__git_revision__
task = "groupby"
solution = "modin"
fun = ".groupby"
cache = "TRUE"

data_name = os.environ['SRC_GRP_LOCAL']
src_grp = os.path.join("data", data_name+".csv")
print("loading dataset %s" % data_name)

x = pd.read_csv(src_grp)

print("grouping...")

# "Groupby with lists of columns not yet supported."
question = "sum v1 by id1" #1
gc.collect()
t_start = timeit.default_timer()
ans = x.groupby(['id1']).agg({'v1':'sum'})
print(ans.shape)
t = timeit.default_timer() - t_start
m = memory_usage()
t_start = timeit.default_timer()
chk = [ans['v1'].sum()]
chkt = timeit.default_timer() - t_start
write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=1, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt)