Exemplo n.º 1
0
    def __init__(self, showAll: bool = False):
        """Attach to the running Ray cluster and widen pandas display limits.

        Args:
            showAll: when True, also lift the row-count cap so every row of a
                DataFrame is printed (columns are always shown in full).
        """
        # Connect to an already-running cluster instead of starting a new one.
        if not ray.is_initialized():
            ray.init(address='auto')

        warnings.filterwarnings('ignore')

        if showAll:
            pd.set_option('display.max_rows', None)
        pd.set_option('display.max_columns', None)
        pd.set_option('display.expand_frame_repr', False)
        pd.set_option('display.width', None)
        # None means "no truncation"; the old -1 sentinel is deprecated in
        # pandas 1.0 and rejected by later versions.
        pd.set_option('display.max_colwidth', None)

        self.showAll = showAll
        self.pd = pd
Exemplo n.º 2
0
# Path where trained models are stored
models_path = 'code/tcga-cancer-classification/models/'
# Path to the hyperparameter optimization configuration files
hyper_opt_config_path = 'code/tcga-cancer-classification/hyperparameter_optimization/'
# Add path to the project scripts
sys.path.append('code/tcga-cancer-classification/scripts/')

# + {"Collapsed": "false"}
import modin.pandas as pd  # Optimized distributed version of Pandas
import data_utils as du  # Data science and machine learning relevant methods
import Models  # Machine learning models

# + [markdown] {"Collapsed": "false"}
# Allow pandas to show more columns:

# + {"Collapsed": "false"}
pd.set_option('display.max_columns', 1000)
pd.set_option('display.max_rows', 1000)

# + [markdown] {"Collapsed": "false"}
# Set the random seed for reproducibility:

# + {"Collapsed": "false"}
du.set_random_seed(42)

# + [markdown] {"Collapsed": "false"}
# ## Loading the data

# + {"Collapsed": "false"}
# NOTE(review): `data_path` is defined outside this chunk — presumably the
# dataset root directory; confirm before running this cell in isolation.
tcga_df = pd.read_csv(f'{data_path}normalized/tcga.csv')
tcga_df.head()
Exemplo n.º 3
0
# Seaborn setup: grab the default palette and use a dark grid background.
color = sns.color_palette()
sns.set_style("darkgrid")
import warnings


def ignore_warn(*args, **kwargs):
    """No-op stand-in for ``warnings.warn``; accepts and discards everything."""
    return None


# Silence noisy library warnings by replacing warnings.warn wholesale.
warnings.warn = ignore_warn  # ignore annoying warning (from sklearn and seaborn)
from scipy import stats
from scipy.stats import norm, skew  # for some statistics

pd.set_option(
    "display.float_format",
    lambda x: "{:.3f}".format(x))  # Limiting floats output to 3 decimal points
# Load the train/test splits from the working directory.
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
train.head(5)
test.head(5)
print("The train data size before dropping Id feature is : {} ".format(
    train.shape))
print("The test data size before dropping Id feature is : {} ".format(
    test.shape))
# Keep the Id columns around (e.g. for a submission file), then drop them
# from the feature frames since they carry no predictive signal.
train_ID = train["Id"]
test_ID = test["Id"]
train.drop("Id", axis=1, inplace=True)
test.drop("Id", axis=1, inplace=True)
print("\nThe train data size after dropping Id feature is : {} ".format(
    train.shape))
Exemplo n.º 4
0
# (2) Time deltas from the previous x exposures and the next x exposures to the
#     current one; the "next x" deltas leak future information (a look-ahead
#     feature) and are the strongest features;
# (3) Second-order interaction features;
# (4) Embeddings.
# The first day of data was dropped for two reasons: the first feature group
# (historical information) is empty on day one, and machine resources ran out.
##################################################################################################################

import modin.pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from lightgbm.sklearn import LGBMClassifier
from sklearn.metrics import f1_score
from scipy.stats import entropy
from gensim.models import Word2Vec
import time
import gc
pd.set_option('display.max_columns', None)  # show every column when printing


# Reduce memory usage — the underpowered machine can't handle it otherwise.
def reduce_mem(df):
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(
                        np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(
Exemplo n.º 5
0
import ray
import modin.pandas as pd
import numpy as np

# Start a local Ray cluster to back modin's distributed DataFrames.
ray.init()
# Tab-separated dump where \N marks missing values.
people = pd.read_csv('peoples.csv', sep='\t', na_values=r'\N')

pd.set_option('display.width', 640)
pd.set_option("display.max_columns", 25)


def first_version(df, service_id, **default):
    """Tag *df* with a service id plus zeroed billing columns and dump it.

    The result is written to 'abyss_<service_id>.csv', tab-separated, with
    missing values rendered as \\N. Keyword arguments override the zero
    defaults. Note: the caller's *df* is mutated by the addition of the
    'service_id' column.
    """
    df['service_id'] = service_id
    # Billing columns default to zero unless the caller overrides them.
    fills = dict.fromkeys(
        ('tie', 'paid_lastmonth', 'paid_currentmonth', 'charge', 'subsidy'), 0)
    fills.update(default)
    out = df.assign(**fills)
    # Nullable integer dtype keeps missing person ids representable.
    out['person_id'] = out['person_id'].astype("Int64")
    out.to_csv(f'abyss_{service_id}.csv', sep='\t', na_rep=r'\N', index=False)


def second_version(schema, street_id, locality_id, buildings, service_id,
                   people_dovid, **defaults):
    street_df = people_dovid[(people_dovid['street_id'] == street_id)
                             & (people_dovid['city_id'] == locality_id)]
    df = street_df[street_df['building'].isin(buildings)]
Exemplo n.º 6
0
import os
import gc
import timeit
import modin.pandas as pd
#import pandas as pd

# Load the two benchmark inputs, forcing the join keys to categorical dtype
# to keep the memory footprint down.
df1 = pd.read_csv("df1.csv",
                  dtype=dict.fromkeys(('id4', 'id5', 'id6'), 'category'))
df2 = pd.read_csv("df2.csv", dtype=dict.fromkeys(('id4', 'id5'), 'category'))

print("df1 = \n", df1)
print("df2 = \n", df2)

# Inner join on id2, then sanity-check the result with column sums.
df3 = df1.merge(df2, on='id2')
print(df3.shape, flush=True)
chk = [df3['v1'].sum(), df3['v2'].sum()]
print("chk = ", chk)

# Lift the display caps so the full merged frame prints.
pd.set_option("display.max_rows", 1000000, "display.max_columns", 1000000)
print(df3, flush=True)
Exemplo n.º 7
0
import os
import time
import numpy as np
import modin.pandas as pd

# Bounds and shape of the deterministic random frame built below.
RAND_LOW = 0
RAND_HIGH = 100
NCOLS = 20
NROWS = 100

# Fixed seed so the generated frame is reproducible across runs.
rng = np.random.RandomState(seed=42)

# Draw each column sequentially from the shared RNG, one per index.
columns = {}
for idx in range(NCOLS):
    columns["col{}".format(idx)] = rng.randint(RAND_LOW, RAND_HIGH, size=(NROWS))
data = pd.DataFrame(columns)

# Lift the display caps so the whole frame prints.
pd.set_option("display.max_rows", None, "display.max_columns", None)
print(data)