def __init__(self, showAll: bool = False):
    """Connect to the running Ray cluster and configure pandas display options.

    Parameters
    ----------
    showAll : bool, default False
        If True, configure pandas to print every row and column without
        truncation (useful for interactive inspection of wide frames).
    """
    # Attach to an already-running Ray cluster; guard so a second
    # instantiation does not raise "Ray is already initialized".
    if not ray.is_initialized():
        ray.init(address='auto')
    # Silence library warnings for cleaner interactive output.
    warnings.filterwarnings('ignore')
    if showAll:
        pd.set_option('display.max_rows', None)
        pd.set_option('display.max_columns', None)
        pd.set_option('display.expand_frame_repr', False)
        pd.set_option('display.width', None)
        # FIX: -1 was deprecated in pandas 1.0 and later removed;
        # None is the supported way to say "no column-width limit".
        pd.set_option('display.max_colwidth', None)
    self.showAll = showAll
    # Keep a handle to the (configured) pandas module on the instance.
    self.pd = pd
models_path = 'code/tcga-cancer-classification/models/' # Path to the hyperparameter optimization configuration files hyper_opt_config_path = 'code/tcga-cancer-classification/hyperparameter_optimization/' # Add path to the project scripts sys.path.append('code/tcga-cancer-classification/scripts/') # + {"Collapsed": "false"} import modin.pandas as pd # Optimized distributed version of Pandas import data_utils as du # Data science and machine learning relevant methods import Models # Machine learning models # + [markdown] {"Collapsed": "false"} # Allow pandas to show more columns: # + {"Collapsed": "false"} pd.set_option('display.max_columns', 1000) pd.set_option('display.max_rows', 1000) # + [markdown] {"Collapsed": "false"} # Set the random seed for reproducibility: # + {"Collapsed": "false"} du.set_random_seed(42) # + [markdown] {"Collapsed": "false"} # ## Loading the data # + {"Collapsed": "false"} tcga_df = pd.read_csv(f'{data_path}normalized/tcga.csv') tcga_df.head()
# Plot styling defaults.
color = sns.color_palette()
sns.set_style("darkgrid")

import warnings
# FIX: the original monkey-patched `warnings.warn = ignore_warn`, which
# irreversibly disables the real function for every library in the process.
# `filterwarnings` is the supported API and achieves the stated intent:
# ignore annoying warning (from sklearn and seaborn)
warnings.filterwarnings("ignore")

from scipy import stats
from scipy.stats import norm, skew  # for some statistics

pd.set_option(
    "display.float_format",
    lambda x: "{:.3f}".format(x))  # Limiting floats output to 3 decimal points

# Load the Kaggle House Prices train/test splits.
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
train.head(5)
test.head(5)

print("The train data size before dropping Id feature is : {} ".format(
    train.shape))
print("The test data size before dropping Id feature is : {} ".format(
    test.shape))

# Keep the Id columns for the submission file, then drop them from the
# features — Id carries no predictive signal.
train_ID = train["Id"]
test_ID = test["Id"]
train.drop("Id", axis=1, inplace=True)
test.drop("Id", axis=1, inplace=True)

print("\nThe train data size after dropping Id feature is : {} ".format(
    train.shape))
# (2)前x次曝光、后x次曝光到当前的时间差,后x次到当前曝光的时间差是穿越特征,并且是最强的特征; # (3)二阶交叉特征; # (4)embedding。 # 之所以去掉了第一天的数据,有两个原因,一是因为第一组特征(历史信息)在第一天的数据上是空的,二是因为机器资源不够了。 ################################################################################################################## import modin.pandas as pd import numpy as np from sklearn.model_selection import StratifiedKFold from lightgbm.sklearn import LGBMClassifier from sklearn.metrics import f1_score from scipy.stats import entropy from gensim.models import Word2Vec import time import gc pd.set_option('display.max_columns', None) # 减少内存消耗,破机器跑不动 def reduce_mem(df): start_mem = df.memory_usage().sum() / 1024**2 for col in df.columns: col_type = df[col].dtypes if col_type != object: c_min = df[col].min() c_max = df[col].max() if str(col_type)[:3] == 'int': if c_min > np.iinfo(np.int8).min and c_max < np.iinfo( np.int8).max: df[col] = df[col].astype(np.int8) elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(
import ray
import modin.pandas as pd
import numpy as np

ray.init()

# r'\N' is the NULL marker used in the tab-separated dump.
people = pd.read_csv('peoples.csv', sep='\t', na_values=r'\N')
pd.set_option('display.width', 640)
pd.set_option("display.max_columns", 25)


def first_version(df, service_id, **default):
    """Tag *df* with a service id, fill billing columns, and dump it to TSV.

    Adds a constant ``service_id`` column, assigns zero-valued billing columns
    (overridable via **default keyword overrides), coerces ``person_id`` to the
    nullable Int64 dtype, and writes ``abyss_<service_id>.csv`` with the same
    tab separator / \\N null marker used on input. Returns None (side effect
    only).
    """
    df['service_id'] = service_id
    # Default billing columns; callers can override any of them via **default.
    columns = {
        'tie': 0,
        'paid_lastmonth': 0,
        'paid_currentmonth': 0,
        'charge': 0,
        'subsidy': 0
    }
    columns.update(default)
    df = df.assign(**columns)
    # Nullable Int64 keeps missing person ids as <NA> instead of forcing float.
    df['person_id'] = df['person_id'].astype("Int64")
    df.to_csv(f'abyss_{service_id}.csv', sep='\t', na_rep=r'\N', index=False)


def second_version(schema, street_id, locality_id, buildings, service_id,
                   people_dovid, **defaults):
    """Select people on a given street/locality restricted to *buildings*.

    NOTE(review): this function's body appears truncated in the visible
    chunk — it presumably continues with processing/export of ``df``.
    """
    # Rows matching both the street and the locality.
    street_df = people_dovid[(people_dovid['street_id'] == street_id)
                             & (people_dovid['city_id'] == locality_id)]
    df = street_df[street_df['building'].isin(buildings)]
import os
import gc
import timeit
import modin.pandas as pd
#import pandas as pd  # swap with the modin import above to compare stock pandas

# Low-cardinality string columns are read as 'category' to cut memory use.
df1 = pd.read_csv("df1.csv",
                  dtype={
                      'id4': 'category',
                      'id5': 'category',
                      'id6': 'category'
                  })
df2 = pd.read_csv("df2.csv", dtype={'id4': 'category', 'id5': 'category'})

print("df1 = \n", df1)
print("df2 = \n", df2)

# Inner join on the shared key column.
df3 = df1.merge(df2, on='id2')
print(df3.shape, flush=True)

# Column sums act as a cheap checksum that the join produced the expected rows.
chk = [df3['v1'].sum(), df3['v2'].sum()]
print("chk = ", chk)

# FIX: passing multiple option/value pairs to a single set_option call is
# deprecated (and removed in pandas 3.0); set each option individually.
pd.set_option("display.max_rows", 1000000)
pd.set_option("display.max_columns", 1000000)
print(df3, flush=True)
import os
import time
import numpy as np
import modin.pandas as pd

# Bounds for the random integers and the frame's dimensions.
RAND_LOW = 0
RAND_HIGH = 100
NCOLS = 20
NROWS = 100

# Fixed seed so the generated frame is reproducible across runs.
random_state = np.random.RandomState(seed=42)

# Build a (NROWS x NCOLS) frame of uniform random ints, one column per key.
data = pd.DataFrame({
    "col{}".format(i): random_state.randint(
        RAND_LOW, RAND_HIGH, size=(NROWS)
    )
    for i in range(NCOLS)
})

# FIX: the multi-pair form of set_option is deprecated (removed in
# pandas 3.0); issue one call per option instead.
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)
print(data)