import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
import itertools
from ml_intro.custutils import cross_validate_est, to_path

# data upload
features = pd.read_csv(to_path('final_statement\\features.csv'), index_col='match_id')

# remove match result features
res_features = [
    'duration', 'radiant_win', 'tower_status_radiant', 'tower_status_dire',
    'barracks_status_radiant', 'barracks_status_dire'
]
feat_wo_res = features.drop(columns=res_features)

# find features with missing values
col_with_skips = feat_wo_res.isnull().sum()
col_with_skips = col_with_skips[col_with_skips > 0]
print(f"Features with missing values:\n{list(col_with_skips.index)}")

# Short explanation: all of these features describe events that may not have
# happened within the first 5 minutes of the game.
# The whole set of features with missing values can be split into three groups,
# each with its own explanation:
# 1) features related to the "first blood" event - the event may not have
#    happened within the given time span (the first 5 minutes of the game).
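# A hedged continuation sketch (not from the original file): since the missing
# values above encode "the event did not happen in the first 5 minutes", one
# common option before fitting gradient boosting is to fill them with zeros.
# The fillna(0) step below is an assumption about the intended next step.
X = feat_wo_res.fillna(0)
y = features['radiant_win']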
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from ml_intro.custutils import to_path

# columns: date, first company, second company, ..., thirtieth company
close_prices = pd.read_csv(to_path('close_prices.csv'), index_col='date')

# Train PCA, i.e. transform the original 30-feature space into an n_components space
pca = PCA(n_components=10)
pca.fit(close_prices)

# How many components are needed to cover 90% of the variance?
# NB! argmax stops at the first max value and returns its index
expl = pca.explained_variance_ratio_
n_comp = np.argmax(np.cumsum(expl) > 0.9) + 1
print(f'Components needed to cover 90% of variance: {n_comp}')

# Transform the original data and take the first component
approx = [row[0] for row in pca.transform(close_prices)]

# Load the Dow Jones index
djia_data = pd.read_csv(to_path('djia_index.csv'), index_col='date')
djia_data['approx'] = approx

# Calculate the Pearson correlation
print(f'Pearson correlation: {djia_data.corr().iloc[0, 1]:.2f}')
# another way using numpy
np.corrcoef(djia_data, rowvar=False)

# Company with the most weight in the first component
cmp = close_prices.columns[np.argmax(pca.components_[0])]
print(f'Company with the most weight in the first component is {cmp}')
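# Hypothetical sanity check (not in the original script): print the cumulative
# explained variance per number of components, so the argmax-based answer above
# can be verified by eye.
cum_var = np.cumsum(pca.explained_variance_ratio_)
for n, share in enumerate(cum_var, start=1):
    print(f'{n:2d} components -> {share:.2%} of variance')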
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.metrics import log_loss
from ml_intro.custutils import to_path, sigmoid

# data download: the set describes the biological response for different molecules.
# The first column says whether there was a response or not; the other columns
# (d1 - d1776) describe characteristics of the molecules such as shape, size, etc.
gbm_df = pd.read_csv(to_path('gbm-data.csv'))

# Data decomposition
X = gbm_df.iloc[:, 1:]
y = gbm_df.iloc[:, 0]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.8,
                                                    random_state=241)

# For each learning_rate value:
#   fit the classifier to the train data
#   get the decision function for each iteration of gradient boosting
#   use it to calculate the quality on the train and test sets, transforming the
#   decision function values with the sigmoid
#   plot the log-loss


def get_log_loss(clf_instance, X, y):
    """Calls <staged_decision_function(X)>, then iterates over the returned
    generator. On every iteration the generator yields the array of decision
    function values for the current number of boosting iterations; these values
    are passed through the sigmoid and scored with log-loss."""
    losses = []
    for decision in clf_instance.staged_decision_function(X):
        losses.append(log_loss(y, sigmoid(np.ravel(decision))))
    return losses
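# A hedged sketch of the loop described in the comments above (not verbatim from
# the original file): fit one GradientBoostingClassifier per learning rate and
# plot train/test log-loss per boosting iteration. The learning-rate grid and
# n_estimators value are assumptions.
for lr in [1, 0.5, 0.3, 0.2, 0.1]:
    clf = GradientBoostingClassifier(n_estimators=250, learning_rate=lr,
                                     random_state=241)
    clf.fit(X_train, y_train)
    train_loss = get_log_loss(clf, X_train, y_train)
    test_loss = get_log_loss(clf, X_test, y_test)
    plt.figure()
    plt.title(f'learning_rate={lr}')
    plt.plot(train_loss, label='train')
    plt.plot(test_loss, label='test')
    plt.xlabel('iteration')
    plt.ylabel('log-loss')
    plt.legend()
plt.show()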
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import Ridge
import re
import scipy
from ml_intro.custutils import to_path

# data format:
#   FullDescription - large text
#   LocationNormalized - categorical town or some other place
#   ContractTime - categorical type of vacancy
#   SalaryNormalized - the target salary
# (LocationNormalized and ContractTime contain NaNs)
data_train = pd.read_csv(to_path('salary-train.csv'))
data_test = pd.read_csv(to_path('salary-test-mini.csv'))

# FullDescription conversion:
# replace all non-alphanumeric characters with spaces and convert the text to
# lowercase, then convert it with TfidfVectorizer into a sparse matrix with one
# row per description and a huge number of feature columns


def process_text(text):
    return re.sub('[^a-zA-Z0-9]', ' ', text.lower())


data_train['FullDescription'] = data_train['FullDescription'].map(process_text)
data_test['FullDescription'] = data_test['FullDescription'].map(process_text)

tfidf_enc = TfidfVectorizer(min_df=5)  # min_df - ignore terms that occur in fewer than 5 documents
X_train_tfidf = tfidf_enc.fit_transform(data_train['FullDescription'])
X_test_tfidf = tfidf_enc.transform(data_test['FullDescription'])

# LocationNormalized, ContractTime conversion:
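# A hedged sketch of the remaining steps (the original file is cut off here):
# fill the categorical gaps, one-hot encode them with DictVectorizer, stack the
# sparse blocks and fit Ridge. The fillna placeholder and the alpha value are
# assumptions about the intended pipeline.
from scipy.sparse import hstack

for col in ['LocationNormalized', 'ContractTime']:
    data_train[col] = data_train[col].fillna('nan')
    data_test[col] = data_test[col].fillna('nan')

dict_enc = DictVectorizer()
X_train_cat = dict_enc.fit_transform(
    data_train[['LocationNormalized', 'ContractTime']].to_dict('records'))
X_test_cat = dict_enc.transform(
    data_test[['LocationNormalized', 'ContractTime']].to_dict('records'))

X_train = hstack([X_train_tfidf, X_train_cat])
X_test = hstack([X_test_tfidf, X_test_cat])

ridge = Ridge(alpha=1)
ridge.fit(X_train, data_train['SalaryNormalized'])
print(ridge.predict(X_test))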
import pandas as pd
from sklearn.cluster import KMeans
from skimage.io import imread
from skimage import img_as_float
import pylab
import numpy as np
from ml_intro.custutils import to_path

# read the data as a numpy array with shape n * m * 3, where n and m are the image sizes
image = imread(to_path('parrots.jpg'))
# show the image
pylab.imshow(image)

# the skimage float format stores values in the range [0; 1]
img_float = img_as_float(image)

# Reshape the array into an objects-features matrix: every pixel is an object,
# every object has 3 features - R, G, B.
# reshape with -1: the total array size divided by the product of all other
# listed dimensions (in this case the last dim is 3)
X = img_float.reshape(-1, img_float.shape[-1])

# the data frame will be used to group pixels by cluster
clust_data = pd.DataFrame(X, columns=['R', 'G', 'B'])


# define PSNR - Peak signal-to-noise ratio
def psnr(y_true, y_pred):
    """Both params should be vectors, otherwise reshape(-1) is performed.
    For details see https://en.wikipedia.org/wiki/Peak_signal-to-noise_ratio
    In this implementation it is assumed that the image is encoded in float
    format, which means the max signal value is 1.0."""
    if len(y_true.shape) > 1:
        y_true = y_true.reshape(-1)
    if len(y_pred.shape) > 1:
        y_pred = y_pred.reshape(-1)
    mse = np.mean((y_true - y_pred) ** 2)
    return 10 * np.log10(1.0 / mse)
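# A hedged sketch of how the pieces above are typically combined (the original
# file is cut off inside psnr): cluster the pixels with KMeans, replace each
# pixel with its cluster's mean colour and measure the PSNR against the original
# image. The number of clusters and random_state are assumptions.
km = KMeans(n_clusters=8, init='k-means++', random_state=241)
clust_data['cluster'] = km.fit_predict(X)

# mean colour per cluster, broadcast back to every pixel of that cluster
mean_img = clust_data.groupby('cluster')[['R', 'G', 'B']].transform('mean').values

print(f'PSNR (mean colours): {psnr(X, mean_img):.2f}')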
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold
from ml_intro.custutils import to_path, cross_validate_est

# Random forest regression
# Predicting the age of a sea shell from naturally observed parameters

# data upload
data = pd.read_csv(to_path('abalone.csv'))

# transform the sex feature from text to number: F -> -1, I -> 0, M -> 1
data['Sex'] = data['Sex'].map(lambda s: -1 if s == 'F' else (0 if s == 'I' else 1))

# Separate y and X
y = data['Rings']
X = data.iloc[:, :-1]

# Cross-validation setup
kf = KFold(n_splits=5, shuffle=True, random_state=1)

# Create a Random Forest estimator for each number of trees
n_estimators_range = range(1, 51)
results = cross_validate_est(RandomForestRegressor, X, y, cv=kf, scoring='r2',
                             est_params={'random_state': 1},
                             par_key='n_estimators',
                             par_values=n_estimators_range)

# the number of trees that provides a score greater than 0.52
tree_num = (results['mean'] > 0.52).idxmax()
print(f'{tree_num}')
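# Optional visual check (not in the original script; assumes results['mean'] is
# a pandas Series indexed by n_estimators, as the idxmax lookup above implies):
# plot the mean cross-validated R^2 against the number of trees and mark the
# 0.52 threshold.
import matplotlib.pyplot as plt

plt.plot(results['mean'].index, results['mean'].values)
plt.axhline(0.52, linestyle='--', color='grey')
plt.xlabel('n_estimators')
plt.ylabel('mean R^2 (5-fold CV)')
plt.show()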