Example #1
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold

import itertools

from ml_intro.custutils import cross_validate_est, to_path

# load the data
features = pd.read_csv(to_path('final_statement\\features.csv'),
                       index_col='match_id')
# remove match results features
res_features = [
    'duration', 'radiant_win', 'tower_status_radiant', 'tower_status_dire',
    'barracks_status_radiant', 'barracks_status_dire'
]
feat_wo_res = features.drop(columns=res_features)
# find features with missing values
col_with_skips = feat_wo_res.isnull().sum()
col_with_skips = col_with_skips[col_with_skips > 0]
print(f"Features with missing values:\n{list(col_with_skips.index)}")
# Short explanation: all of these features describe events that may not have
# happened during the first 5 minutes of the game.
# The whole set of features with gaps can be split into three groups, with an
# explanation for each group:
# 1) features related to the first blood event - the event may not have
#    occurred within the given time window (the first 5 minutes of the game).
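
# A minimal sketch of the next step implied by the imports above: fill the
# gaps and cross-validate a gradient boosting classifier. Assumptions (not
# from this listing): 'radiant_win' is used as the target, missing values are
# replaced with 0, and the tree-count grid is purely illustrative.
X = feat_wo_res.fillna(0)
y = features['radiant_win']
kf = KFold(n_splits=5, shuffle=True, random_state=42)
gb_results = cross_validate_est(GradientBoostingClassifier,
                                X,
                                y,
                                cv=kf,
                                scoring='roc_auc',
                                est_params={'random_state': 42},
                                par_key='n_estimators',
                                par_values=[10, 20, 30])
print(gb_results)
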
Example #2
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA

from ml_intro.custutils import to_path

# columns: date, then close prices for thirty companies
close_prices = pd.read_csv(to_path('close_prices.csv'), index_col='date')
# Fit PCA, i.e. learn a projection of the original 30-feature space onto an
# n_components-dimensional space
pca = PCA(n_components=10)
pca.fit(close_prices)
# How many components are needed to explain 90% of the variance:
# NB! argmax stops at the first maximal value and returns its index
expl = pca.explained_variance_ratio_
np.argmax(np.cumsum(expl) > 0.9) + 1
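# A tiny illustration of the argmax-over-cumsum trick on a toy array (purely
# illustrative values, not part of the original data):
toy = np.array([0.5, 0.3, 0.15, 0.05])
# cumsum -> [0.5, 0.8, 0.95, 1.0]; (cumsum > 0.9) -> [F, F, T, T];
# argmax returns index 2, so 2 + 1 = 3 components cover over 90% of variance
assert np.argmax(np.cumsum(toy) > 0.9) + 1 == 3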
# Project the original data onto the components and take the first one
approx = [row[0] for row in pca.transform(close_prices)]
# Load the Dow Jones index
djia_data = pd.read_csv(to_path('djia_index.csv'), index_col='date')
djia_data['approx'] = approx
# Calculate the Pearson correlation
djia_data.corr()
print(f'Pearson correlation: {djia_data.corr().iloc[0, 1]:.2f}')
# another way using numpy
np.corrcoef(djia_data, rowvar=False)
# Company with the largest weight in the first component
cmp = close_prices.columns[np.argmax(pca.components_[0])]
print(f'Company with the largest weight in the first component: {cmp}')

Example #3
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.metrics import log_loss

from ml_intro.custutils import to_path, sigmoid

# load data: the dataset describes the biological response of different
# molecules. The first column indicates whether a response occurred; the
# other columns (d1 - d1776) describe characteristics of the molecules such
# as shape, size, etc.
gbm_df = pd.read_csv(to_path('gbm-data.csv'))
# Split into features X and target y
X = gbm_df.iloc[:, 1:]
y = gbm_df.iloc[:, 0]
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.8,
                                                    random_state=241)
# For each learning-rate coefficient:
#   fit the classifier to the training data
#   get the decision function for each iteration of gradient boosting
#   transform it with a sigmoid and use it to compute the quality on the
#   train and test sets
#   plot the log-loss


def get_log_loss(clf_instance, X, y):
    """Calls <staged_decision_function(X)>, then iterates over the returned
    generator. On every iteration the generator yields the decision function
    values of the ensemble built from the first i trees; these are mapped to
    probabilities with a sigmoid and scored with log_loss against y."""
    return [
        log_loss(y, sigmoid(scores.ravel()))
        for scores in clf_instance.staged_decision_function(X)
    ]
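

# A minimal sketch of the learning-rate loop described in the comments above.
# The grid of learning rates and n_estimators=250 are assumptions made for
# illustration, not values taken from this listing.
for lr in [1, 0.5, 0.3, 0.2, 0.1]:
    clf = GradientBoostingClassifier(n_estimators=250,
                                     learning_rate=lr,
                                     random_state=241)
    clf.fit(X_train, y_train)
    train_loss = get_log_loss(clf, X_train, y_train)
    test_loss = get_log_loss(clf, X_test, y_test)
    plt.plot(train_loss, label=f'train, lr={lr}')
    plt.plot(test_loss, label=f'test, lr={lr}')
plt.xlabel('iteration')
plt.ylabel('log-loss')
plt.legend()
plt.show()
# iteration with the minimal test log-loss for the last learning rate
print(f'min test log-loss {min(test_loss):.2f} '
      f'at iteration {int(np.argmin(test_loss)) + 1}')
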
Example #4
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import Ridge
import re
import scipy

from ml_intro.custutils import to_path

# data format:
# FullDescription - large free text; LocationNormalized - categorical town or
# other place; ContractTime - categorical type of vacancy (may contain NaNs);
# SalaryNormalized - the target (normalized salary)
data_train = pd.read_csv(to_path('salary-train.csv'))
data_test = pd.read_csv(to_path('salary-test-mini.csv'))
# FullDescription conversion:
#   replace everything that is not a letter or digit with spaces and convert
#   the text to lowercase
#   convert it with TfidfVectorizer into a sparse matrix with one row per
#   description and a large number of feature columns


def process_text(text):
    return re.sub('[^a-zA-Z0-9]', ' ', text.lower())


data_train['FullDescription'] = data_train['FullDescription'].map(process_text)
data_test['FullDescription'] = data_test['FullDescription'].map(process_text)
tfidf_enc = TfidfVectorizer(min_df=5)
# min_df=5 - ignore terms that appear in fewer than 5 descriptions
X_train_tfidf = tfidf_enc.fit_transform(data_train['FullDescription'])
X_test_tfidf = tfidf_enc.transform(data_test['FullDescription'])
# LocationNormalized, ContractTime conversion:
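# A minimal sketch of the categorical conversion and the final Ridge model.
# Assumptions (not from this listing): the categorical columns are named
# 'LocationNormalized' and 'ContractTime', their gaps are filled with the
# string 'nan', and Ridge(alpha=1) is trained on the stacked sparse matrix
# with 'SalaryNormalized' as the target.
from scipy.sparse import hstack

cat_cols = ['LocationNormalized', 'ContractTime']
for col in cat_cols:
    data_train[col] = data_train[col].fillna('nan')
    data_test[col] = data_test[col].fillna('nan')
dict_enc = DictVectorizer()
X_train_cat = dict_enc.fit_transform(data_train[cat_cols].to_dict('records'))
X_test_cat = dict_enc.transform(data_test[cat_cols].to_dict('records'))
# stack the text and categorical features into one sparse matrix
X_train_all = hstack([X_train_tfidf, X_train_cat])
X_test_all = hstack([X_test_tfidf, X_test_cat])
ridge = Ridge(alpha=1, random_state=241)
ridge.fit(X_train_all, data_train['SalaryNormalized'])
print(ridge.predict(X_test_all))
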
Example #5
import pandas as pd
from sklearn.cluster import KMeans
from skimage.io import imread
from skimage import img_as_float
import pylab
import numpy as np

from ml_intro.custutils import to_path

# read the data as a numpy array of shape n * m * 3, where n and m are the
# image dimensions
image = imread(to_path('parrots.jpg'))
# show image
pylab.imshow(image)
# skimage float format is value in range [0; 1]
img_float = img_as_float(image)
# Reshape the array into an objects-features matrix: every pixel is an
# object with 3 features - R, G, B
# reshape with -1: the total array size divided by the product of all other
# listed dimensions (in this case the last dim is 3)
X = img_float.reshape(-1, img_float.shape[-1])
# the DataFrame will be used to group pixels by cluster
clust_data = pd.DataFrame(X, columns=['R', 'G', 'B'])
# define psnr - Peak signal-to-noise ratio


def psnr(y_true, y_pred):
    """Both params should be vectors, otherwise reshape(-1) is performed.
    For details see https://en.wikipedia.org/wiki/Peak_signal-to-noise_ratio
    This implementation assumes the image is encoded in float format, which
    means the maximum signal value is 1.0"""
    if len(y_true.shape) > 1:
        y_true = y_true.reshape(-1)
    if len(y_pred.shape) > 1:
        y_pred = y_pred.reshape(-1)
    mse = np.mean((y_true - y_pred) ** 2)
    # with MAX = 1.0 the formula reduces to -10 * log10(MSE)
    return -10 * np.log10(mse)
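

# A minimal sketch of how the clustering step might look. Assumptions (not
# from this listing): 8 clusters, every pixel replaced with the mean colour
# of its cluster, and the quality measured with the psnr() defined above.
kmeans = KMeans(n_clusters=8, random_state=241)
clust_data['cluster'] = kmeans.fit_predict(X)
# replace every pixel with the mean colour of its cluster
mean_colors = clust_data.groupby('cluster')[['R', 'G', 'B']].transform('mean')
X_mean = mean_colors.to_numpy()
print(f'PSNR with mean cluster colours: {psnr(X, X_mean):.2f}')
pylab.imshow(X_mean.reshape(img_float.shape))
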
Example #6
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold

from ml_intro.custutils import to_path, cross_validate_est

# Random forest regression
# Predicting the age of abalone (sea snails) from observed physical parameters
# load the data
data = pd.read_csv(to_path('abalone.csv'))
# transform the sex feature from text to a number: F -> -1, I -> 0, M -> 1
data['Sex'] = data['Sex'].map(lambda s: -1
                              if s == 'F' else (0 if s == 'I' else 1))
# Separate y and X
y = data['Rings']
X = data.iloc[:, :-1]
# Cross validation setup
kf = KFold(n_splits=5, shuffle=True, random_state=1)
# Create Random Forest estimator
n_estimators_range = range(1, 51)
results = cross_validate_est(RandomForestRegressor,
                             X,
                             y,
                             cv=kf,
                             scoring='r2',
                             est_params={'random_state': 1},
                             par_key='n_estimators',
                             par_values=n_estimators_range)
# smallest number of trees that gives a score greater than 0.52
tree_num = (results['mean'] > 0.52).idxmax()
print(f'Minimum number of trees with R2 > 0.52: {tree_num}')
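
# For reference, an explicit equivalent of the helper call above, assuming
# that cross_validate_est simply averages cross-validation scores for every
# value of the varied parameter (an assumption about the custom helper).
from sklearn.model_selection import cross_val_score

mean_scores = pd.Series({
    n: cross_val_score(RandomForestRegressor(n_estimators=n, random_state=1),
                       X, y, cv=kf, scoring='r2').mean()
    for n in n_estimators_range
})
print((mean_scores > 0.52).idxmax())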