def make_final_sets():
    X_train, X_test, y_train, y_test = get_cleaned_train_test_df()
    full_pipeline = make_final_transformation_pipe()
    X_train_processed_values = full_pipeline.fit_transform(X_train)
    X_test_processed_values = full_pipeline.transform(X_test)
    # Add column names to build the processed dataframe
    region_ohe_features = list(
        full_pipeline.named_transformers_["nom"].get_feature_names())
    column_names = CONTINUOUS_FEATURES + ORDINAL_FEATURES + region_ohe_features
    X_train_processed = pd.DataFrame(X_train_processed_values,
                                     columns=column_names)
    X_test_processed = pd.DataFrame(X_test_processed_values,
                                    columns=column_names)
    # Drop one of the ohe features to limit correlations in the data set
    for df in (X_train_processed, X_test_processed):
        df.drop("x0_EUROPE", axis=1, inplace=True)
    # Save the data
    df_train_processed = X_train_processed.join(y_train.reset_index(drop=True))
    df_train_processed.to_pickle(data_path("processed", "train_processed.pkl"))
    df_test_processed = X_test_processed.join(y_test.reset_index(drop=True))
    df_test_processed.to_pickle(data_path("processed", "test_processed.pkl"))
    return df_train_processed, df_test_processed
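# Minimal usage sketch for make_final_sets, assuming the pickles above were
# written by a prior run; the "mpg" target column name comes from the other
# auto-mpg snippets in this collection and is an assumption here.
import pandas as pd

df_train, df_test = make_final_sets()
# The same frames can be reloaded later without re-running the pipeline:
df_train = pd.read_pickle(data_path("processed", "train_processed.pkl"))
X_train, y_train = df_train.drop("mpg", axis=1), df_train["mpg"]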
def transform_address_test():
    from src.utils import data_path
    import pandas as pd
    train_path = data_path('train.csv')
    train_frame = pd.read_csv(train_path)
    print(train_frame['Address'].apply(__address_to_abbs))
def transform_set(name, train=True):
    train_path = data_path(name)
    train_frame = pd.read_csv(train_path)
    if train:
        del train_frame['Descript']
        del train_frame['Resolution']
        del train_frame['Address']
    train_frame['X'] = normalize_features(train_frame['X'])
    train_frame['Y'] = normalize_features(train_frame['Y'])
    train_frame['Times'] = train_frame['Dates'].apply(transform_normalized_time)
    train_frame['Year'] = train_frame['Dates'].apply(transform_data_to('year'))
    train_frame['Month'] = train_frame['Dates'].apply(transform_data_to('month'))
    del train_frame['Dates']
    transformer = OneHotTransformer(categorical(train_frame),
                                    train_frame.columns)
    transformer.fit(train_frame)
    result = transformer.transform_frame(train_frame)
    not_regex = "^Dates|^PdDistrict|^DayOfWeek|^Resolution|^X|^Y"
    train_transformed = result.filter(regex=not_regex)
    label_transformed = None
    if train:
        label_transformed = result.filter(regex="^Category")
    return train_transformed, label_transformed
def transform_set(name, train=True):
    train_path = data_path(name)
    train_frame = pd.read_csv(train_path)
    categories = None
    if train:
        categories = train_frame['Category']
        del train_frame['Descript']
        del train_frame['Resolution']
        del train_frame['Category']
        del train_frame['Address']
    if not train:
        del train_frame['Id']
    train_frame['X'] = normalize_features(train_frame['X'])
    train_frame['Y'] = normalize_features(train_frame['Y'])
    train_frame['Times'] = train_frame['Dates'].apply(
        transform_normalized_time)
    # train_frame = transform_address(train_frame)
    # train_frame['Year'] = train_frame['Dates'].apply(transform_data_to('year'))
    # train_frame['Month'] = train_frame['Dates'].apply(transform_data_to('month'))
    del train_frame['Dates']
    transformer = OneHotTransformer(categorical(train_frame),
                                    train_frame.columns)
    transformer.fit(train_frame)
    train_transformed = transformer.transform_frame(train_frame)
    return train_transformed, categories
def create_submission(prediction, file_name):
    with open(data_path(file_name), 'w') as f:
        f.write("{},{}\n".format("id", ",".join(classes)))
        for i in range(len(prediction)):
            f.write("{},{}\n".format(
                str(i), ",".join([str(j) for j in prediction[i]])))
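# Hedged usage sketch for create_submission: `classes` must be in scope as the
# list of category names (see the baseline script further down); the
# classifier and the X_train/y_train/X_test names are placeholders here.
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression()
clf.fit(X_train, y_train)                   # assumed prepared feature matrices
probs = clf.predict_proba(X_test)           # one row of class probabilities per sample
create_submission(probs, 'submission.csv')  # header "id,<classes>", then one row per sample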
def load_raw_data(file_name="auto-mpg.data"):
    file_path = data_path("raw", file_name)
    return pd.read_csv(
        file_path,
        delim_whitespace=True,
        header=None,
        names=[
            "mpg",
            "cylinders",
            "displacement",
            "horsepower",
            "weight",
            "acceleration",
            "year",
            "origin",
            "name",
        ],
    )
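# Quick check of load_raw_data: the auto-mpg file is whitespace-delimited with
# no header row, so the nine names above are assigned positionally.
df_raw = load_raw_data()
print(df_raw.shape)   # expect 9 columns
print(df_raw.dtypes)  # "horsepower" may load as object if it contains '?' markers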
def transform_set(name, train=True):
    train_path = data_path(name)
    train_frame = pd.read_csv(train_path)
    categories = None
    if train:
        categories = train_frame['Category']
        del train_frame['Descript']
        del train_frame['Resolution']
        del train_frame['Category']
        del train_frame['Address']
    if not train:
        del train_frame['Id']
    train_frame['X'] = normalize_features(train_frame['X'])
    train_frame['Y'] = normalize_features(train_frame['Y'])
    train_frame['Times'] = train_frame['Dates'].apply(transform_normalized_time)
    # train_frame = transform_address(train_frame)
    train_frame['Year'] = train_frame['Dates'].apply(transform_data_to('year'))
    # train_frame['Month'] = train_frame['Dates'].apply(transform_data_to('month'))
    del train_frame['Dates']
    transformer = OneHotTransformer(categorical(train_frame),
                                    train_frame.columns)
    transformer.fit(train_frame)
    train_transformed = transformer.transform_frame(train_frame)
    label_transformed = None
    if train:
        # Encode each category name as a stable integer label
        values = sorted(set(categories))
        mapping = {value: index for index, value in enumerate(values)}
        label_transformed = [mapping[cat] for cat in categories]
    return train_transformed, label_transformed
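# The sorted-set mapping above is equivalent to sklearn's LabelEncoder, shown
# here as a sketch; using the encoder gives an inverse_transform for free.
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
labels = encoder.fit_transform(['ASSAULT', 'ARSON', 'ASSAULT'])  # -> [1, 0, 1]
print(encoder.inverse_transform(labels))  # recovers the original names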
import pandas as pd
from src.utils import data_path
from collections import Counter

df = pd.read_csv(data_path('train.csv'))

# columns = list(df.columns)
#
# from itertools import groupby
# import re
#
# grouped = groupby(columns, key=lambda name: re.split('_|[0-9]', name)[0])
#
# for key, group in grouped:
#     print(key, list(group))


def desc(name):
    var = df[[name]]
    varV = df[name]
    print(set(varV))
    print(var.describe())
    print(len(var))
    print(len(var[varV == -999999]))
    counter = Counter(varV)
    print(counter.most_common())

# desc('var3')
# desc('var36')
from sklearn.linear_model import LogisticRegression
import pandas as pd
from src.submission import make_submission
from src.utils import data_path
from sklearn import cross_validation

df = pd.read_csv(data_path('train.csv'))
df_test = pd.read_csv(data_path('test.csv'))

clf = LogisticRegression()
target = df['TARGET']
del df['TARGET']

scores = cross_validation.cross_val_score(clf, df, target, cv=5,
                                          scoring='log_loss')
print(scores)
print(scores.mean())

clf.fit(df, target)
print(len(df_test))
print(len(clf.predict_proba(df_test)))
# Keep only the positive-class probability from each predict_proba row
prediction = [pred for _, pred in clf.predict_proba(df_test)]
# make_submission('baseline.csv', df_test['ID'], prediction)
def get_cleaned_train_test_df():
    clean_data_path = data_path("interim", "data_cleaned.pkl")
    df = pd.read_pickle(clean_data_path)
    X = df.drop("mpg", axis=1)
    y = df["mpg"]
    return train_test_split(X, y, test_size=0.2, random_state=42, shuffle=True)
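# Sketch of the expected split: train_test_split returns the four sets in this
# order, with the 80/20 split fixed by random_state=42 for reproducibility.
X_train, X_test, y_train, y_test = get_cleaned_train_test_df()
print(len(X_train), len(X_test))  # roughly a 4:1 ratio of rows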
import re

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from src.utils import data_path, setup
import pandas as pd

setup()
train_path = data_path('train.csv')
train_frame = pd.read_csv(train_path)
train_frame['Descript'] = train_frame['Descript'].apply(
    lambda des: re.sub(r'[(),]', '', des))

text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', MultinomialNB()),
                     ])
text_clf = text_clf.fit(train_frame['Descript'], train_frame['Category'])

print(train_frame.loc[0, 'Descript'], train_frame.loc[0, 'Category'])
prediction = text_clf.predict_proba(train_frame['Descript'])
print(prediction[0])
print(text_clf.classes_)

# TODO: validate
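# The TODO above asks for validation; a minimal sketch using cross_val_score
# on the same pipeline. Log loss matches the competition metric used
# elsewhere in this collection; the cv value is an arbitrary choice.
from sklearn.cross_validation import cross_val_score

scores = cross_val_score(text_clf, train_frame['Descript'],
                         train_frame['Category'], cv=3, scoring='log_loss')
print(scores.mean())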
def make_submission(file_name, test_id, prediction):
    with open(data_path(file_name), 'w') as f:
        f.write("ID,TARGET\n")
        for id, pred in zip(test_id, prediction):
            if hasattr(pred, '__iter__'):
                # predict_proba rows: keep only the positive-class probability
                pred = pred[1]
            f.write('{},{}\n'.format(str(id), str(pred)))
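# Hedged example of make_submission on binary predict_proba output; clf and
# df_test are assumed to be the fitted model and test frame from the
# logistic-regression baseline script above.
probs = clf.predict_proba(df_test)      # each row: (P(class 0), P(class 1))
make_submission('baseline.csv', df_test['ID'], probs)  # keeps pred[1] per row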
def create_dataset(paths):
    frames = [pd.read_csv(data_path(path)) for path in paths]
    return pd.concat(frames, axis=1)
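# Usage sketch: create_dataset joins per-classifier output files column-wise
# (axis=1), aligning rows by position; the file names are illustrative.
stacked = create_dataset(['clf_a_probs.csv', 'clf_b_probs.csv'])
print(stacked.shape)  # row count of the inputs, columns from both files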
# from sklearn.externals import joblib
import numpy as np
import pandas as pd

# joblib.dump(clf, 'filename.pkl')
from src.utils import data_path

train = pd.read_csv(data_path('train.csv'))
labels = train['Category']
del train

classifiers_outputs_train = ['train1.pkl']
classifiers_outputs_test = ['test1.pkl']


def create_dataset(paths):
    frames = [pd.read_csv(data_path(path)) for path in paths]
    return pd.concat(frames, axis=1)


train_set = create_dataset(classifiers_outputs_train)
train_set.to_csv('join_train.csv')

test_set = create_dataset(classifiers_outputs_test)
test_set.to_csv('join_test.csv')
from sklearn.preprocessing import MultiLabelBinarizer
from collections import Counter
from src.submission import create_submission
from src.utils import data_path, setup
import pandas as pd
import numpy as np

setup(pd)


def to_singleton(iterable):
    return [[elem] for elem in iterable]


train_path = data_path('train.csv')
train_frame = pd.read_csv(train_path)

submission_size = 884262
classes = ['ARSON', 'ASSAULT', 'BAD CHECKS', 'BRIBERY', 'BURGLARY',
           'DISORDERLY CONDUCT', 'DRIVING UNDER THE INFLUENCE',
           'DRUG/NARCOTIC', 'DRUNKENNESS', 'EMBEZZLEMENT', 'EXTORTION',
           'FAMILY OFFENSES', 'FORGERY/COUNTERFEITING', 'FRAUD', 'GAMBLING',
           'KIDNAPPING', 'LARCENY/THEFT', 'LIQUOR LAWS', 'LOITERING',
           'MISSING PERSON', 'NON-CRIMINAL', 'OTHER OFFENSES',
           'PORNOGRAPHY/OBSCENE MAT', 'PROSTITUTION', 'RECOVERED VEHICLE',
           'ROBBERY', 'RUNAWAY', 'SECONDARY CODES', 'SEX OFFENSES FORCIBLE',
           'SEX OFFENSES NON FORCIBLE', 'STOLEN PROPERTY', 'SUICIDE',
           'SUSPICIOUS OCC', 'TREA', 'TRESPASS', 'VANDALISM', 'VEHICLE THEFT',
           'WARRANTS', 'WEAPON LAWS']

category = train_frame['Category']
mapping = {clazz: num for (num, clazz) in enumerate(classes)}
most_freq_class = Counter(category).most_common()[0][0]
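# A hedged guess at where this script was headed: a constant baseline that
# one-hot encodes the most frequent class for every test row, using the
# to_singleton helper with MultiLabelBinarizer. The output file name and the
# use of submission_size are assumptions.
binarizer = MultiLabelBinarizer(classes=classes)
one_hot = binarizer.fit_transform(to_singleton([most_freq_class]))[0]
prediction = [one_hot] * submission_size
create_submission(prediction, 'most_frequent_baseline.csv')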
import numpy as np
from sklearn import cross_validation
from sklearn.decomposition import PCA, KernelPCA
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.grid_search import GridSearchCV
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from src.utils import data_path

df = pd.read_csv(data_path('train.csv'))
df_test = pd.read_csv(data_path('test.csv'))

target = df['TARGET']
del df['TARGET']
# del df['ID']
id = df_test['ID']
# del df_test['ID']

pca = PCA(n_components=250)
train_pcaed = pca.fit_transform(df, target)

random_forest = RandomForestClassifier(n_estimators=30, max_depth=5,
                                       max_features=20)
random_forest.fit(train_pcaed, target)
forested = random_forest.predict_proba(train_pcaed)

# pipe = Pipeline(steps=[('pca', pca), ('random_forest', random_forest)])
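# Sketch of the Pipeline/GridSearchCV combination that the commented-out line
# and the unused imports point toward; the parameter grid values here are
# illustrative assumptions, not tuned choices.
pipe = Pipeline(steps=[('pca', pca), ('random_forest', random_forest)])
param_grid = {
    'pca__n_components': [100, 250],
    'random_forest__max_depth': [3, 5],
}
search = GridSearchCV(pipe, param_grid, scoring='log_loss', cv=3)
search.fit(df, target)
print(search.best_params_, search.best_score_)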
from src.utils import data_path, setup
import pandas as pd
import numpy as np

setup(pd)

train_path = data_path('train.csv')
train_frame = pd.read_csv(train_path)

columns = list(train_frame.columns)
print(columns)

# print(train_frame['Address'].describe())
# no nulls in data
print(train_frame.loc[0:4])
print(set(train_frame['DayOfWeek']))

# print(train_frame[['X', 'Y']].describe())
#
# from sklearn.preprocessing import Normalizer, maxabs_scale, minmax_scale
#
# normalizer = Normalizer()
#
# X = train_frame[['X']]
#
# normalized = normalizer.fit_transform(train_frame[['X']])
#
# print(np.unique(normalized))