def main(): create_submission = True train_regression = False plot_roc = True print 'Reading data...', mosquitos_train_data, weather_data, spray_data = read_mosquitos_data('../input/') print 'Done' mosquitos_train_data_dropped = compact_train_data(mosquitos_train_data) print 'Preprocessing weather data...', weather_data = pre_process_weather(weather_data) print 'Done' # Construct feature vector from the train data: if train_regression: print 'Training mosquitoes predictor regressor...' + bcolors.WARNING train_regressor() print bcolors.ENDC + 'Done' else: print bcolors.OKBLUE + 'Skipped training regressor (assuming already done in the past)' + bcolors.ENDC print 'Constructing feature vectors...', features = create_feature_vector(mosquitos_train_data_dropped, weather_data, spray_data, verbose=1) labels = get_labels(mosquitos_train_data_dropped) print 'Done' # Train data: num_of_folds = 4 #kfold = StratifiedKFold(labels, n_folds=num_of_folds, shuffle=True) kfold = get_folds(mosquitos_train_data_dropped) estimator = create_pipeline() param_grid = {'n_estimators': [500, 750, 1200, 2000], 'min_samples_split': [30, 40, 50]} # param_grid = {'n_estimators': [50, 100, 250, 500, 1000], 'max_depth': [6, 4], 'learning_rate': [0.01, 0.05, 0.1, 0.5], 'max_features': [1.0, 0.5]} # param_grid = {'C': [0.1, 1, 10, 100], 'gamma': [0.1, 1, 10]} # features = scale(features) # true_ratio = np.sum(labels == 1)/float(len(labels)) # false_ratio = np.sum(labels == 0)/float(len(labels)) # samples_weights = np.zeros(len(labels)) # samples_weights[labels == 1] = false_ratio # samples_weights[labels == 0] = true_ratio grid_search_cv = GridSearchCV(estimator, param_grid, scoring='roc_auc', n_jobs=8, cv=kfold, iid=False, verbose=1) #, # fit_params={'sample_weight': samples_weights}) #todo: decide about iid parameter... print 'Training on train data...', print bcolors.WARNING grid_search_cv.fit(features, labels) print bcolors.ENDC print 'Done' print bcolors.HEADER + bcolors.UNDERLINE + '\nClassifier scores:' + bcolors.ENDC print_cv_conclusions(grid_search_cv, features, labels) estimator = grid_search_cv.best_estimator_ if plot_roc: for i, (train, test) in enumerate(kfold): print 'Fitting fold number', i probas = estimator.fit(features[train], labels[train]).predict_proba(features[test]) # Compute ROC curve and area the curve fpr, tpr, thresholds = roc_curve(labels[test], probas[:, 1]) # mean_tpr += interp(mean_fpr, fpr, tpr) # mean_tpr[0] = 0.0 roc_auc = auc(fpr, tpr) plt.plot(fpr, tpr, lw=1, label='ROC fold %d (area = %0.2f)' % (i, roc_auc)) plt.plot([0, 1], [0, 1], '--', color=(0.6, 0.6, 0.6), label='Luck') # mean_tpr /= len(cv) # mean_tpr[-1] = 1.0 # mean_auc = auc(mean_fpr, mean_tpr) # plt.plot(mean_fpr, mean_tpr, 'k--', # # label='Mean ROC (area = %0.2f)' % mean_auc, lw=2) plt.xlim([-0.05, 1.05]) plt.ylim([-0.05, 1.05]) plt.xlabel('False Positive Rate') plt.ylabel('True Positive Rate') plt.title('Receiver operating characteristic') plt.legend(loc="lower right") plt.show() print 'Refitting model to the entire train data' estimator.fit(features, labels) # refit estimator to the entire data set if create_submission: print bcolors.HEADER + bcolors.UNDERLINE + '\nCreating submission file' + bcolors.ENDC mosquitoes_test_data = pd.read_csv('../input/test.csv') relevant_mosquitoes = (mosquitoes_test_data['Species'] != 'CULEX PIPIENS') & \ (mosquitoes_test_data['Species'] != 'CULEX RESTUANS') & \ (mosquitoes_test_data['Species'] != 'CULEX PIPIENS/RESTUANS') relevant_mosquitoes = np.nonzero(relevant_mosquitoes)[0] print 'Extracting test features...', test_features = create_feature_vector(mosquitoes_test_data, weather_data, spray_data) print 'Done' print 'Predicting probabilities...' end_pointer = 0 start_pointer = 0 m = test_features.shape[0] output_dataframe = pd.DataFrame(columns=['Id', 'WnvPresent']) output_dataframe.to_csv('../output/out.csv', index=False) while end_pointer < m: end_pointer = min(m, start_pointer + 10000) probabilities = np.array(estimator.predict_proba(test_features[start_pointer:end_pointer, :]))[:, 1] # Force zero probability for rare mosquitoes types indexes = relevant_mosquitoes[(relevant_mosquitoes >= start_pointer) & (relevant_mosquitoes < end_pointer)] probabilities[indexes - start_pointer] = 0 ids = np.arange(start_pointer+1, end_pointer+1) output_dataframe = pd.DataFrame(np.column_stack((ids, probabilities)), columns=['Id', 'WnvPresent']) output_dataframe[['Id']] = output_dataframe[['Id']].astype(int) start_pointer = end_pointer # Write to file: output_dataframe.to_csv('../output/out.csv', index=False, mode='a', header=False) print 'Finished ', end_pointer, ' items out of ', m print bcolors.BOLD + bcolors.OKBLUE + 'Submission file ready :)' + bcolors.ENDC
def separated_gaussian_model(): # Arranging the data. # todo: write this in a prettier way (using groupby etc.) an put it in a function mosquitoes_data, weather_data, spray_data = read_mosquitos_data("../input/") weather_data = pre_process_weather(weather_data) # Extract temporal data: dates = np.array([x.split("-") for x in mosquitoes_data["Date"]]) years = np.array(dates[:, 0], dtype=np.int) unique_years = np.unique(years) months = np.array(dates[:, 1], dtype=np.int) days = np.array(dates[:, 2], dtype=np.int) days_in_month = 31 * np.ones(months.shape, dtype=np.int) days_in_month[(months == 6) + (months == 9)] = 30 week_num = (days + days_in_month * (months - 5)) / 7 detailed_temporal_data = mosquitoes_data[["NumMosquitos", "WnvPresent"]] detailed_temporal_data["WeekNum"] = pd.Series(week_num, index=mosquitoes_data.index) detailed_temporal_data["Year"] = pd.Series(years, index=mosquitoes_data.index) temporal_data = detailed_temporal_data.groupby(["Year", "WeekNum"], as_index=False).sum() # Extract weather data: weather_dates = np.array([x.split("-") for x in weather_data["Date"]]) weather_years = np.array(weather_dates[:, 0], dtype=np.int) weather_months = np.array(weather_dates[:, 1], dtype=np.int) weather_days = np.array(weather_dates[:, 2], dtype=np.int) weather_days_in_month = 31 * np.ones(weather_months.shape, dtype=np.int) weather_days_in_month[(weather_months == 6) + (weather_months == 9)] = 30 weather_week_num = (weather_days + weather_days_in_month * (weather_months - 5)) / 7 weather_data["WeekNum"] = pd.Series(weather_week_num, index=weather_data.index) weather_data["Year"] = pd.Series(weather_years, index=weather_data.index) relevant_weather_data = weather_data.groupby(["Year", "WeekNum"], as_index=False).sum() relevant_weather_data["Station"] /= 3 relevant_weather_data["Station"] *= 2 relevant_weather_data[["Tmax", "Tmin", "Depart", "DewPoint", "WetBulb", "PrecipTotal"]] = relevant_weather_data[ ["Tmax", "Tmin", "Depart", "DewPoint", "WetBulb", "PrecipTotal"] ].div(relevant_weather_data["Station"], axis="index") temporal_data[["Tmax", "Tmin", "Depart", "DewPoint", "WetBulb", "PrecipTotal"]] = relevant_weather_data[ ["Tmax", "Tmin", "Depart", "DewPoint", "WetBulb", "PrecipTotal"] ] # print temporal_data # print temporal_data[['NumMosquitos', 'PrecipTotal']] plt.figure() for i, y in enumerate(unique_years): plt.title("Num of mosquitoes and total percip") plt.subplot(2, 4, i + 1) year = np.nonzero(np.array(temporal_data["Year"] == y))[0] plt.plot(temporal_data["WeekNum"][year], temporal_data["NumMosquitos"][year]) plt.ylim([0, 12000]) plt.xlim([0, 25]) plt.subplot(2, 4, i + 5) plt.plot(temporal_data["WeekNum"][year], temporal_data["PrecipTotal"][year]) plt.ylim([0, 1]) plt.xlim([0, 25]) # plt.show() # Extract geo data: mosquitoes_data["Trap"] = np.array([x[1:4] for x in mosquitoes_data["Trap"]], dtype=np.float) mosquitoes_data["Year"] = pd.Series(years, index=mosquitoes_data.index) mosquitoes_yearly_sum = ( mosquitoes_data[["NumMosquitos", "WnvPresent", "Year"]].groupby(["Year"], as_index=False).sum() ) sum_column = None for i, year in enumerate(unique_years): if sum_column is None: sum_column = (mosquitoes_data["Year"] == year) * mosquitoes_yearly_sum["NumMosquitos"][i] else: sum_column += (mosquitoes_data["Year"] == year) * mosquitoes_yearly_sum["NumMosquitos"][i] mosquitoes_data["NumMosquitos"] = mosquitoes_data["NumMosquitos"].div(sum_column) spatial_data = ( mosquitoes_data[["Trap", "Longitude", "Latitude", "NumMosquitos", "WnvPresent"]] .groupby(["Trap", "Longitude", "Latitude"], as_index=False) .sum() ) plt.figure() plt.title("Num mosquitoes VS WNV Presence") num_mosquitoes = np.array(spatial_data[["NumMosquitos", "WnvPresent"]]) plt.plot(num_mosquitoes[:, 0]) plt.plot(num_mosquitoes[:, 1] / np.max(num_mosquitoes[:, 1]), color="r") # plt.show() # Fit temporal data x = np.array(temporal_data[["WeekNum", "Tmax", "Tmin", "Depart", "DewPoint", "WetBulb", "PrecipTotal"]]) x = PolynomialFeatures(2).fit_transform(x) y = np.array(temporal_data["NumMosquitos"]) temporal_regressor = RidgeCV(alphas=np.array([0.01, 0.05, 0.1, 0.5, 1, 5, 10]), scoring="mean_absolute_error") temporal_regressor.fit(x, y) print temporal_regressor.alpha_ print temporal_regressor.coef_ print len(temporal_regressor.coef_) predictions = temporal_regressor.predict(x) plt.figure() plt.plot(y) plt.plot(predictions, color="r") with open( "../output/mosquitoes_count_temporal_regression.pickle", "wb" ) as out_file: # This is the generalized regressor pickle.dump(temporal_regressor, out_file) # Fit spatial data # Init Params longitudes = np.array(spatial_data["Longitude"]) latitudes = np.array(spatial_data["Latitude"]) x = compress_features(longitudes, latitudes) y = np.array(spatial_data["NumMosquitos"]) longitudes_frame = (-88, -87.5) latitudes_frame = (41.6, 42.1) num_of_spatial_centroids = 50 spatial_alpha_vec = np.zeros(num_of_spatial_centroids) # np.random.uniform(-1, 1, num_of_spatial_centroids) spatial_sigma_vec = np.var((-88, -87.5)) * np.ones(num_of_spatial_centroids, dtype=float) spatial_mean_vec = np.column_stack( ( np.random.uniform(longitudes_frame[0], longitudes_frame[1], num_of_spatial_centroids), np.random.uniform(latitudes_frame[0], latitudes_frame[1], num_of_spatial_centroids), ) ) l2_regularization = 0.1 theta_init = compress_params(spatial_alpha_vec, spatial_mean_vec, spatial_sigma_vec) gradient_check(theta_init, x, y, l2_regularization) theta_optimum = fmin_bfgs( calculate_cost, theta_init, fprime=calculate_gradient, args=(x, y, l2_regularization), maxiter=5e4 ) print theta_optimum spatial_alpha_vec, spatial_mean_vec, spatial_sigma_vec = span_params(theta_optimum) print "Mean absolut error: ", (2 * calculate_cost(theta_optimum, x, y, 0) / len(y)) ** 0.5 with open( "../output/mosquitoes_count_spatial_gaussian_regression.pickle", "wb" ) as out_file: # This is the generalized regressor pickle.dump(theta_optimum, out_file) mapdata = np.loadtxt("../input/mapdata_copyright_openstreetmap_contributors.txt") aspect = mapdata.shape[0] * 1.0 / mapdata.shape[1] lon_lat_box = (-88, -87.5, 41.6, 42.1) plt.figure("spatial", figsize=(10, 14)) plt.imshow(mapdata, cmap=plt.get_cmap("gray"), extent=lon_lat_box, aspect=aspect) levels = [0.2, 0.4, 0.6, 0.8, 1.0] lon = np.linspace(-88, -87.5, 1000) lat = np.linspace(41.6, 42.1, 1000) [A, B] = np.meshgrid(lon, lat) g = calculate_spatial_dist( A.reshape(-1), B.reshape(-1), spatial_mean_vec, spatial_sigma_vec, spatial_alpha_vec ).reshape(len(lon), len(lat)) # g = np.zeros((len(lon), len(lat))) # for i in xrange(len(lon)): # for j in xrange(len(lat)): # g[i, j] = calculate_spatial_dist(lon[i], lat[j], spatial_mean_vec, spatial_sigma_vec, spatial_alpha_vec) # contour the gridded data, plotting dots at the randomly spaced data points. # CS = plt.contour(A, B, g, len(levels),linewidths=0.5,colors='k', levels=levels) CS = plt.contour(A, B, g, np.linspace(-0.1, 0.4, 1000)) plt.colorbar() plt.show()