from scipy import linalg
import matplotlib.pyplot as plt
import matplotlib as mpl

from sklearn import mixture

# Synthetic two-component sample kept from the original example for reference;
# the script now reads real feature data from CSV instead.
# # Number of samples per component
# n_samples = 500
#
# # Generate random sample, two components
# np.random.seed(0)
# C = np.array([[0., -0.1], [1.7, .4]])
# X = np.r_[np.dot(np.random.randn(n_samples, 2), C),
#           .7 * np.random.randn(n_samples, 2) + np.array([-6, 3])]

# Load the feature table and fit on a random 10% sample of its rows.
df = load_csv_pandas(file_name="data_set/Feature15FillNull3S.csv")
sampled_df = df.sample(frac=0.1)
features = sampled_df[FEATURES]
features_preprocessed = do_preprocessing(features, method="standardization")
X = features_preprocessed

# `np.inf` instead of `np.infty`: the `infty` alias was removed in NumPy 2.0,
# while `np.inf` is available in every NumPy release.
lowest_bic = np.inf
bic = []
n_components_range = range(1, 10)
cv_types = ['spherical', 'tied', 'diag', 'full']

# Grid over every covariance structure and component count (1..9).
for cv_type in cv_types:
    for n_components in n_components_range:
        # Fit a mixture of Gaussians with EM
        # NOTE(review): `mixture.GMM` was removed in scikit-learn 0.20 in
        # favour of `mixture.GaussianMixture` -- confirm the pinned version
        # before upgrading.
        gmm = mixture.GMM(n_components=n_components, covariance_type=cv_type)
        gmm.fit(X)
# NOTE(review): the statements down to the `__main__` guard read names that are
# not defined in this view (test_X, all_ids, prediction, label, profile_df).
# They are presumably the tail of
# experiment_is_this_rmse_for_user_that_has_many_chunks, whose `def` line is
# outside this view -- confirm their indentation against that definition.

# Average the predictions per id and store them next to the true label.
unique_ids = test_X.index.unique().tolist()
result = {}
for profile_id in unique_ids:  # renamed from `id` to stop shadowing the builtin
    # Positions in `all_ids` that belong to this id.
    index_of_specific_id = [i for i, j in enumerate(all_ids) if j == profile_id]
    predictions_of_specific_id = prediction[index_of_specific_id]
    mean_prediction_of_specific_id = np.mean(predictions_of_specific_id)
    result[profile_id] = mean_prediction_of_specific_id

# Align the per-id means with profile_df's index, then record the mean
# prediction and its absolute difference from the true label.
result_series = pd.Series(result, index=profile_df.index.tolist())
profile_df["%s_pred" % label] = result_series
profile_df["%s_diff" % label] = (profile_df[label] - result_series).abs()


if __name__ == "__main__":
    # Per-profile chunk counts and profile attributes are cached on disk; the
    # database is only queried when the cache file does not exist yet.
    file_name = "data_set/profileId_Cnt"
    if os.path.isfile(file_name):
        # NOTE(review): the query branch below indexes by "profileId"; verify
        # that load_csv_pandas restores the same index when reading the cache.
        profile_df = load_csv_pandas(file_name)
    else:
        query = "select a .profileId, cnt , age, height, weight from (select profileId, count(*) as cnt from Feature15FillNull3S where movingTarget='SELF_MOVING' and positionTarget= 'SELF_BAG' and ioTarget = 'outdoor' group by profileId) as a left join ( select profileId, age, height, weight from Profile as p ) as pp on a.profileId = pp.profileId;"
        ws4_ohp_chunk_con = mdb.connect(**WS4_DB_OHP_CHUNK)
        try:
            profile_df = read_sql(query, con=ws4_ohp_chunk_con, index_col="profileId")
        finally:
            # Release the connection even if the query fails; the original
            # never closed it.
            ws4_ohp_chunk_con.close()
        profile_df.to_csv(file_name)

    df = load_csv_pandas(file_name="data_set/Feature15FillNull3SMovingFiltered.csv")

    # Run the experiment once per target; the return value is not used here.
    for target in ["age", "height", "weight"]:
        experiment_is_this_rmse_for_user_that_has_many_chunks(df, target)

    profile_df.to_csv("results/correlation_btw_cnt_and_rmse.csv")