Exemplo n.º 1
0
from scipy import linalg
import matplotlib.pyplot as plt
import matplotlib as mpl

from sklearn import mixture

# # Number of samples per component
# n_samples = 500
#
# # Generate random sample, two components
# np.random.seed(0)
# C = np.array([[0., -0.1], [1.7, .4]])
# X = np.r_[np.dot(np.random.randn(n_samples, 2), C),
#           .7 * np.random.randn(n_samples, 2) + np.array([-6, 3])]

# Load the feature table, draw a sample of 10% of its rows (frac=0.1), select
# the FEATURES columns and run them through do_preprocessing with the
# "standardization" method. The result is the matrix X the mixtures are fit on.
df = load_csv_pandas(file_name="data_set/Feature15FillNull3S.csv")
sampled_df = df.sample(frac=0.1)
features = sampled_df[FEATURES]
features_preprocessed = do_preprocessing(features, method="standardization")
X = features_preprocessed


# Sweep every combination of covariance type and component count (1..9),
# fitting one Gaussian mixture per combination.
# `np.inf` is used instead of the `np.infty` alias, which NumPy 2.0 removed;
# `np.inf` is available on every NumPy version.
lowest_bic = np.inf
bic = []
n_components_range = range(1, 10)
cv_types = ['spherical', 'tied', 'diag', 'full']
for cv_type in cv_types:
    for n_components in n_components_range:
        # Fit a mixture of Gaussians with EM
        # NOTE(review): `mixture.GMM` was removed in scikit-learn 0.20 in
        # favour of `mixture.GaussianMixture`; migrate when upgrading sklearn.
        gmm = mixture.GMM(n_components=n_components, covariance_type=cv_type)
        gmm.fit(X)
Exemplo n.º 2
0
    unique_ids = test_X.index.unique().tolist()

    result = {}
    for id in unique_ids:
        index_of_specific_id = [i for i, j in enumerate(all_ids) if j == id]
        predictions_of_specific_id = prediction[index_of_specific_id]
        mean_prediction_of_specific_id = np.mean(predictions_of_specific_id)

        result[id] = mean_prediction_of_specific_id
    result_series = pd.Series(result, index=profile_df.index.tolist())
    profile_df["%s_pred" % label] = result_series
    profile_df["%s_diff" % label] = (profile_df[label] - result_series).abs()


if __name__ == "__main__":
    # Build (or reload) the per-profile table: number of matching chunks per
    # profileId joined with the profile's age, height and weight. The query
    # result is cached on disk so the database is only hit when the cache
    # file is missing.
    # NOTE(review): keep this at module scope -- the experiment function is
    # called with only (df, target), so it presumably reads/writes
    # `profile_df` as a global; verify before moving this into a function.
    file_name = "data_set/profileId_Cnt"
    file_exist = os.path.isfile(file_name)
    if not file_exist:
        query = "select a .profileId, cnt , age, height, weight from (select profileId, count(*) as cnt from Feature15FillNull3S where movingTarget='SELF_MOVING' and positionTarget= 'SELF_BAG' and ioTarget = 'outdoor' group by profileId) as a left join ( select profileId, age, height, weight from Profile as p ) as pp on a.profileId = pp.profileId;"
        ws4_ohp_chunk_con = mdb.connect(**WS4_DB_OHP_CHUNK)
        try:
            profile_df = read_sql(query, con=ws4_ohp_chunk_con, index_col="profileId")
        finally:
            # Always release the DB connection, even if the query fails.
            ws4_ohp_chunk_con.close()
        profile_df.to_csv(file_name)
    else:
        profile_df = load_csv_pandas(file_name)

    # Run the experiment once per regression target on the filtered feature set.
    df = load_csv_pandas(file_name="data_set/Feature15FillNull3SMovingFiltered.csv")
    for target in ["age", "height", "weight"]:
        experiment_is_this_rmse_for_user_that_has_many_chunks(df, target)

    # Persist the per-profile table after the experiments have run.
    profile_df.to_csv("results/correlation_btw_cnt_and_rmse.csv")