def lin_models(lasso=True, traits=['height', 'BMI', 'WHR', 'BHMD', 'SBP'], nbsnps=10000, verbose=0, hot=False, unif=False, reps=1):
    """Fit regularized single-unit linear models per trait and report correlations.

    For every trait the data returned by ``retrieve_data`` is split again into
    training and validation parts (33% validation). A one-unit ``Dense`` model
    with an L1 (``lasso=True``) or L2 kernel regularizer is fitted ``reps``
    times for each regularization strength in ``alpha``; the repetition with
    the highest validation score is kept together with its test score.
    The scores are the first element of ``r(predictions, targets)``
    (``r`` is defined elsewhere in the file, presumably a Pearson
    correlation helper -- TODO confirm).

    The resulting table is printed and logged as CSV; nothing is returned.

    Args:
        lasso: Use ``l1`` regularization when true, ``l2`` otherwise.
        traits: Trait names passed one by one to ``retrieve_data``. The
            default list is only iterated, never mutated.
        nbsnps: Forwarded to ``retrieve_data``.
        verbose: Forwarded to ``fit``.
        hot: When true, all three feature matrices are passed through
            ``convert_to_individual_alleles`` before fitting.
        unif: Forwarded to ``retrieve_data``.
        reps: Number of fits per regularization strength.
    """
    alpha = [0.01]
    regularizer = l1 if lasso else l2
    R = {}
    for t in traits:
        print(t)
        x_tr, x_tst, y_tr, y_tst = retrieve_data(t, nbsnps, unif=unif)
        x_tr, x_val, y_tr, y_val = train_test_split(x_tr, y_tr, test_size=0.33)
        if hot:
            x_tr = convert_to_individual_alleles(x_tr)
            x_val = convert_to_individual_alleles(x_val)
            x_tst = convert_to_individual_alleles(x_tst)
        nb_snps = x_tr.shape[1]
        # Column 0: best validation score, column 1: matching test score.
        # Scores start at 0, so a repetition is only recorded when its
        # validation score is strictly positive.
        res = np.zeros((len(alpha), 2))
        for n, a in enumerate(alpha):
            print(a)
            for i in range(reps):
                m = Sequential()
                m.add(Dense(1, input_dim=nb_snps, kernel_regularizer=regularizer(a)))
                m.compile(loss='mse', optimizer='adam')
                m.fit(x_tr, y_tr, epochs=1000, callbacks=[EarlyStopping()],
                      validation_data=(x_val, y_val), verbose=verbose)
                # Predict on the validation set once and reuse the score.
                r_val = r(m.predict(x_val).ravel(), y_val)[0]
                if r_val > res[n, 0]:
                    print(r_val)
                    print(i)
                    res[n, 0] = r_val
                    res[n, 1] = r(m.predict(x_tst).ravel(), y_tst)[0]
                K.clear_session()
            print(res[n, 1])
        R[t + "val"] = res[:, 0]
        R[t + "tst"] = res[:, 1]
    R["alpha"] = alpha
    # Render the report once and send the same text to stdout and the log.
    report = pd.DataFrame(R).to_csv(float_format='%.3f', index=False)
    print(report)
    logging.info(report)
def score(self, X, y_prob_true, sample_weight=None):
    """Return the (optionally weighted) mean of per-sample correlations.

    ``self.predict_proba(X)`` is compared with ``y_prob_true`` one sample at
    a time: for each index ``i`` below ``len(y_prob_true)`` the
    ``.correlation`` attribute of ``r(pred[i], true[i])`` is collected, and
    the collected values are averaged with ``np.average`` using
    ``sample_weight`` as weights (``None`` means a plain mean).

    ``r`` is defined elsewhere in the file; its result must expose a
    ``correlation`` attribute.
    """
    predicted = self.predict_proba(X)
    per_sample = []
    for idx in range(len(y_prob_true)):
        outcome = r(predicted[idx], y_prob_true[idx])
        per_sample.append(outcome.correlation)
    return np.average(per_sample, weights=sample_weight)
def calculate_individual_rmse(self):
    """Compute an RMSE per group of five rows and correlate it with preferred speed.

    ``self.data`` is walked in consecutive groups of five rows. For each
    group, ``self.calculate_value`` is called for every ``"Counts"`` entry
    with the ``"BMI"`` of the group's first row, and the RMSE against the
    group's ``"Speed"`` values is recorded, both as an absolute value and as
    a percentage of the ``"Speed"`` in the group's third row.

    The per-group results are combined with the columns of ``self.pref_data``
    into a DataFrame, the correlation matrix of its ``"Pref Speed"``,
    ``"RMSE"`` and ``"RMSE (% Pref)"`` columns is drawn as an annotated
    heatmap on the current matplotlib figure, and both objects are returned.

    Returns:
        tuple: ``(rmse_df, corr_mat)``.
    """
    rmse_list = []
    rmse_perc_pref = []
    n_rows = self.data.shape[0]
    for start_ind in range(0, n_rows, 5):
        group = self.data.iloc[start_ind:start_ind + 5]
        bmi = self.data.iloc[start_ind]["BMI"]
        pred_values = [self.calculate_value(count_value=count, bmi=bmi)
                       for count in group["Counts"]]
        true_speed = list(group["Speed"])
        rmse = np.sqrt(metrics.mean_squared_error(y_true=true_speed, y_pred=pred_values))
        rmse_list.append(float(rmse))
        # The third row of the group supplies the reference speed for the
        # percentage (presumably the preferred-speed trial -- TODO confirm).
        reference_speed = self.data.iloc[start_ind + 2]["Speed"]
        rmse_perc_pref.append(float(100 * rmse / reference_speed))

    pref_speeds = list(self.pref_data["Speed"])
    rmse_df = pd.DataFrame(
        list(zip(self.pref_data["Subject"], self.pref_data["Age"],
                 self.pref_data["Height"], self.pref_data["Weight"],
                 self.pref_data["BMI"], pref_speeds, rmse_list, rmse_perc_pref)),
        columns=["Subject", "Age", "Height", "Weight", "BMI",
                 "Pref Speed", "RMSE", "RMSE (% Pref)"])
    corr_mat = rmse_df[["Pref Speed", "RMSE", "RMSE (% Pref)"]].corr()
    sb.heatmap(corr_mat, cmap="RdYlGn", annot=True)
    plt.title("Individual Participant Correlations")
    return rmse_df, corr_mat
# Sorting the input user and current user group so the values aren't mixed up later on group = group.sort_values(by='movieId') inputMovies = inputMovies.sort_values(by='movieId') # Movies that they both have in common temp_df = inputMovies[inputMovies['movieId'].isin( group['movieId'].tolist())] # User input movie ratings in list format to facilitate calculations tempRatingList = temp_df['rating'].tolist() # User group movie ratings also in list format tempGroupList = group['rating'].tolist() # Correlation score pearsonCorrelationDict[userid] = r(tempRatingList, tempGroupList)[0] #%% Exploring the created dictionary corr_df = pd.DataFrame.from_dict(pearsonCorrelationDict, orient="index") corr_df.columns = ["similarity_index"] corr_df["userId"] = corr_df.index corr_df.index = range(len(corr_df)) corr_df.head() #%% Top 50 similar users to input user topUsers_df = corr_df.sort_values(by="similarity_index", ascending=False)[0:50] topUsers_df.head() #%% Now we're going to extract the movies ratings of the selected most similar users topUsers_ratings = topUsers_df.merge(ratings_df, left_on="userId",
def CNN(traits=['height', 'BMI', 'WHR', 'BHMD', 'SBP'], verbose=0, unif=False, nbsnps=10000, p=None, reps=1):
    """Train the predefined CNN configurations per trait and report correlations.

    For every trait the data from ``retrieve_data`` is split into training and
    validation parts (33% validation) and a trailing axis is added to the
    feature arrays before they are passed to the model built by
    ``compile_model_cnn``.

    * ``p is None``: every configuration is fitted ``reps`` times; per
      configuration the repetition with the highest validation score is kept
      with its test score, and whenever a configuration beats the best score
      seen so far for the trait the model is saved to an ``.h5`` file under
      ``~/Code/genomic_cnn/models``.
    * otherwise: only configuration ``param[p]`` is fitted ``reps`` times and
      every repetition's validation/test scores are recorded.

    Scores are the first element of ``r(predictions, targets)`` (``r`` is
    defined elsewhere, presumably a Pearson correlation helper -- TODO
    confirm). The table is printed and logged as CSV; nothing is returned.

    Args:
        traits: Trait names passed to ``retrieve_data``; only iterated.
        verbose: Forwarded to ``fit``.
        unif: Forwarded to ``retrieve_data``; also selects the file suffix.
        nbsnps: Forwarded to ``retrieve_data``.
        p: Index into the configuration list, or ``None`` to try them all.
        reps: Number of fits per configuration.
    """
    # Must be a list of dicts: list({...}) would produce the dict's *keys*,
    # so the first configuration is written as a list element instead.
    param = [
        # cnn1
        {
            'optimizer': 'nadam',
            'size_window': 2,
            'activation': 'softplus',
            'nb_neurons': 64,
            'stride': 'one',
            'nb_cnn_layers': 1,
            'filters': 16,
            'weight_decay': 0.0,
            'nb_layers': 3,
            'dropout': 0.01,
            'batch_norm': True
        },
        # cnn2
        {
            'optimizer': 'nadam',
            'size_window': 2,
            'activation': 'elu',
            'nb_neurons': 32,
            'stride': 'one',
            'nb_cnn_layers': 1,
            'filters': 32,
            'weight_decay': 0.0,
            'nb_layers': 3,
            'dropout': 0.01,
            'batch_norm': False
        },
        # cnn3
        {
            'optimizer': 'rmsprop',
            'size_window': 3,
            'activation': 'linear',
            'nb_neurons': 32,
            'stride': 'one',
            'nb_cnn_layers': 1,
            'filters': 16,
            'weight_decay': 0.0,
            'nb_layers': 1,
            'dropout': 0.01,
            'batch_norm': False
        },
    ]
    R = {}
    for t in traits:
        best = 0
        print(t)
        x_tr, x_tst, y_tr, y_tst = retrieve_data(t, nbsnps, unif=unif)
        x_tr, x_val, y_tr, y_val = train_test_split(x_tr, y_tr, test_size=0.33)
        n_snps = x_tr.shape[1]
        x_tr = np.expand_dims(x_tr, axis=2)
        x_val = np.expand_dims(x_val, axis=2)
        x_tst = np.expand_dims(x_tst, axis=2)
        # NOTE(review): with true division n_snps / 1000 renders as e.g.
        # "10.0"; kept as-is so existing file names do not change.
        f = os.path.join(
            os.path.expanduser("~"), 'Code/genomic_cnn/models',
            "Model_" + t + "_cnn_" + str(n_snps / 1000) + "k" +
            ("_unif" if unif else "_best") + ".h5")
        if p is None:
            # Column 0: best validation score, column 1: matching test score.
            res = np.zeros((len(param), 2))
            for n, g in enumerate(param):
                print(g)
                for x in range(reps):
                    m = compile_model_cnn(g, (n_snps, 1))
                    m.fit(x_tr, y_tr, epochs=1200, verbose=verbose,
                          validation_data=(x_val, y_val),
                          callbacks=[early_stopper])
                    # Predict on the validation set once and reuse the score.
                    r_val = r(m.predict(x_val).ravel(), y_val)[0]
                    if r_val > res[n, 0]:
                        print(r_val)
                        print(x)
                        res[n, 0] = r_val
                        res[n, 1] = r(m.predict(x_tst).ravel(), y_tst)[0]
                        if res[n, 0] > best:
                            print("A better network was found with r: %.3f" % res[n, 0])
                            print(g)
                            m.save(f)
                            best = res[n, 0]
        else:
            res = np.zeros((reps, 2))
            g = param[p]
            for i in range(reps):
                m = compile_model_cnn(g, (n_snps, 1))
                m.fit(x_tr, y_tr, epochs=1200, verbose=verbose,
                      validation_data=(x_val, y_val),
                      callbacks=[early_stopper])
                res[i, :] = (r(m.predict(x_val).ravel(), y_val)[0],
                             r(m.predict(x_tst).ravel(), y_tst)[0])
        # The "_tr" column holds the validation-split scores.
        R[t + "_tr"] = res[:, 0]
        R[t + "_tst"] = res[:, 1]
    # Render the report once and send the same text to stdout and the log.
    report = pd.DataFrame(R).to_csv(float_format='%.3f', index=False)
    print(report)
    logging.info(report)
def MLP(traits=['height', 'BMI', 'WHR', 'BHMD', 'SBP'], verbose=0, unif=False, nbsnps=10000, p=None, reps=1, hot=False):
    """Train the predefined MLP configurations per trait and report correlations.

    For every trait the data from ``retrieve_data`` is split into training and
    validation parts (33% validation); with ``hot=True`` all feature matrices
    are first passed through ``convert_to_individual_alleles``. Models are
    built by ``compile_model_mlp``.

    * ``p is None``: every configuration is fitted ``reps`` times; per
      configuration the repetition with the highest validation score is kept
      with its test score, and whenever a configuration beats the best score
      seen so far for the trait the model is saved to an ``.h5`` file under
      ``~/Code/genomic_cnn/models``.
    * otherwise: only configuration ``geneparam[p]`` is fitted ``reps`` times
      and every repetition's validation/test scores are recorded.

    Scores are the first element of ``r(predictions, targets)`` (``r`` is
    defined elsewhere, presumably a Pearson correlation helper -- TODO
    confirm). The table is printed and logged as CSV; nothing is returned.

    Args:
        traits: Trait names passed to ``retrieve_data``; only iterated.
        verbose: Forwarded to ``fit``.
        unif: Forwarded to ``retrieve_data``; also selects the file suffix.
        nbsnps: Forwarded to ``retrieve_data``.
        p: Index into the configuration list, or ``None`` to try them all.
        reps: Number of fits per configuration.
        hot: Convert the features with ``convert_to_individual_alleles`` and
            tag the saved file name with ``kHot`` instead of ``k``.
    """
    # Must be a list of dicts: list({...}) would produce the dict's *keys*,
    # so the first configuration is written as a list element instead.
    geneparam = [
        # mlp1
        {
            'optimizer': 'rmsprop',
            'activation': 'elu',
            'nb_neurons': 32,
            'weight_decay': 0.01,
            'nb_layers': 1,
            'dropout': 0.02
        },
        # mlp2
        {
            'optimizer': 'adagrad',
            'activation': 'elu',
            'nb_neurons': 64,
            'weight_decay': 0.01,
            'nb_layers': 2,
            'dropout': 0.03
        },
        # mlp3
        {
            'optimizer': 'adam',
            'activation': 'softplus',
            'nb_neurons': 32,
            'weight_decay': 0.01,
            'nb_layers': 5,
            'dropout': 0.02
        },
    ]
    R = {}
    for t in traits:
        print(t)
        best = 0
        x_tr, x_tst, y_tr, y_tst = retrieve_data(t, nbsnps, unif=unif)
        x_tr, x_val, y_tr, y_val = train_test_split(x_tr, y_tr, test_size=0.33)
        if hot:
            x_tr = convert_to_individual_alleles(x_tr)
            x_val = convert_to_individual_alleles(x_val)
            x_tst = convert_to_individual_alleles(x_tst)
        # Taken after the optional conversion so it matches the model input.
        n_snps = x_tr.shape[1]
        size_tag = "kHot" if hot else "k"
        # NOTE(review): with true division n_snps / 1000 renders as e.g.
        # "10.0"; kept as-is so existing file names do not change.
        f = os.path.join(
            os.path.expanduser("~"), 'Code/genomic_cnn/models',
            "Model_" + t + "_mlp_" + str(n_snps / 1000) + size_tag +
            ("_unif" if unif else "_best") + ".h5")
        if p is None:
            # Column 0: best validation score, column 1: matching test score.
            res = np.zeros((len(geneparam), 2))
            for n, g in enumerate(geneparam):
                print(g)
                for x in range(reps):
                    m = compile_model_mlp(g, n_snps)
                    m.fit(x_tr, y_tr, epochs=1200,
                          validation_data=(x_val, y_val),
                          callbacks=[early_stopper], verbose=verbose)
                    # Predict on the validation set once and reuse the score.
                    r_val = r(m.predict(x_val).ravel(), y_val)[0]
                    if r_val > res[n, 0]:
                        print(r_val)
                        print(x)
                        res[n, 0] = r_val
                        res[n, 1] = r(m.predict(x_tst).ravel(), y_tst)[0]
                        if res[n, 0] > best:
                            print("A better network was found with r: %.3f" % res[n, 0])
                            print(g)
                            m.save(f)
                            best = res[n, 0]
                    K.clear_session()
        else:
            res = np.zeros((reps, 2))
            g = geneparam[p]
            for i in range(reps):
                m = compile_model_mlp(g, n_snps)
                m.fit(x_tr, y_tr, epochs=1200, verbose=verbose,
                      validation_data=(x_val, y_val),
                      callbacks=[early_stopper])
                res[i, :] = (r(m.predict(x_val).ravel(), y_val)[0],
                             r(m.predict(x_tst).ravel(), y_tst)[0])
        # The "_tr" column holds the validation-split scores.
        R[t + "_tr"] = res[:, 0]
        R[t + "_tst"] = res[:, 1]
    # Render the report once and send the same text to stdout and the log.
    report = pd.DataFrame(R).to_csv(float_format='%.3f', index=False)
    print(report)
    logging.info(report)