def compute_linear_predictions(X_train, X_test_int, X_test_other, Y_train,
                               lin_ranked, max_features=100, regularize=0.8,
                               alpha=10.0):
    """Predict each perceptual descriptor with a per-descriptor ridge model.

    For every descriptor, fits one Ridge regression per subject (subjects
    1..49) on the top `max_features` features ranked in `lin_ranked`, then
    shrinks each subject's predictions toward the across-subject mean.

    Parameters
    ----------
    X_train : pandas DataFrame of training features; must contain a
        'mean_dilution' column, which is dropped before fitting.
    X_test_int : test features for descriptor 0 (intensity uses its own
        dilution — presumably; confirm against the challenge protocol).
    X_test_other : test features for all remaining descriptors.
    Y_train : mapping subject -> DataFrame of observed descriptor values.
    lin_ranked : 2-D array; row `col` lists feature indices for descriptor
        `col`, best first.
    max_features : number of top-ranked features to use per descriptor.
        (Bug fix: this parameter was previously overwritten with 100
        unconditionally; it is now honored.)
    regularize : weight on the across-subject mean in the final shrinkage
        (1.0 = fully pooled, 0.0 = fully individual).
    alpha : Ridge regularization strength.

    Returns
    -------
    pd.Panel indexed by subject (items), test molecules (major axis), and
    descriptor (minor axis).  NOTE(review): pd.Panel is deprecated/removed
    in modern pandas; this requires an old pandas version.
    """
    descriptors = loading.get_descriptors(format=True)
    Y = pd.Panel(items=range(1, 50), major_axis=X_test_other.index,
                 minor_axis=pd.Series(descriptors, name='Descriptor'))
    for col, descriptor in enumerate(descriptors):  # For each descriptor.
        prog(col, len(descriptors))
        # Descriptor 0 (intensity) is predicted from its own test matrix.
        X_test = X_test_int if col == 0 else X_test_other
        est = Ridge(alpha=alpha, fit_intercept=True, normalize=False,
                    copy_X=True, max_iter=None, tol=0.001, solver='auto',
                    random_state=0)
        # Top-ranked features for this descriptor.
        features = lin_ranked[col, :][:max_features]
        for subject in range(1, 50):
            # Perceptual data for this descriptor and subject.
            observed = Y_train[subject][descriptor]
            # Fit the model on the training data with the
            # 'max_features' best features.
            est.fit(X_train.drop('mean_dilution', 1).values[:, features],
                    observed)
            # Predict the test data.
            predicted = est.predict(X_test.values[:, features])
            Y[subject][descriptor].loc[list(X_test.index)] = predicted
    # Regularize each subject toward the across-subject mean.
    Y_mean = Y.mean(axis=0)
    for subject in range(1, 50):
        Y[subject] = Y_mean * regularize + Y[subject] * (1 - regularize)
    return Y
def rfc_final(X,Y_imp,Y_mask,
              max_features,min_samples_leaf,max_depth,et,use_mask,trans_weight,
              trans_params,X_test_int=None,X_test_other=None,Y_test=None,n_estimators=100,seed=0,quiet=False):
    """Fit the final subchallenge-2 forest models and score them.

    One regressor per output column (42 columns: presumably 21 descriptor
    means followed by 21 descriptor sigmas — TODO confirm against the
    column layout used by `scoring.score2`).  Each column can independently
    use an ExtraTrees model (`et[col]`) and either the masked or the
    imputed targets (`use_mask[col]`).  Sigma predictions are blended with
    a nonlinear transform of the corresponding mean prediction.

    Returns (rfcs, score, rs): the fitted models keyed by column, the
    overall challenge score, and per-kind/per-moment correlations.
    """
    # Default all test inputs to the training data (in-sample evaluation).
    if X_test_int is None:
        X_test_int = X
    if X_test_other is None:
        X_test_other = X
    if Y_test is None:
        Y_test = Y_mask

    def rfc_maker(n_estimators=n_estimators,max_features=max_features,
                  min_samples_leaf=min_samples_leaf,max_depth=max_depth,et=False):
        # ExtraTrees has no out-of-bag predictions; note oob_score is False
        # for the RandomForest case too, so the oob_prediction_ branch
        # below will raise AttributeError and fall back to predict().
        if not et:
            kls = RandomForestRegressor
            kwargs = {'oob_score':False}
        else:
            kls = ExtraTreesRegressor
            kwargs = {}
        return kls(n_estimators=n_estimators, max_features=max_features,
                   min_samples_leaf=min_samples_leaf, max_depth=max_depth,
                   n_jobs=-1, random_state=seed, **kwargs)

    # Fit one model per output column, on masked or imputed targets.
    rfcs = {}
    for col in range(42):
        prog(col,42)
        rfcs[col] = rfc_maker(n_estimators=n_estimators,
                              max_features=max_features[col],
                              min_samples_leaf=min_samples_leaf[col],
                              max_depth=max_depth[col],
                              et=et[col])
        if use_mask[col]:
            rfcs[col].fit(X,Y_mask[:,col])
        else:
            rfcs[col].fit(X,Y_imp[:,col])

    predicted = np.zeros((X_test_int.shape[0],42))
    for col in range(42):
        if et[col] or not np.array_equal(X,X_test_int):
            # Possibly check in-sample fit because there isn't any alternative.
            # Columns 0 and 21 (intensity mean/sigma — presumably) use the
            # intensity-specific test matrix.
            if col in [0,21]:
                predicted[:,col] = rfcs[col].predict(X_test_int)
            else:
                predicted[:,col] = rfcs[col].predict(X_test_other)
        else:
            # Prefer out-of-bag predictions when evaluating in-sample; fall
            # back to predict() when the estimator has no oob_prediction_.
            try:
                predicted[:,col] = rfcs[col].oob_prediction_
            except AttributeError:
                if col in [0,21]:
                    predicted[:,col] = rfcs[col].predict(X_test_int)
                else:
                    predicted[:,col] = rfcs[col].predict(X_test_other)

    def f_transform(x, k0, k1):
        # Nonlinear map from a mean rating (0-100) to a sigma estimate.
        return 100*(k0*(x/100)**(k1*0.5) - k0*(x/100)**(k1*2))

    # Blend each sigma column (col+21) with the transformed mean column.
    for col in range(21):
        tw = trans_weight[col]
        k0,k1 = trans_params[col]
        p_m = predicted[:,col]
        p_s = predicted[:,col+21]
        predicted[:,col+21] = tw*f_transform(p_m,k0,k1) + (1-tw)*p_s

    observed = Y_test
    score = scoring.score2(predicted,observed)
    rs = {}
    for kind in ['int','ple','dec']:
        rs[kind] = {}
        for moment in ['mean','sigma']:
            rs[kind][moment] = scoring.r2(kind,moment,predicted,observed)
    if not quiet:
        print("For subchallenge 2:")
        print("\tScore = %.2f" % score)
        for kind in ['int','ple','dec']:
            for moment in ['mean','sigma']:
                print("\t%s_%s = %.3f" % (kind,moment,rs[kind][moment]))
    return (rfcs,score,rs)
def rfc_final(X, Y, max_features, min_samples_leaf, max_depth, use_et,
              regularize=np.ones(21) * 0.8, n_estimators=100, seed=0):
    """Fit the final subchallenge-1 per-subject forest models and score them.

    One regressor per (subject, descriptor) pair.  In-sample quality is
    assessed with out-of-bag predictions for random forests; ExtraTrees has
    no OOB machinery, so its in-sample fit is checked directly.  Each
    descriptor's per-subject predictions are then shrunk toward the
    across-subject mean.

    NOTE(review): this file defines `rfc_final` more than once; later
    definitions shadow this one at module level.

    Parameters
    ----------
    X : training feature matrix (n_obs rows).
    Y : mapping subject -> per-descriptor observed values, indexable as
        Y[subject][descriptor].
    max_features, min_samples_leaf, max_depth, use_et : per-descriptor
        hyperparameter arrays, indexed by descriptor position.
    regularize : per-descriptor weight on the across-subject mean.
    n_estimators : trees per forest.
    seed : random_state for every forest.

    Returns
    -------
    (rfcs, score, rs): models keyed rfcs[subject][descriptor], the overall
    challenge score, and per-kind correlations.
    """
    descriptors = loading.get_descriptors(format=True)
    n_subjects = 49
    n_obs = X.shape[0]

    def rfc_maker(n_estimators=n_estimators, max_features=max_features,
                  min_samples_leaf=min_samples_leaf, max_depth=max_depth,
                  use_et=False):
        # Request oob_score only for random forests; ExtraTrees cannot
        # provide out-of-bag predictions.
        if not use_et:
            kls = RandomForestRegressor
            kwargs = {'oob_score': True}
        else:
            kls = ExtraTreesRegressor
            kwargs = {}
        return kls(n_estimators=n_estimators, max_features=max_features,
                   min_samples_leaf=min_samples_leaf, max_depth=max_depth,
                   n_jobs=-1, random_state=seed, **kwargs)

    rfcs = {x: {} for x in range(1, n_subjects + 1)}
    for d, descriptor in enumerate(descriptors):
        for subject in range(1, n_subjects + 1):
            # Bug fix: hyperparameter arrays were indexed with an undefined
            # name 'col' here; the descriptor index 'd' is the intended key.
            rfcs[subject][descriptor] = rfc_maker(
                n_estimators=n_estimators,
                max_features=max_features[d],
                min_samples_leaf=min_samples_leaf[d],
                max_depth=max_depth[d],
                use_et=use_et[d])

    for subject in range(1, n_subjects + 1):
        prog(subject, n_subjects + 1)
        for d, descriptor in enumerate(descriptors):
            rfcs[subject][descriptor].fit(X, Y[subject][descriptor])

    predicted = np.zeros((n_obs, len(descriptors), n_subjects))
    for col, descriptor in enumerate(descriptors):
        for subject in range(1, n_subjects + 1):
            if use_et[col]:
                # Check in-sample fit because there isn't any alternative.
                predicted[:, col, subject-1] = \
                    rfcs[subject][descriptor].predict(X)
            else:
                predicted[:, col, subject-1] = \
                    rfcs[subject][descriptor].oob_prediction_

    # Regularize: shrink each subject toward the across-subject mean.
    predicted_mean = predicted.mean(axis=2, keepdims=True)
    predicted_reg = predicted.copy()
    for d, descriptor in enumerate(descriptors):
        predicted_reg[:, d, :] = regularize[d]*predicted_mean[:, d, :] \
            + (1-regularize[d])*predicted[:, d, :]
    predicted = predicted_reg

    observed = predicted.copy()
    for subject in range(1, n_subjects + 1):
        observed[:, :, subject - 1] = Y[subject]
    score = scoring.score(predicted, observed)
    rs = {}
    print("For subchallenge 1:")
    print("\tScore = %.2f" % score)
    for kind in ['int', 'ple', 'dec']:
        rs[kind] = scoring.r(kind, predicted, observed)
        print("\t%s = %.3f" % (kind, rs[kind]))
    return (rfcs, score, rs)
def rfc_final(X,Y, max_features,min_samples_leaf,max_depth,use_et,
              regularize=np.ones(21)*0.8,Y_test=None,n_estimators=100,seed=0):
    """Fit subchallenge-1 per-subject forests (column-indexed variant).

    One regressor per (descriptor column 0-20, subject 1-49) pair, fitted
    on Y[subject][:,col].  OOB predictions assess in-sample quality for
    random forests; ExtraTrees columns are checked with an in-sample
    predict().  Predictions are then shrunk toward the across-subject mean.

    NOTE(review): this file defines `rfc_final` more than once; the last
    definition wins at module level.  `Y_test` is accepted but never used;
    it is kept for interface compatibility.  The numpy-array default for
    `regularize` is shared across calls but never mutated here.

    Returns (rfcs, score, rs): models keyed rfcs[col][subject], the overall
    challenge score, and per-kind correlations.
    """
    def rfc_maker(n_estimators=n_estimators,max_features=max_features,
                  min_samples_leaf=min_samples_leaf,max_depth=max_depth,
                  use_et=False):
        # Only random forests support out-of-bag predictions.
        if not use_et:
            kls = RandomForestRegressor
            kwargs = {'oob_score':True}
        else:
            kls = ExtraTreesRegressor
            kwargs = {}
        return kls(n_estimators=n_estimators, max_features=max_features,
                   min_samples_leaf=min_samples_leaf, max_depth=max_depth,
                   n_jobs=-1, random_state=seed, **kwargs)

    # Build one model per (descriptor column, subject).
    rfcs = {}
    for col in range(21):
        rfcs[col] = {}
        for subject in range(1,50):
            rfcs[col][subject] = rfc_maker(n_estimators=n_estimators,
                                           max_features=max_features[col],
                                           min_samples_leaf=min_samples_leaf[col],
                                           max_depth=max_depth[col],
                                           use_et=use_et[col])

    for subject in range(1,50):
        prog(subject,50)
        for col in range(21):
            rfcs[col][subject].fit(X,Y[subject][:,col])

    # Size the output from the training matrix directly (previously a
    # throwaway predict() call was made just to learn this length).
    predicted = np.zeros((X.shape[0],21,49))
    for col in range(21):
        for subject in range(1,50):
            if use_et[col]:
                # Check in-sample fit because there isn't any alternative.
                predicted[:,col,subject-1] = rfcs[col][subject].predict(X)
            else:
                predicted[:,col,subject-1] = rfcs[col][subject].oob_prediction_

    # Regularize: shrink each subject toward the across-subject mean.
    predicted_mean = predicted.mean(axis=2,keepdims=True)
    predicted_reg = predicted.copy()
    for col in range(21):
        predicted_reg[:,col,:] = regularize[col]*predicted_mean[:,col,:] \
                                 + (1-regularize[col])*predicted[:,col,:]
    predicted = predicted_reg

    observed = predicted.copy()
    for subject in range(1,50):
        observed[:,:,subject-1] = Y[subject]
    score = scoring.score(predicted,observed)
    rs = {}
    print("For subchallenge 1:")
    print("\tScore = %.2f" % score)
    for kind in ['int','ple','dec']:
        rs[kind] = scoring.r(kind,predicted,observed)
        print("\t%s = %.3f" % (kind,rs[kind]))
    return (rfcs,score,rs)