Example #1
import pandas as pd
from sklearn.linear_model import Ridge


def compute_linear_predictions(X_train,
                               X_test_int,
                               X_test_other,
                               Y_train,
                               lin_ranked,
                               max_features=100,
                               regularize=0.8,
                               alpha=10.0):
    # `loading` and `prog` are project helpers (not shown here).
    descriptors = loading.get_descriptors(format=True)
    # One DataFrame per subject (1-49); pd.Panel requires pandas < 0.25.
    Y = pd.Panel(items=range(1, 50),
                 major_axis=X_test_other.index,
                 minor_axis=pd.Series(descriptors, name='Descriptor'))

    for col, descriptor in enumerate(descriptors):  # For each descriptor.
        prog(col, len(descriptors))
        # Descriptor 0 (intensity) gets its own test set.
        X_test = X_test_int if col == 0 else X_test_other
        # `normalize=` was removed in scikit-learn 1.2; like pd.Panel above,
        # this snippet targets older library versions.
        est = Ridge(alpha=alpha,
                    fit_intercept=True,
                    normalize=False,
                    copy_X=True,
                    max_iter=None,
                    tol=0.001,
                    solver='auto',
                    random_state=0)
        # Top-ranked feature indices for this descriptor.
        features = lin_ranked[col, :max_features]
        for subject in range(1, 50):
            # Perceptual data for this descriptor.
            observed = Y_train[subject][descriptor]
            # Fit the model on the training data using only the
            # top 'max_features' features.
            est.fit(
                X_train.drop('mean_dilution', axis=1).values[:, features],
                observed)
            # Predict the test data.
            predicted = est.predict(X_test.values[:, features])
            Y[subject][descriptor].loc[list(X_test.index)] = predicted

    # Regularize each subject to the across-subject mean
    Y_mean = Y.mean(axis=0)
    for subject in range(1, 50):
        Y[subject] = Y_mean * regularize + Y[subject] * (1 - regularize)
    return Y
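
A minimal call sketch for the function above, assuming the molecule-indexed layout its code implies (all variable names here are hypothetical; `loading` and `prog` are project helpers that are not shown):

# Hypothetical shapes inferred from the code above: the X matrices are
# molecule-indexed DataFrames, with a 'mean_dilution' column in X_train;
# Y_train maps each subject (1-49) to a DataFrame with one column per
# descriptor; lin_ranked is a (n_descriptors, n_features) array of
# feature indices ranked by importance.
Y_pred = compute_linear_predictions(X_train, X_test_int, X_test_other,
                                    Y_train, lin_ranked,
                                    max_features=100, regularize=0.8,
                                    alpha=10.0)
Y_pred[1]  # predicted ratings for subject 1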
Example #2
import numpy as np
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor


def rfc_final(X, Y_imp, Y_mask,
              max_features, min_samples_leaf, max_depth, et, use_mask,
              trans_weight, trans_params,
              X_test_int=None, X_test_other=None, Y_test=None,
              n_estimators=100, seed=0, quiet=False):
    # With no test data given, fall back to in-sample (or OOB) prediction.
    if X_test_int is None:
        X_test_int = X
    if X_test_other is None:
        X_test_other = X
    if Y_test is None:
        Y_test = Y_mask

    def rfc_maker(n_estimators=n_estimators, max_features=max_features,
                  min_samples_leaf=min_samples_leaf, max_depth=max_depth,
                  et=False):
        if not et:
            kls = RandomForestRegressor
            # oob_score=True makes oob_prediction_ available below.
            kwargs = {'oob_score': True}
        else:
            # ExtraTreesRegressor has no out-of-bag estimate.
            kls = ExtraTreesRegressor
            kwargs = {}

        return kls(n_estimators=n_estimators, max_features=max_features,
                   min_samples_leaf=min_samples_leaf, max_depth=max_depth,
                   n_jobs=-1, random_state=seed, **kwargs)
        
    # One model per column (42 = 21 descriptors x {mean, sigma}).
    rfcs = {}
    for col in range(42):
        prog(col, 42)
        rfcs[col] = rfc_maker(n_estimators=n_estimators,
                              max_features=max_features[col],
                              min_samples_leaf=min_samples_leaf[col],
                              max_depth=max_depth[col],
                              et=et[col])
        # Fit on the masked or the imputed targets, per column.
        if use_mask[col]:
            rfcs[col].fit(X, Y_mask[:, col])
        else:
            rfcs[col].fit(X, Y_imp[:, col])
    
    predicted = np.zeros((X_test_int.shape[0], 42))
    for col in range(42):
        if et[col] or not np.array_equal(X, X_test_int):
            # ExtraTrees has no OOB estimate, and genuine test data
            # needs a direct prediction.
            if col in [0, 21]:
                # Columns 0 and 21 (intensity mean/sigma) use the
                # intensity-specific test set.
                predicted[:, col] = rfcs[col].predict(X_test_int)
            else:
                predicted[:, col] = rfcs[col].predict(X_test_other)
        else:
            # In-sample: prefer the out-of-bag prediction when available.
            try:
                predicted[:, col] = rfcs[col].oob_prediction_
            except AttributeError:
                if col in [0, 21]:
                    predicted[:, col] = rfcs[col].predict(X_test_int)
                else:
                    predicted[:, col] = rfcs[col].predict(X_test_other)

    def f_transform(x, k0, k1):
        # Mean-to-sigma transform: a difference of two power laws on a
        # 0-100 scale, with per-column fitted parameters (k0, k1).
        return 100 * (k0 * (x / 100) ** (k1 * 0.5) - k0 * (x / 100) ** (k1 * 2))

    # Blend each sigma prediction (col + 21) with a transform of the
    # corresponding mean prediction (col).
    for col in range(21):
        tw = trans_weight[col]
        k0, k1 = trans_params[col]
        p_m = predicted[:, col]        # predicted mean
        p_s = predicted[:, col + 21]   # predicted sigma
        predicted[:, col + 21] = tw * f_transform(p_m, k0, k1) + (1 - tw) * p_s
    
    observed = Y_test
    score = scoring.score2(predicted, observed)
    rs = {}
    for kind in ['int', 'ple', 'dec']:
        rs[kind] = {}
        for moment in ['mean', 'sigma']:
            rs[kind][moment] = scoring.r2(kind, moment, predicted, observed)
    
    if not quiet:
        print("For subchallenge 2:")
        print("\tScore = %.2f" % score)
        for kind in ['int', 'ple', 'dec']:
            for moment in ['mean', 'sigma']:
                print("\t%s_%s = %.3f" % (kind, moment, rs[kind][moment]))

    return (rfcs, score, rs)
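
A minimal call sketch for the function above, assuming the shapes its code implies (all variable names here are hypothetical; `prog` and `scoring` are project helpers that are not shown):

# Hypothetical shapes inferred from the code above: X is (n_molecules,
# n_features); Y_imp and Y_mask are (n_molecules, 42); the per-column
# hyperparameters are length-42 sequences; trans_weight and trans_params
# are length-21.
rfcs, score, rs = rfc_final(X, Y_imp, Y_mask,
                            max_features, min_samples_leaf, max_depth,
                            et, use_mask, trans_weight, trans_params,
                            n_estimators=100, seed=0)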
Example #3
import numpy as np
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor


def rfc_final(X,
              Y,
              max_features,
              min_samples_leaf,
              max_depth,
              use_et,
              regularize=np.ones(21) * 0.8,
              n_estimators=100,
              seed=0):

    # `loading`, `prog`, and `scoring` are project helpers (not shown here).
    descriptors = loading.get_descriptors(format=True)
    n_subjects = 49
    n_obs = X.shape[0]

    def rfc_maker(n_estimators=n_estimators,
                  max_features=max_features,
                  min_samples_leaf=min_samples_leaf,
                  max_depth=max_depth,
                  use_et=False):
        if not use_et:
            kls = RandomForestRegressor
            # oob_score=True makes oob_prediction_ available below.
            kwargs = {'oob_score': True}
        else:
            # ExtraTreesRegressor has no out-of-bag estimate.
            kls = ExtraTreesRegressor
            kwargs = {}

        return kls(n_estimators=n_estimators,
                   max_features=max_features,
                   min_samples_leaf=min_samples_leaf,
                   max_depth=max_depth,
                   n_jobs=-1,
                   random_state=seed,
                   **kwargs)

    # One model per (subject, descriptor) pair.
    rfcs = {x: {} for x in range(1, n_subjects + 1)}
    for d, descriptor in enumerate(descriptors):
        for subject in range(1, n_subjects + 1):
            rfcs[subject][descriptor] = rfc_maker(
                n_estimators=n_estimators,
                max_features=max_features[d],
                min_samples_leaf=min_samples_leaf[d],
                max_depth=max_depth[d],
                use_et=use_et[d])

    for subject in range(1, n_subjects + 1):
        prog(subject, n_subjects + 1)
        for d, descriptor in enumerate(descriptors):
            rfcs[subject][descriptor].fit(X, Y[subject][descriptor])

    predicted = np.zeros((n_obs, len(descriptors), n_subjects))
    for col, descriptor in enumerate(descriptors):
        for subject in range(1, n_subjects + 1):
            if use_et[col]:
                # ExtraTrees has no OOB estimate, so fall back to the
                # (optimistic) in-sample fit.
                predicted[:, col, subject - 1] = \
                    rfcs[subject][descriptor].predict(X)
            else:
                predicted[:, col, subject - 1] = \
                    rfcs[subject][descriptor].oob_prediction_

    # Shrink each subject's predictions toward the across-subject mean.
    predicted_mean = predicted.mean(axis=2, keepdims=True)
    predicted_reg = predicted.copy()
    for d, descriptor in enumerate(descriptors):
        predicted_reg[:, d, :] = regularize[d] * predicted_mean[:, d, :] \
                                 + (1 - regularize[d]) * predicted[:, d, :]
    predicted = predicted_reg

    # Assemble observed values in the same (obs, descriptor, subject) layout.
    observed = np.empty_like(predicted)
    for subject in range(1, n_subjects + 1):
        observed[:, :, subject - 1] = Y[subject]
    score = scoring.score(predicted, observed)
    rs = {}
    print("For subchallenge 1:")
    print("\tScore = %.2f" % score)
    for kind in ['int', 'ple', 'dec']:
        rs[kind] = scoring.r(kind, predicted, observed)
        print("\t%s = %.3f" % (kind, rs[kind]))

    return (rfcs, score, rs)
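
A minimal call sketch, assuming the per-subject layout the code above implies (all variable names here are hypothetical):

# Hypothetical shapes inferred from the code above: X is (n_obs, n_features)
# and Y maps each subject (1-49) to a DataFrame with one column per
# descriptor; the per-descriptor hyperparameters are length-21 sequences.
rfcs, score, rs = rfc_final(X, Y,
                            max_features, min_samples_leaf, max_depth,
                            use_et, regularize=np.ones(21) * 0.8)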
Example #4
import numpy as np
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor


def rfc_final(X, Y,
              max_features, min_samples_leaf, max_depth, use_et,
              regularize=np.ones(21) * 0.8, Y_test=None,
              n_estimators=100, seed=0):

    def rfc_maker(n_estimators=n_estimators, max_features=max_features,
                  min_samples_leaf=min_samples_leaf, max_depth=max_depth,
                  use_et=False):
        if not use_et:
            kls = RandomForestRegressor
            # oob_score=True makes oob_prediction_ available below.
            kwargs = {'oob_score': True}
        else:
            # ExtraTreesRegressor has no out-of-bag estimate.
            kls = ExtraTreesRegressor
            kwargs = {}

        return kls(n_estimators=n_estimators, max_features=max_features,
                   min_samples_leaf=min_samples_leaf, max_depth=max_depth,
                   n_jobs=-1, random_state=seed, **kwargs)
        
    # One model per (descriptor, subject) pair: 21 descriptors, 49 subjects.
    rfcs = {}
    for col in range(21):
        rfcs[col] = {}
        for subject in range(1, 50):
            rfcs[col][subject] = rfc_maker(n_estimators=n_estimators,
                                           max_features=max_features[col],
                                           min_samples_leaf=min_samples_leaf[col],
                                           max_depth=max_depth[col],
                                           use_et=use_et[col])

    for subject in range(1, 50):
        prog(subject, 50)
        for col in range(21):
            rfcs[col][subject].fit(X, Y[subject][:, col])
    
    predicted = np.zeros((X.shape[0], 21, 49))
    for col in range(21):
        for subject in range(1, 50):
            if use_et[col]:
                # ExtraTrees has no OOB estimate, so fall back to the
                # (optimistic) in-sample fit.
                predicted[:, col, subject - 1] = rfcs[col][subject].predict(X)
            else:
                predicted[:, col, subject - 1] = rfcs[col][subject].oob_prediction_

    # Shrink each subject's predictions toward the across-subject mean.
    predicted_mean = predicted.mean(axis=2, keepdims=True)
    predicted_reg = predicted.copy()
    for col in range(21):
        predicted_reg[:, col, :] = regularize[col] * predicted_mean[:, col, :] \
                                   + (1 - regularize[col]) * predicted[:, col, :]
    predicted = predicted_reg
    
    # Assemble observed values in the same (obs, descriptor, subject) layout.
    observed = np.empty_like(predicted)
    for subject in range(1, 50):
        observed[:, :, subject - 1] = Y[subject]
    score = scoring.score(predicted, observed)
    rs = {}
    print("For subchallenge 1:")
    print("\tScore = %.2f" % score)
    for kind in ['int', 'ple', 'dec']:
        rs[kind] = scoring.r(kind, predicted, observed)
        print("\t%s = %.3f" % (kind, rs[kind]))

    return (rfcs, score, rs)
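
A minimal call sketch, assuming the array-based per-subject layout the code above implies (all variable names here are hypothetical):

# Hypothetical shapes inferred from the code above: X is (n_obs, n_features)
# and Y maps each subject (1-49) to an array of shape (n_obs, 21).
rfcs, score, rs = rfc_final(X, Y,
                            max_features, min_samples_leaf, max_depth,
                            use_et, regularize=np.ones(21) * 0.8,
                            n_estimators=100, seed=0)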