Example #1
def _predict(ids, X, clf, outputFpath):
    """
    predict on the test data and write the result to outputFpath as CSV
    @return predictions
    """

    print 'Prediction data:'
    print_missing_values_info(X)
    X = Imputer().fit_transform(X)

    # res = pandas.DataFrame({'id': ids, 'repeatProbability': clf.predict_proba(X)[:, 1]})
    res = pandas.DataFrame({'id': [int(id) for id in ids], 'repeatProbability': clf.predict(X)})
    res.to_csv(outputFpath, index=False)

    return res
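
This snippet is Python 2 code and relies on module-level imports plus a project helper, print_missing_values_info, that are not part of this listing. The library imports it assumes would look roughly like this (Imputer was removed in scikit-learn 0.22):

import pandas
from sklearn.preprocessing import Imputer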
Example #2
def make_data(dataFname, enc, features=None):
    """
    reads X and y data (no imputation; optional feature selection)
    also encodes the categorical features f776 and f777
    @param dataFname: name of the training csv file
    @param features: specific features to use. None by default.
    @param enc: the OneHotEncoder; None for training data, a fitted encoder for testing data
    @return xdata, ydata (None if test data), ids, enc (OneHotEncoder for f776 and f777)
    """

    origData = pandas.read_csv(dataFname)
    ids = origData['id']

    # remove unused columns
    if 'Unnamed: 0' in origData.columns: del origData['Unnamed: 0']
    del origData['id']

    # remove "data leakage" columns
    for f in prohobitedFeatures:
        del origData[f]

    # separate into X & y values
    xData = origData[[col for col in origData.columns if col != 'loss']]
    set_vars_as_type(xData, discreteVars, object)
    yVec = origData.loss if 'loss' in origData.columns else None

    # try f528 - f274
    xData['f528f274'] = xData['f528'] - xData['f274']

    # encode the categorical features f776 and f777
    if enc is None:
        enc = OneHotEncoder(n_values=[2, 2])
        enc.fit(xData[['f776', 'f777']])

    xData[['f776_isZero', 'f776_isOne', 'f777_isZero', 'f777_isOne']] = pandas.DataFrame(enc.transform(xData[['f776', 'f777']]).toarray())
    del xData['f776']
    del xData['f777']

    print_missing_values_info(origData)

    # feature selection
    if features:
        filteredXData = xData[features]
    else:   # use ALL features
        filteredXData = xData

    return filteredXData, yVec, ids, enc
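
With n_values=[2, 2] the encoder treats each of the two inputs as a binary categorical feature and expands it into two indicator columns, which is why four new columns (f776_isZero, f776_isOne, f777_isZero, f777_isOne) appear. A small illustration using the same pre-0.20 scikit-learn API (the values here are made up):

from sklearn.preprocessing import OneHotEncoder

enc = OneHotEncoder(n_values=[2, 2])
enc.fit([[0, 1], [1, 0]])
# each input column becomes an (isZero, isOne) indicator pair:
# [1, 1] -> [0., 1., 0., 1.]
print enc.transform([[1, 1]]).toarray()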
Example #3
def classify(X, y, lossString, fit=True):
    """
    train a model on the training data
    (despite the function name, this builds a GradientBoostingRegressor, not a classifier)
    @return the model (fitted only if fit=True)
    """

    print 'Training data:'
    print_missing_values_info(X)
    X = Imputer().fit_transform(X)

    # clf = LogisticRegression()
    clf = GradientBoostingRegressor(learning_rate=0.1, loss=lossString, n_estimators=1000, subsample=0.9)

    if fit:
        clf.fit(X, y)

    return clf
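
Taken together, the three examples form one train/predict pipeline. A hypothetical end-to-end run (the file names and the 'ls' loss string are placeholders, not taken from the original module):

xTrain, yTrain, trainIds, enc = make_data('train.csv', None)    # fits the OneHotEncoder
clf = classify(xTrain, yTrain, 'ls')                            # least-squares loss
xTest, _, testIds, _ = make_data('test.csv', enc)               # reuses the fitted encoder
predictions = _predict(testIds, xTest, clf, 'predictions.csv')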