def fullContentRecommender(df: pd.DataFrame):
    print('in fullContentRecommender')
    extractFeatures(df)
    df['director'] = df['crew'].apply(getDirector)
    features = ['cast', 'keywords', 'genres']

    for feature in features:
        df[feature] = df[feature].apply(get_list)

    features.append('director')
    for feature in features:
        df[feature] = df[feature].apply(removeSpaces)

    df['soup'] = df.apply(stirSoup, axis=1)

    print('before vectorizer')
    count = CountVectorizer(stop_words='english')
    count_matrix = count.fit_transform(df['soup'])
    contentCosineSim = cosine_similarity(count_matrix, count_matrix)
    ut.pickleObject(contentCosineSim, 'Output/Cosine_Sim.pkl')

    features.append('soup')
    df.drop(columns=features, inplace=True)

    print('after vectorizer')
    return contentCosineSim
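# The helpers called above (getDirector, get_list, removeSpaces, stirSoup) are not
# shown in this snippet. A minimal sketch of what they commonly look like in a
# TMDB-metadata recommender, assuming the 'crew'/'cast'/'keywords'/'genres'
# columns already hold parsed lists of {'name': ...} dicts (an assumption here):
def getDirector(crew):
    # return the director's name from the credits list, or '' if none is listed
    for member in crew:
        if member.get('job') == 'Director':
            return member.get('name', '')
    return ''


def get_list(entries, limit=3):
    # keep at most the first `limit` names from a list of {'name': ...} dicts
    if isinstance(entries, list):
        return [e['name'] for e in entries[:limit]]
    return []


def removeSpaces(value):
    # strip spaces and lowercase so multi-word names count as single tokens
    if isinstance(value, list):
        return [str(v).replace(' ', '').lower() for v in value]
    return str(value).replace(' ', '').lower()


def stirSoup(row):
    # concatenate keywords, cast, director and genres into the string fed to CountVectorizer
    return ' '.join(row['keywords']) + ' ' + ' '.join(row['cast']) + ' ' + \
        row['director'] + ' ' + ' '.join(row['genres'])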
def assembleScoreHTML(similarityScores, imdbScores, index: int):
    scoreHTML = []
    scoreHTML.append('<div class=row> <div  class=column>')
    scoreHTML.append('<span> Similarity Score: ' +
                     str(int(similarityScores[index] * 100)) + '%</span>')
    scoreHTML.append('<br><span>IMDB Score: ' +
                     str(ut.roundTraditional(imdbScores[index], 2)) +
                     '</span></div>')
    index += 1

    scoreHTML.append('<div class=row> <div  class=column>')
    scoreHTML.append('<span> Similarity Score: ' +
                     str(int(similarityScores[index] * 100)) + '%</span>')
    scoreHTML.append('<br><span>IMDB Score: ' +
                     str(ut.roundTraditional(imdbScores[index], 2)) +
                     '</span></div>')
    index += 1
    scoreHTML.append('<div class=row> <div  class=column>')
    scoreHTML.append('<span> Similarity Score: ' +
                     str(int(similarityScores[index] * 100)) + '%</span>')
    scoreHTML.append('<br><span>IMDB Score: ' +
                     str(ut.roundTraditional(imdbScores[index], 2)) +
                     '</span></div>')
    scoreHTML.append('</div>')

    scoreHTMLstring = ''.join(scoreHTML)
    return scoreHTMLstring
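# The three repeated card blocks above could be collapsed into a loop. A sketch of
# an equivalent (hypothetical) variant, assuming the same ut.roundTraditional helper:
def assembleScoreHTMLLooped(similarityScores, imdbScores, index: int, count: int = 3):
    scoreHTML = []
    for i in range(index, index + count):
        scoreHTML.append('<div class=row> <div  class=column>')
        scoreHTML.append('<span> Similarity Score: ' +
                         str(int(similarityScores[i] * 100)) + '%</span>')
        scoreHTML.append('<br><span>IMDB Score: ' +
                         str(ut.roundTraditional(imdbScores[i], 2)) +
                         '</span></div>')
    scoreHTML.append('</div>')
    return ''.join(scoreHTML)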
def compressDF(df: pd.DataFrame):
    columns = [
        'budget', 'revenue', 'runtime', 'vote_count', 'vote_average',
        'popularity', 'status', 'original_language', 'production_companies',
        'production_countries', 'overview', 'tagline', 'poster_path', 'crew',
        'keywords', 'video', 'cast', 'original_title',
        'belongs_to_collection', 'homepage', 'spoken_languages'
    ]
    ut.dropIfExists(df, columns, inplace=True)
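# ut.dropIfExists is not shown here; the assumed behaviour is to drop only the
# columns that are actually present, so missing ones do not raise. A minimal sketch:
def dropIfExists(df: pd.DataFrame, columns, inplace=False):
    present = [c for c in columns if c in df.columns]
    return df.drop(columns=present, inplace=inplace)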
def readDF(fileName='Output/movies_metadata.csv'):
    df = pd.read_csv(fileName)
    nullRecommendations = ut.getRowsNum(df[df['similarMovies'] == -1])
    print('Null Movie Recommendations: ' + str(nullRecommendations))

    if nullRecommendations > 0:
        time, cosineSim = ut.getExecutionTime(
            lambda: ut.unpickleObject('Output/Cosine_Sim.pkl'))
        ut.getExecutionTime(lambda: getRecommendationsAsColumn(df, cosineSim))
    return df
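# ut.getExecutionTime is used throughout; it is assumed to time a callable and hand
# back both the elapsed seconds and the callable's return value. A minimal sketch:
from time import perf_counter

def getExecutionTime(func):
    start = perf_counter()
    result = func()
    return perf_counter() - start, result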
def predictSalePrice(dfTest: pd.DataFrame, models: Dict):
    continuousColumns = getColumnType(dfTest, 'Continuous', True)

    x = scaleData(dfTest, continuousColumns)
    predictions = pd.DataFrame()

    for regression in models.values():
        prediction = pd.Series(regression.model.predict(x))
        predictions = ut.appendColumns([predictions, prediction])

    salePrice = predictions.apply(np.exp, axis=1)
    finalPrediction = salePrice.apply(np.mean, axis=1)

    output = pd.DataFrame({'Id': test['Id'].astype(int), 'SalePrice': finalPrediction})
    output.to_csv('Output/Submission.csv', index=False)
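# Note on the ensembling above: each regression predicts LogSalePrice, so the
# predictions are exponentiated back to dollars before being averaged per row.
# A terser, equivalent sketch of that step (same `predictions` frame assumed):
def ensembleLogPredictions(predictions: pd.DataFrame) -> pd.Series:
    # elementwise exp() per model column, then the row-wise mean across models
    return np.exp(predictions).mean(axis=1)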

# look up randomizedSearchCV vs. GridsearchCV

# use VIF > 5, AIC, BIC for feature selection
# Don't use linear regression on categorical vars
# create ensemble of many different models (check for packages that can do this)
# use linear model on everything, then feature select
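# A minimal sketch of the VIF check mentioned in the notes above, assuming
# statsmodels is available; variables with VIF > 5 would be candidates for removal:
from statsmodels.stats.outliers_influence import variance_inflation_factor

def computeVIF(x: pd.DataFrame) -> pd.DataFrame:
    # one VIF per column, each computed against all the other columns
    vifs = [variance_inflation_factor(x.values, i) for i in range(x.shape[1])]
    return pd.DataFrame({'Variable': x.columns, 'VIF': vifs})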
def processMovieData(generateCosineSim: bool):

    ## open dataframe
    df = pd.read_csv('Input/movies_metadata (original).csv')
    df = addImdbScore(df)
    ## drop movies without a release date
    df = df[df['release_date'].apply(lambda d: isinstance(d, str))]

    df['tmdb_id'] = df.index
    df = df.sort_values(by='imdb_score', ascending=False)
    df.reset_index(inplace=True, drop=True)

    df['release_year'] = df['release_date'].apply(lambda date: date[0:4])

    df = df[[
        'tmdb_id', 'id', 'imdb_id', 'imdb_score', 'title', 'release_date',
        'release_year', 'adult', 'genres'
    ]]

    ## merge credits and keywords into movieData
    credits = pd.read_csv('Input/credits.csv')
    credits['id'] = credits['id'].astype('int')
    keywords = pd.read_csv('Input/keywords.csv')
    keywords['id'] = keywords['id'].astype('int')

    df = df[df['id'].str.isnumeric() == True]
    df['id'] = df['id'].astype('int')
    df = df.merge(credits, on='id')
    df = df.merge(keywords, on='id')
    df = df[df.apply(lambda x: isinstance(x['title'], str), axis=1)]
    df['similarMovies'] = -1

    ## Add column of recommended movies
    if generateCosineSim:
        time, cosineSim = ut.getExecutionTime(
            lambda: fullContentRecommender(df))
    else:
        time, cosineSim = ut.getExecutionTime(
            lambda: ut.unpickleObject('Output/Cosine_Sim.pkl'))
    compressDF(df)
    ut.getExecutionTime(lambda: getRecommendationsAsColumn(df, cosineSim))
    return df
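# addImdbScore is not shown above. One common choice in TMDB-style recommenders is
# IMDB's weighted rating WR = v/(v+m)*R + m/(v+m)*C; this exact formula is an
# assumption here, not necessarily what the project uses:
def addImdbScore(df: pd.DataFrame, quantile: float = 0.90) -> pd.DataFrame:
    C = df['vote_average'].mean()            # mean rating over all movies
    m = df['vote_count'].quantile(quantile)  # minimum vote count to be considered
    v, R = df['vote_count'], df['vote_average']
    df['imdb_score'] = (v / (v + m)) * R + (m / (v + m)) * C
    return df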
def plotSimilarityScores(df: pd.DataFrame):
    movieList = df['similarMovies']
    movieList = movieList.apply(lambda x: ut.stringToDict(x))
    df['scoreRange'] = movieList.apply(
        lambda x: max(x.values()) - min(x.values()))
    df['scoreAverage'] = movieList.apply(
        lambda x: sum(x.values()) / len(x.values()))

    histDir = 'Output/Histograms/'
    scatterDir = 'Output/Scatterplots/'
    barDir = 'Output/Barplots/'

    barParams = {'kind': 'bar', 'legend': False}
    figParams = {'x': 7, 'y': 7}

    plt.rc('font', size=40)
    plt.rc('axes', labelsize=60)
    plt.rc('axes', titlesize=60)
    xTickMult = lambda: ut.multiplyRange(plt.xticks()[0], 0.5)
    xTickMultLS = lambda: ut.multiplyLinSpace(plt.xticks()[0], 2)
    yTickFormat = lambda: plt.gca().yaxis.set_major_formatter(
        plt.FormatStrFormatter('%.0f'))
    xTickFormatPercent = lambda: plt.gca().xaxis.set_major_formatter(
        mtick.PercentFormatter(decimals=0))
    xTickFormatCommas = lambda: plt.gca().xaxis.set_major_formatter(
        mpl.ticker.StrMethodFormatter('{x:,.0f}'))
    xTickFormatDollars = lambda x=0: plt.gca().xaxis.set_major_formatter(
        mpl.ticker.StrMethodFormatter('${x:,.' + str(x) + 'f}'))
    # setTickIn = lambda: plt.gca().tick_params(axis='x', direction='in')
    trimTicks = lambda: plt.xticks()[0:-1]
    histParams = {'kind': 'hist', 'legend': False, 'bins': 100}

    ut.plotDF(
        df[['scoreRange']], histParams, {
            'grid': None,
            'xlabel': 'Range Between Highest and Lowest Similarity Scores',
            'title': 'Histogram of Similarity Score Ranges',
            'savefig': histDir + 'ScoreRange.png'
        })

    ut.plotDF(
        df[['scoreAverage']], histParams, {
            'grid': None,
            'xlabel': 'Average Similarity Scores',
            'title': 'Histogram of Similarity Score Averages',
            'savefig': histDir + 'ScoreAverages.png'
        })

    print('finished plotting')
def performRegressions(df: pd.DataFrame):
    models = assembleModels()
    y = df['LogSalePrice']

    continuousColumns = getColumnType(df, 'Continuous', True)
    discreteColumns   = getColumnType(df, 'Discrete', True)
    ordinalColumns = getColumnType(df, 'Ordinal', True)


    continuousColumns.remove('LogSalePrice')
    x = scaleData(df.drop(columns=['LogSalePrice']), continuousColumns)
    #x = df.drop(columns=['LogSalePrice'])
    trainTestData = train_test_split(x, y, test_size=0.3, random_state=0)

    #models['Ridge'].plotHyperParams(*trainTestData, 1)
    # models['Lasso'].plotHyperParams(*trainTestData,2)
    # models['Elastic Net'].plotHyperParams(*trainTestData)

    # models['Ridge'].plotHyperParams(*trainTestData)
    # models['SVM'].plotHyperParams(*trainTestData)
    for i, model in enumerate(models.values()):
        model.plotHyperParams(*trainTestData, i)

    models['Linear'].time,         returnValue = ut.getExecutionTime(lambda: models['Linear'].fit(*trainTestData))
    models['Ridge'].time,          returnValue = ut.getExecutionTime(lambda: models['Ridge'].fitCV(*trainTestData))
    models['Lasso'].time,          returnValue = ut.getExecutionTime(lambda: models['Lasso'].fitCV(*trainTestData))
    models['Elastic Net'].time,    returnValue = ut.getExecutionTime(lambda: models['Elastic Net'].fitCV(*trainTestData))

    models['Random Forest'].time,  returnValue = ut.getExecutionTime(lambda: models['Random Forest'].fitCV(*trainTestData))
    models['Gradient Boost'].time, returnValue = ut.getExecutionTime(lambda: models['Gradient Boost'].fitCV(*trainTestData))
    models['SVM'].time,            returnValue = ut.getExecutionTime(lambda: models['SVM'].fitCV(*trainTestData))

    results = pd.DataFrame([r.__dict__ for r in models.values()]).drop(columns=['model', 'modelCV'] )

    roundColumns4Digits = ['trainScore', 'testScore']
    #roundColumns8Digits = ['trainRMSE', 'testRMSE']
    for c in roundColumns4Digits:
        results[c] = results[c].apply(ut.roundTraditional, args = (4,) )

    results.to_excel('Output/Model Results.xlsx')
    print('Finished Regressions')
    return models
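# The fitCV calls above come from the project's model wrapper, which is not shown.
# A minimal sketch of the kind of cross-validated fit it presumably wraps, using
# scikit-learn's GridSearchCV over Ridge as an illustrative (assumed) example:
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV

def fitRidgeCV(xTrain, xTest, yTrain, yTest):
    # grid-search the regularization strength with 5-fold CV, then score the hold-out set
    search = GridSearchCV(Ridge(), {'alpha': np.logspace(-3, 3, 13)}, cv=5)
    search.fit(xTrain, yTrain)
    return search.best_estimator_, search.score(xTest, yTest)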
def plotResults(
    train,
    modDfTrain,
    models,
):
    ut.plotDF(
        train[['SalePrice']], histParams, {
            xTickFormatDollars: '',
            yTickFormat: '',
            'grid': None,
            'xlabel': 'Sale Price',
            'title': 'Histogram of Sale Price of Houses in Ames, Iowa',
            'savefig': histDir + 'SalePrice.png'
        })

    ut.plotDF(
        modDfTrain[['LogSalePrice']], histParams, {
            yTickFormat: '',
            'grid': None,
            'xlabel': 'Log Sale Price',
            'title': 'Histogram of Log Sale Price of Houses in Ames, Iowa',
            'savefig': histDir + 'LogSalePrice.png'
        })

    nulls = ut.getNullColumns(train)
    nullsP = ut.getNullPercents(train)

    ut.plotDF(nullsP, {
        'kind': 'barh',
        'x': 'Column',
        'y': 'Null Percent',
        'legend': False
    }, {
        'grid': None,
        xTickFormatPercent: '',
        'xlabel': 'Percent of Null Values',
        'title': 'Bar Plot of Null Columns ',
        'savefig': barDir + 'Null Percents.png'
    },
              removeOutliersBeforePlotting=False)

    modelResults = pd.read_excel('Output/Model Results.xlsx')

    ut.plotDF(modelResults.sort_values(by='testScore', ascending=True), {
        'kind': 'barh',
        'x': 'name',
        'y': 'testScore',
        'legend': False
    }, {
        'xticks': xTickMultLS,
        'grid': None,
        'xlabel': 'Test Score',
        'ylabel': 'Model Name',
        'title': 'Bar Plot of Model Scores ',
        'savefig': barDir + 'Model Scores.png'
    },
              removeOutliersBeforePlotting=False)

    modelResults['time'] = modelResults['time'].apply(ut.getTime)
    ut.plotDF(modelResults.sort_values(by='time', ascending=True), {
        'kind': 'barh',
        'x': 'name',
        'y': 'time',
        'legend': False
    }, {
        'xticks': xTickMult,
        'grid': None,
        'xlabel': 'Time (Seconds)',
        'ylabel': 'Model Name',
        'title': 'Bar Plot of Model Times ',
        'savefig': barDir + 'Model Times.png'
    },
              removeOutliersBeforePlotting=False)

    lasso = models['Lasso']
    coefs = lasso.model.coef_
    columns = modDfTrain.drop(columns='LogSalePrice').columns

    lassoCoefs = pd.DataFrame({'Variable': columns, 'Coefficient': coefs})
    lassoCoefs = lassoCoefs[lassoCoefs['Coefficient'] > 0.001].sort_values(
        by='Coefficient', ascending=True)

    ut.plotDF(lassoCoefs, {
        'kind': 'barh',
        'x': 'Variable',
        'y': 'Coefficient',
        'legend': False
    }, {
        'xticks': xTickMultLS,
        'grid': None,
        'xlabel': 'Lasso Coefficient',
        'ylabel': 'Variable Name',
        'title': 'Bar Plot of Lasso Feature Selection ',
        'savefig': barDir + 'Lasso Feature Selection.png'
    },
              removeOutliersBeforePlotting=False)

    gbm = models['Gradient Boost']
    coefs = gbm.model.feature_importances_

    gbmCoefs = pd.DataFrame({
        'Variable': columns,
        'Coefficient': coefs
    }).sort_values(by='Coefficient', ascending=True)
    gbmCoefs = gbmCoefs[gbmCoefs['Coefficient'] > 0.001]

    ut.plotDF(gbmCoefs, {
        'kind': 'barh',
        'x': 'Variable',
        'y': 'Coefficient',
        'legend': False
    }, {
        'xticks': xTickMultLS,
        'grid': None,
        'xlabel': 'Gradient Boost Coefficient',
        'ylabel': 'Variable Name',
        'title': 'Bar Plot of Gradient Boost Feature Selection ',
        'savefig': barDir + 'Gradient Boost Feature Selection.png'
    },
              removeOutliersBeforePlotting=False)

    # ut.plotDF(train[['SalePrice']], {'kind': 'bar', 'x': ,'y': },
    #        {xTickFormatDollars: '',
    #         yTickFormat: '',
    #         'grid': None,
    #         'xlabel': 'Sale Price',
    #         'title': 'Histogram of Sale Price of Houses in Ames, Iowa',
    #         'savefig': histDir + 'SalePrice.png'})

    print('Finished Visualizations')
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import pandas as pd

# `ut` below is the project's utility module; its import is not shown in this snippet.

histDir = 'Output/Histograms/'
scatterDir = 'Output/Scatterplots/'
barDir = 'Output/Barplots/'

histParams = {'kind': 'hist', 'legend': False, 'bins': 50}
barParams = {'kind': 'bar', 'legend': False}
figParams = {'x': 7, 'y': 7}

plt.rc('font', size=40)
plt.rc('axes', labelsize=60)
plt.rc('axes', titlesize=60)

xTickMult = lambda: ut.multiplyRange(plt.xticks()[0], 0.5)
xTickMultLS = lambda: ut.multiplyLinSpace(plt.xticks()[0], 2)
yTickFormat = lambda: plt.gca().yaxis.set_major_formatter(
    plt.FormatStrFormatter('%.0f'))
xTickFormatPercent = lambda: plt.gca().xaxis.set_major_formatter(
    mtick.PercentFormatter(decimals=0))
xTickFormatCommas = lambda: plt.gca().xaxis.set_major_formatter(
    mpl.ticker.StrMethodFormatter('{x:,.0f}'))
xTickFormatDollars = lambda x=0: plt.gca().xaxis.set_major_formatter(
    mpl.ticker.StrMethodFormatter('${x:,.' + str(x) + 'f}'))
#setTickIn = lambda: plt.gca().tick_params(axis='x', direction='in')
trimTicks = lambda: plt.xticks()[0:-1]
nullsDir = 'Visualizations/Nulls/'
histParams = {'kind': 'hist', 'legend': False, 'bins': 100}

featureSelectVIF = False

fullDF = pd.concat([train, test])  # DataFrame.append was removed in pandas 2.x

## impute data
imputed = impute(fullDF.copy())

## performs one hot encoding on nominal vars and label encoding on ordinal vars
categorical = categoricalEncoding(imputed.copy())

## adds or manipulates columns
modDF = dfMods(categorical.copy(), featureSelectVIF)

## adds or modifies columns to prepare for regressions

trainLen = ut.getRowsNum(train)
modDfTrain = modDF.iloc[0:trainLen, ]
modDfTest = modDF.iloc[trainLen:, ].copy()
modDfTest.drop(columns=['LogSalePrice'], inplace=True)

## performs regressions and returns dataframe with output
if generateModels:

    time, models = ut.getExecutionTime(lambda: performRegressions(modDfTrain))
    ut.pickleObject(models, 'Output/models.pkl')
else:
    models = ut.unpickleObject('Output/models.pkl')
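# ut.pickleObject / ut.unpickleObject are not shown; they are assumed to be thin
# wrappers around the standard library, roughly:
import pickle

def pickleObject(obj, path):
    with open(path, 'wb') as f:
        pickle.dump(obj, f)

def unpickleObject(path):
    with open(path, 'rb') as f:
        return pickle.load(f)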

# modDfDiffColumns = ut.getColumnDiff(modDF, modTest)
# modDfDiffColumns2 = ut.getColumnDiff(modTest, modDF)
def categoricalEncoding(imputed: pd.DataFrame):

    nominal, ordinal, discrete, continuous = splitDfByCategory(imputed)

    # nominal train:    (1460, 23) test: (1459, 23)
    # ordinal train:    (1460, 21) test: (1459, 21)
    # discrete train:   (1460, 14) test: (1459, 14)
    # continuous train: (1460, 17) test: (1459, 16)

    # nominalCounts = {}
    # for c in nominal.columns:
    #     nominalCounts[c] = nominal[c].value_counts()
    # ut.printDict(nominalCounts, 'Nominal Counts:')
    #
    # ordinalCounts = {}
    # for c in ordinal.columns:
    #     ordinalCounts[c] = ordinal[c].value_counts()
    # ut.printDict(ordinalCounts, "Ordinal Counts:")

    qualityDict = {np.nan: 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5}
    bsmtFinType = {
        np.nan: -1,
        'Unf': 0,
        'LwQ': 1,
        'Rec': 2,
        'BLQ': 3,
        'ALQ': 4,
        'GLQ': 5
    }
    ordinal['LotShape'].replace({
        'Reg': 0,
        'IR1': 1,
        'IR2': 2,
        'IR3': 3
    },
                                inplace=True)
    ordinal['LandContour'].replace({
        'Lvl': 0,
        'Bnk': 1,
        'HLS': 2,
        'Low': 3
    },
                                   inplace=True)
    ordinal['Utilities'].replace(
        {
            np.nan: -1,
            'ELO': 0,
            'NoSeWa': 1,
            'NoSewr': 2,
            'AllPub': 3
        },
        inplace=True)
    ordinal['LandSlope'].replace({'Gtl': 0, 'Mod': 1, 'Sev': 2}, inplace=True)
    ordinal['ExterQual'].replace(qualityDict, inplace=True)
    ordinal['ExterCond'].replace(qualityDict, inplace=True)

    ordinal['BsmtQual'].replace(qualityDict, inplace=True)
    ordinal['BsmtCond'].replace(qualityDict, inplace=True)
    ordinal['BsmtExposure'].replace(
        {
            np.nan: -1,
            'No': 0,
            'Mn': 1,
            'Av': 2,
            'Gd': 3
        }, inplace=True)
    ordinal['BsmtFinType1'].replace(bsmtFinType, inplace=True)
    ordinal['BsmtFinType2'].replace(bsmtFinType, inplace=True)
    ordinal['HeatingQC'].replace(qualityDict, inplace=True)
    ordinal['KitchenQual'].replace(qualityDict, inplace=True)
    ordinal['FireplaceQu'].replace(qualityDict, inplace=True)
    ordinal['GarageFinish'].replace({
        np.nan: -1,
        'Unf': 0,
        'RFn': 1,
        'Fin': 2
    },
                                    inplace=True)
    ordinal['GarageQual'].replace(qualityDict, inplace=True)
    ordinal['GarageCond'].replace(qualityDict, inplace=True)
    ordinal['PavedDrive'].replace({'N': 0, 'P': 1, 'Y': 2}, inplace=True)
    #ordinal['PoolQC'].replace(qualityDict, inplace=True)
    ordinal['Fence'].replace(
        {
            np.nan: -1,
            'MnWw': 0,
            'GdWo': 1,
            'MnPrv': 2,
            'GdPrv': 3
        },
        inplace=True)

    #ordinal[''].replace({}, inplace=True)

    #nulls = ut.getNulls(ordinal)
    ordinal = ordinal.astype(np.int64)  # cast all encoded ordinal codes to int64
    dummies = pd.DataFrame()

    for c in nominal.columns:
        dummy = pd.get_dummies(nominal[c])
        dummies = pd.concat([dummies, dummy], axis=1)

    categoricalFull = ut.appendColumns(
        [dummies, ordinal, discrete, continuous])

    #ut.printNulls(categoricalFull)
    #nulls = ut.getNulls(categoricalFull)
    print('Finished Categorical Encoding', '\n')
    return categoricalFull
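# One caveat with the get_dummies loop above: without a prefix, dummy columns from
# different nominal variables can collide when two variables share a level name.
# A sketch of a variant that lets pandas prefix each dummy with its source column:
def encodeNominal(nominal: pd.DataFrame) -> pd.DataFrame:
    # one-hot encode every nominal column at once; each dummy is named '<column>_<level>'
    return pd.get_dummies(nominal, columns=list(nominal.columns))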