示例#1
0
        get_ipython().magic("time print(np.sqrt(-cross_val_score(model, trainingData, trainingData['isSpam'], cv=10, scoring='mean_squared_error')).mean())")
        return
            
    bestRun = []
    for _ in range(20):
        rand = RandomizedSearchCV(model, paramDistribution, cv=10, scoring = 'accuracy', n_iter = 10)
        rand.fit(trainingData, trainingData['isSpam'])
        # examine the best model
        bestRun.append({'score' : round(rand.best_score_,3), 'params' : rand.best_params_})
    print(max(bestRun, key=lambda x:x['score']))
    return max(bestRun, key=lambda x:x['score'])
    


# Read the raw 'site' collection from the local MongoDB instance.
# NOTE(review): no_id=False presumably keeps the Mongo '_id' column - confirm
# against read_mongo.
rawSite = read_mongo(db='CB', collection='site', host='localhost', no_id=False)

# Work on a copy of the site data without the columns listed below.
# ('theme' is listed twice, as in the original list; the repeat is harmless.)
siteModified = rawSite.drop(
    [
        'dismissedOnboarding', 'feedCounter', 'feedToken',
        'modules', 'password', 'theme', 'photoId', 'requestAccess',
        'requestPassword', 'bi', 'photo', 'goFundMe', 'lastName',
        'numAmps', 'partner', 'size', 'theme', 'createFormSessionId',
        'allowList', 'blockList', 'displayEmail', 'isPhotoOrderingFixed',
        'healthCondition', 'spam', 'status', 'firstName', 'lastInvite',
        'isDeleted', 'hasCommentFix', 'age',
    ],
    axis='columns',
)

# Expand every 'cm' entry into its own columns; -1 marks a missing value.
# NOTE(review): assumes 'cm' holds dict-like records with a 'vpw' key - confirm.
viewPort = siteModified['cm'].apply(pd.Series).fillna(-1)

# Derived features: a 0/1 flag telling whether 'vpw' was present, plus the
# lengths of the free-text fields (taken from the untouched raw frame).
siteModified['hasJavaScriptOn'] = [int(vp != -1) for vp in viewPort['vpw']]
siteModified['descriptionLen'] = rawSite['description'].str.len()
siteModified['nameLen'] = rawSite['name'].str.len()
siteModified['titleLen'] = rawSite['title'].str.len()

# The source columns of the derived features are no longer needed.
siteModified.drop(['cm', 'description', 'name'], axis='columns', inplace=True)
示例#2
0
    final = pd.concat([test, dfWithClass], axis=1)
    #take a look at the confusion matrix
    print(pd.crosstab(final.isSpam, final.predictedClass))
    print("0s: %d, 1s: %d" %
          (np.sum((final.isSpam == 0) & (final.predictedClass == 0)),
           np.sum((final.isSpam == 1) & (final.predictedClass == 1))))
    print(
        "Accuracy: %.3f" %
        float(np.sum(final.isSpam == final.predictedClass) / float(len(test))))
    print("Precision: %.3f" % float(
        np.sum((final.isSpam == 1) & (final.predictedClass == 1)) /
        np.sum(final.isSpam == 1)))


# Read journals: one row per journal entry holding its site id and a single
# 'text' field made of the title and the body joined by a space.
rawJournals = read_mongo(db='CB', collection='journal', host='localhost')
# NOTE(review): assumes each 'body' value is a scalar so it fits the single
# 'content' column, and that rawJournals has a default 0..n-1 index so the
# column assignments below line up row by row - confirm against read_mongo.
journals = pd.DataFrame(list(rawJournals['body']), columns=['content'])
journals['siteId'] = rawJournals['siteId']
journals['text'] = rawJournals['title'].astype(str) + ' ' + journals['content']
# 'content' was only a staging column for building 'text'.
journals.drop(['content'], inplace=True, axis=1)

# Read site ids: one row per site with its spam flag stored as 'isSiteSpam'.
rawSite = read_mongo(db='CB', collection='site', host='localhost', no_id=False)
siteIds = pd.DataFrame(list(rawSite['_id']), columns=['siteId'])
siteIds['isSiteSpam'] = rawSite['isSpam']
# Sites without a spam flag count as not spam (0). The filled column is
# assigned back explicitly: calling fillna(..., inplace=True) on a column
# selected from the frame is a chained assignment, which pandas warns about
# and which does not update the frame once Copy-on-Write is enabled.
siteIds['isSiteSpam'] = siteIds['isSiteSpam'].fillna(0)

#spam data from file
octSiteProfileSpam = pd.read_csv(
    "/Users/dmurali/Documents/spamlist_round25_from_20150809_to_20151015.csv",
示例#3
0
                                  paramDistribution,
                                  cv=10,
                                  scoring='accuracy',
                                  n_iter=10)
        rand.fit(trainingData, trainingData['isSpam'])
        # examine the best model
        bestRun.append({
            'score': round(rand.best_score_, 3),
            'params': rand.best_params_
        })
    print(max(bestRun, key=lambda x: x['score']))
    return max(bestRun, key=lambda x: x['score'])


# Read the raw 'site' collection from the local MongoDB instance.
# NOTE(review): no_id=False presumably keeps the Mongo '_id' column - confirm
# against read_mongo.
rawSite = read_mongo(db='CB', collection='site', host='localhost', no_id=False)

# Work on a copy of the site data without the columns listed below.
# ('theme' is listed twice, as in the original list; the repeat is harmless.)
siteModified = rawSite.drop(
    [
        'dismissedOnboarding', 'feedCounter', 'feedToken', 'modules',
        'password', 'theme', 'photoId', 'requestAccess', 'requestPassword',
        'bi', 'photo', 'goFundMe', 'lastName', 'numAmps', 'partner', 'size',
        'theme', 'createFormSessionId', 'allowList', 'blockList',
        'displayEmail', 'isPhotoOrderingFixed', 'healthCondition', 'spam',
        'status', 'firstName', 'lastInvite', 'isDeleted', 'hasCommentFix',
        'age',
    ],
    axis='columns',
)

# Expand every 'cm' entry into its own columns; -1 marks a missing value.
# NOTE(review): assumes 'cm' holds dict-like records with a 'vpw' key - confirm.
viewPort = siteModified['cm'].apply(pd.Series).fillna(-1)

# Derived features: a 0/1 flag telling whether 'vpw' was present, plus the
# lengths of the free-text fields (taken from the untouched raw frame).
siteModified['hasJavaScriptOn'] = [int(vp != -1) for vp in viewPort['vpw']]
siteModified['descriptionLen'] = rawSite['description'].str.len()
siteModified['nameLen'] = rawSite['name'].str.len()

# The source columns of the first two derived features are no longer needed.
siteModified.drop(['cm', 'description'], axis='columns', inplace=True)
   
    
    predictor.fit(train, train['isSpam'])
    predicted = predictor.predict(test)
    
    dfWithClass = pd.DataFrame(predicted, columns = ['predictedClass'])
    final = pd.concat([test, dfWithClass], axis=1)
    #take a look at the confusion matrix
    print(pd.crosstab(final.isSpam, final.predictedClass))
    print("0s: %d, 1s: %d" %(np.sum((final.isSpam == 0) & (final.predictedClass == 0)), np.sum((final.isSpam == 1) & (final.predictedClass == 1))))
    print("Accuracy: %.3f" %float(np.sum(final.isSpam == final.predictedClass) / float(len(test))))
    print("Precision: %.3f" %float(np.sum((final.isSpam == 1) & (final.predictedClass == 1)) / np.sum(final.isSpam == 1)))
    

# Read journals: one row per journal entry holding its site id and a single
# 'text' field made of the title and the body joined by a space.
rawJournals = read_mongo(db = 'CB', collection = 'journal', host = 'localhost')
# NOTE(review): assumes each 'body' value is a scalar so it fits the single
# 'content' column, and that rawJournals has a default 0..n-1 index so the
# column assignments below line up row by row - confirm against read_mongo.
journals = pd.DataFrame(list(rawJournals['body']), columns = ['content'])
journals['siteId'] = rawJournals['siteId']
journals['text'] = rawJournals['title'].astype(str) + ' ' + journals['content']
# 'content' was only a staging column for building 'text'.
journals.drop(['content'], inplace = True, axis = 1)

# Read site ids: one row per site with its spam flag stored as 'isSiteSpam'.
rawSite = read_mongo(db = 'CB', collection = 'site', host = 'localhost', no_id = False)
siteIds = pd.DataFrame(list(rawSite['_id']), columns = ['siteId'])
siteIds['isSiteSpam'] = rawSite['isSpam']
# Sites without a spam flag count as not spam (0). The filled column is
# assigned back explicitly: calling fillna(..., inplace=True) on a column
# selected from the frame is a chained assignment, which pandas warns about
# and which does not update the frame once Copy-on-Write is enabled.
siteIds['isSiteSpam'] = siteIds['isSiteSpam'].fillna(0)

# Spam labels exported to a CSV file; only the two columns used are loaded.
# NOTE(review): the path is an absolute location on one developer's machine,
# so this only runs where that file exists.
octSiteProfileSpam = pd.read_csv("/Users/dmurali/Documents/spamlist_round25_from_20150809_to_20151015.csv",
                    usecols = ['siteId','isSpam'])