示例#1
0
def check_email():
# set script to run at intervals (time.sleep?) - morn, mid-day, afternoon

    current_time = datetime.now()
    previous_check = cpm.unpickle(os.path.join(pkl_dir, 'last_check_time.pkl'))
    print previous_check
    emails = cpm.get_emails('INBOX', previous_check)

    if not emails: # avoid unpickling if empty inbox
        return

    feature_model = cpm.unpickle(os.path.join(pkl_dir, 'final_vec.pkl'))

    classifier_model = cpm.unpickle(os.path.join(pkl_dir, 'final_model.pkl')) 
    
    print 'classifier model', classifier_model

    for email in emails:
        if email.sent_at > previous_check:
            clean_b = cpm.clean_raw_txt(email.body)
            clean_s = cpm.clean_raw_txt(email.subject)
            print 'clean_email', clean_s, clean_b
            email_features = feature_model.transform([clean_b+clean_s])
            print 'email_bag', email_features.shape
            classifier_result = classifier_model.predict(email_features)
            
            eval_email(classifier_result, email)

    cpm.pickle(current_time, os.path.join(pkl_dir,'last_check_time.pkl'))
def main(model_name, X_test=None, y_test=None, models=None, save=False, train_fn='train_split.pkl', model_fn='final_model.pkl'):
    """Evaluate models on a test set and plot their ROC curves.

    Args:
        model_name: Sequence of display names, one per entry in ``models``
            and in the same order. A single string is also accepted and is
            treated as a one-element list.
        X_test: Test feature matrix exposing ``todense()``. When None, the
            test split is loaded from ``train_fn`` under ``pkl_dir``.
        y_test: Test labels. Replaced by the pickled labels whenever
            ``X_test`` is None.
        models: List of fitted models. When None, a single model is
            unpickled from ``model_fn`` under ``pkl_dir``.
        save: Forwarded unchanged to ``model_eval`` and ``plot_roc_curve``.
        train_fn: File name of the pickled
            ``(X_train, X_test, y_train, y_test)`` split.
        model_fn: File name of the pickled model used when ``models`` is None.

    Raises:
        IndexError: If ``model_name`` has fewer entries than ``models``.
    """
    #model_names = ['LogisticRegression', 'MultinomialNB', 'SVC', 'RandomForest', 'GradientBoost']

    # A bare string would be indexed character by character below
    # (model_name[0] == 'L'), so promote it to a one-element list of names.
    if isinstance(model_name, str):
        model_name = [model_name]

    if models is None:
        models = [cpm.unpickle(os.path.join(pkl_dir, model_fn))]

    if X_test is None:
        # Only the test half of the pickled split is needed here; note that
        # this also overrides any y_test supplied by the caller.
        _, X_test, _, y_test = cpm.unpickle(os.path.join(pkl_dir, train_fn))

    X_test = X_test.todense()

    for i, model in enumerate(models):
        model_eval(model_name[i], model, X_test, y_test, save)

    # Trim the names so the ROC plot gets exactly one name per model.
    plot_roc_curve(model_name[:len(models)], models, X_test, y_test, save)
def main(model_names, save=False, train_fn='train_split.pkl', model_fn='final_model.pkl', new_data_split=False, random=11):
    """Set up each requested classifier on the training split.

    Args:
        model_names: Iterable of keys selecting classifiers from the table
            defined below ('LogisticRegression', 'MultinomialNB', 'SVC',
            'RandomForest', 'GradientBoost').
        save: Forwarded to ``cpm.create_datasets`` (when a new split is made)
            and to ``setup_model``.
        train_fn: File name, under ``pkl_dir``, of the pickled
            ``(X_train, X_test, y_train, y_test)`` split used when
            ``new_data_split`` is False.
        model_fn: Not used by this function.
        new_data_split: When True, build a fresh split with
            ``cpm.create_datasets`` instead of loading the pickled one.
        random: Forwarded to ``cpm.create_datasets`` as ``random``.

    Returns:
        A list with the ``setup_model`` result for each name, in the order
        given by ``model_names``.

    Raises:
        KeyError: If a name in ``model_names`` is not in the classifier table.
    """
    if not new_data_split:
        split = cpm.unpickle(os.path.join(pkl_dir, train_fn))
    else:
        split = cpm.create_datasets(save=save, vectorize=True, random=random)

    # Only the training half of the four-way split is used here.
    X_train, _, y_train, _ = split
    X_train = X_train.todense()

    #model_names = ['LogisticRegression', 'MultinomialNB', 'SVC', 'RandomForest', 'GradientBoost']

    # Every candidate estimator is instantiated up front with its settings;
    # model_names then selects which of them are actually set up.
    class_model = {
        'LogisticRegression': LogisticRegression(
            C=10000.0, penalty='l2', class_weight='auto', fit_intercept=True),
        'MultinomialNB': MultinomialNB(alpha=0.100000, fit_prior=True),
        'SVC': SVC(
            C=10000.0, kernel='linear', shrinking=True, probability=True,
            class_weight='auto'),
        'RandomForest': RandomForestClassifier(
            n_estimators=121, criterion='entropy', bootstrap=True,
            oob_score=True, max_features='auto'),
        'GradientBoost': GradientBoostingClassifier(),
    }

    return [
        setup_model(name, class_model[name], X_train, y_train, save)
        for name in model_names
    ]
def get_feature_names(vectorizer=None, vec_fn='final_vec.pkl'):
    """Return the feature names reported by a vectorizer.

    Args:
        vectorizer: Object exposing ``get_feature_names()``. When None, the
            vectorizer is unpickled from ``vec_fn`` under ``pkl_dir``.
        vec_fn: File name of the pickled vectorizer used as the fallback.

    Returns:
        Whatever ``vectorizer.get_feature_names()`` returns.
    """
    # Identity check: `== None` would dispatch to a custom __eq__ and could
    # wrongly discard a vectorizer that was actually supplied.
    if vectorizer is None:
        vectorizer = cpm.unpickle(os.path.join(pkl_dir, vec_fn))

    return vectorizer.get_feature_names()