Python clean_data示例，main.clean_data Python示例

示例#1

0

显示文件

文件： tests.py 项目： funshoelias/idb

 def test_name_parse1(self):
     """Check that clean_data drops the trailing ``_390`` segment of ``name``.

     Both cases use type ``2`` and the 'Authentic Jobs' provider. Only the
     last underscore-separated segment is expected to be removed; earlier
     underscores in the name must survive.
     """
     sub = {'name': 'lmao_390', 'image': 'x', 'provider': 'Authentic Jobs'}
     type_ = 2
     res = main.clean_data(type_, sub)
     # assertEqual reports both values on failure, unlike assertTrue(a == b).
     self.assertEqual(res['name'], 'lmao')

     # Multiple underscores: only the final segment should be stripped.
     sub2 = {
         'name': 'lmao_lmao_lmao_390',
         'image': 'x',
         'provider': 'Authentic Jobs',
     }
     res2 = main.clean_data(type_, sub2)
     self.assertEqual(res2['name'], 'lmao_lmao_lmao')

示例#2

0

显示文件

文件： process_task.py 项目： guojianyu/top_project

 def excutor_task(self, myqueue):
     """Execute tasks pulled from ``myqueue`` in an endless loop.

     Each truthy item returned by ``myqueue.get()`` is printed and handed to
     ``main.clean_data().run``. A falsy item makes the loop sleep for one
     second instead. Any ``Exception`` raised along the way is printed and
     the loop carries on, so this method never returns.
     """
     while True:
         try:
             job = myqueue.get()
             if not job:
                 # Nothing to do for a falsy item; back off briefly.
                 time.sleep(1)
                 continue
             # Original note: switch the task to the "running" state
             # (the code shown here only prints it).
             print(job)
             # Perform the task action.
             main.clean_data().run(job)
         except Exception as err:
             print('执行任务异常：', err)

示例#3

0

显示文件

 def excutor_task(self, myqueue):  # execute tasks
     """Start ten worker threads running ``main.clean_data().run`` and wait.

     A fresh ``main.clean_data()`` object is created for every thread. The
     method blocks until all ten threads have finished.

     NOTE(review): ``myqueue`` is not used in this body; it is kept so the
     signature stays unchanged for callers.
     """
     workers = []
     for _ in range(10):
         # Pass the bound method itself. Writing ``run()`` here would execute
         # it synchronously in the calling thread and give Thread its return
         # value as the target, so nothing would run concurrently.
         worker = threading.Thread(target=main.clean_data().run)
         worker.start()
         workers.append(worker)
     for worker in workers:
         worker.join()

示例#4

0

显示文件

文件： app.py 项目： johnelmer/Titanic-Machine-Learning

def train_model():
    """Fit a model on ``./data/train.csv`` and return it.

    The ``Survived`` column is removed from the frame and used as the target.
    The remaining columns go through ``main.clean_data`` and then
    ``main.one_hot_encode`` before being passed to ``fit`` on the object
    returned by ``main.build_model()``.
    """
    frame = pd.read_csv('./data/train.csv')
    target = frame.pop('Survived')
    features = main.one_hot_encode(main.clean_data(frame))
    estimator = main.build_model()
    estimator.fit(features, target)
    return estimator

示例#5

0

显示文件

文件： tests.py 项目： funshoelias/idb

 def test_image3(self):
     """Check that a ``'null'`` image is replaced for a 'Github Jobs' entry.

     With type ``2``, clean_data is expected to swap the literal string
     ``'null'`` for the fixed Twitter-hosted image URL asserted below.
     """
     sub = {'name': 'x', 'image': 'null', 'provider': 'Github Jobs'}
     type_ = 2
     res = main.clean_data(type_, sub)
     # Dedicated assert methods show the offending value when they fail.
     self.assertNotEqual(res['image'], 'null')
     self.assertEqual(
         res['image'],
         'https://pbs.twimg.com/profile_images/625760778554093568/dM7xD4SQ_400x400.png'
     )

示例#6

0

显示文件

文件： tests.py 项目： funshoelias/idb

    def test_image1(self):
        """Check that Udemy image URLs have ``125_H`` replaced by ``480x270``.

        The same input dict is run through clean_data with type ``1`` and
        then with type ``0``; both results must carry the rewritten URL.
        """
        sub = {
            'name': 'x',
            'image':
            'https://udemy-images.udemy.com/course/125_H/364426_2991_5.jpg',
            'provider': 'Udemy'
        }
        type_ = 1
        type_2 = 0

        res = main.clean_data(type_, sub)
        # assertNotIn / assertEqual report the actual URL on failure.
        self.assertNotIn('125_H', res['image'])
        # The expected value is computed after the call, as in the original,
        # so the check behaves the same if clean_data mutates ``sub``.
        self.assertEqual(
            res['image'], sub['image'].replace('125_H', '480x270'))

        res2 = main.clean_data(type_2, sub)
        self.assertNotIn('125_H', res2['image'])
        self.assertEqual(
            res2['image'], sub['image'].replace('125_H', '480x270'))

示例#7

0

显示文件

文件： app.py 项目： johnelmer/Titanic-Machine-Learning

def index():
    if request.method == 'POST':
        f = request.files['file']
        if f and validate_files(f):
            model = train_model()
            test = pd.read_csv(f)
            ids = test['PassengerId']
            test = main.clean_data(test)
            test = main.one_hot_encode(test)
            result = pd.DataFrame({'PassengerId': ids, 'Survived': model.predict(test)})
            return result.to_html(index = False)

    return '''

示例#8

0

显示文件

文件： tests.py 项目： funshoelias/idb

    def test_image2(self):
        """Check image URL repair for 'Authentic Jobs' entries (type ``2``).

        Two cases are covered:

        * a URL with a second ``https://authenticjobs.s3.amazonaws.com``
          prefix glued into it must come back with that inner prefix removed;
        * a URL pointing at ``company-blank.png`` must be replaced by the
          fixed Vimeo portrait URL asserted below.
        """
        sub = {
            'name': 'x_',
            'image':
            'https://d2fcz5no062gar.cloudfront.nethttps://authenticjobs.s3.amazonaws.com/uploads/logos/lbvf6cci6jno2f4tzl2nsoip4xoam1n9/thumb/logo.png',
            'provider': 'Authentic Jobs'
        }
        sub_empty = {
            'name': 'x_',
            'image':
            'https://d2fcz5no062gar.cloudfront.net/uploads/logos/lbvf6cci6jno2f4tzl2nsoip4xoam1n9/thumb/company-blank.png',
            'provider': 'Authentic Jobs'
        }
        type_ = 2
        res = main.clean_data(type_, sub)
        # Splitting on "https://" yields at most two parts when the scheme
        # occurs no more than once in the cleaned URL.
        self.assertLessEqual(len(res['image'].split("https://")), 2)
        self.assertEqual(
            res['image'],
            sub['image'].replace('https://authenticjobs.s3.amazonaws.com', ''))

        res2 = main.clean_data(type_, sub_empty)
        self.assertNotIn('company-blank.png', res2['image'])
        self.assertEqual(
            res2['image'], 'https://i.vimeocdn.com/portrait/3831018_300x300')

示例#9

0

显示文件

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn import tree
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score
plt.style.use('ggplot')

if __name__ == '__main__':
    # Load the training CSV and pass it through clean_data (defined outside
    # this excerpt) together with the two date column names and thresholds.
    data = clean_data(
        pd.read_csv("data/churn_train.csv"),
        ['last_trip_date', 'signup_date'],
        thresh_dict={"driver": 5, "passenger": 5},
    )

    # 'churn' is the target; the two date columns are left out of the features.
    y = data.pop('churn')
    X = data.drop('last_trip_date', axis=1).drop('signup_date', axis=1)

    # Hold out 20% of the rows for testing, with a fixed seed.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42)

    # Fit a Gini decision tree on the training split.
    clf = tree.DecisionTreeClassifier(
        criterion='gini',
        max_depth=8,
        min_samples_leaf=2,
    ).fit(X_train, y_train)
    feature_names = X_train.columns

示例#10

0

显示文件

文件： plotting.py 项目： chrisfeller/Kaggle_Competitions

                       kde=True,
                       bins=75,
                       ax=axs[0],
                       color='steelblue')
    ax1.set_xlabel('Normal Sale Price')
    ax1.set_ylabel('Frequency', size=12)
    ax2 = sns.distplot(df['SalePrice_Log'],
                       kde=True,
                       bins=75,
                       ax=axs[1],
                       color='steelblue')
    ax2.set_xlabel('Log Transformed Sale Price')
    ax2.set_ylabel('Frequency', size=12)
    plt.tight_layout()
    plt.subplots_adjust(top=0.9)
    plt.show()


if __name__ == '__main__':
    # Load the train and test sets via load_data() (defined outside this
    # excerpt).
    train, test = load_data()

    # Clean the training data, then draw its scatter matrix.
    # NOTE(review): dummy=False presumably disables dummy/one-hot encoding in
    # clean_data -- confirm against its definition.
    train = clean_data(train, dummy=False)
    # Correlation-matrix plot is currently disabled:
    # plot_correlation_matrix(train)
    scatter_matrix(train)

    # Add a log1p-transformed copy of SalePrice and plot its distribution
    train['SalePrice_Log'] = np.log1p(train["SalePrice"])
    saleprice_dist(train)

示例#11

0

显示文件

文件： boosted_tree.py 项目： Jbudnick/Rideshare_Case_Study

    print("-" * 55)
    for param, vals in parameter_grid.items():
        print("{0:<20s} | {1:<8s} | {2}".format(str(param),
                                                str(best_params[param]),
                                                str(vals)))
    return best_params, model_best


if __name__ == "__main__":
    # Read the full, test and training churn CSVs, in that order.
    churn_df, churn_test_df, churn_train_df = (
        pd.read_csv(csv_path)
        for csv_path in ('data/churn.csv',
                         'data/churn_test.csv',
                         'data/churn_train.csv')
    )

    # Run each frame through clean_data (defined outside this excerpt) with
    # the same date columns and thresholds. The list and dict are rebuilt for
    # every call so no argument object is shared between calls.
    churn_df, churn_test_df, churn_train_df = (
        clean_data(frame,
                   ['last_trip_date', 'signup_date'],
                   thresh_dict={"driver": 5, "passenger": 5})
        for frame in (churn_df, churn_test_df, churn_train_df)
    )

示例#12

0

显示文件

文件： randforest.py 项目： Jbudnick/Rideshare_Case_Study

    ax.set_title(f"Feature importances - {type(model).__name__}")
    ax.set_xlabel("Feature", fontsize=16)
    ax.set_ylabel("Feature importance", fontsize=16)
    plt.tight_layout()
    plt.savefig(out_filepath)
    plt.show()
    return


if __name__ == '__main__':
    # Read the full, test and training churn CSVs, in that order.
    churn_df, churn_test_df, churn_train_df = (
        pd.read_csv(csv_path)
        for csv_path in ('data/churn.csv',
                         'data/churn_test.csv',
                         'data/churn_train.csv')
    )

    # Run each frame through clean_data (defined outside this excerpt) with
    # the two date column names; a fresh list is built for every call.
    churn_df, churn_test_df, churn_train_df = (
        clean_data(frame, ['last_trip_date', 'signup_date'])
        for frame in (churn_df, churn_test_df, churn_train_df)
    )

    # Drop the date columns in place: training frame first, then test frame.
    churn_train_df.drop(['last_trip_date', 'signup_date'], axis=1, inplace=True)
    churn_test_df.drop(['last_trip_date', 'signup_date'],
                       axis=1,
                       inplace=True)

    # 'churn' is the target; whatever remains becomes the feature matrix.
    y = churn_train_df.pop('churn').values
    X = churn_train_df.values
    # 80/20 split; no random_state is given, so the split varies between runs.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=0.80)