Пример #1
0
def train_rf(gt_folder, save_path, n_trees=150, max_depth=10, n_threads=4):
    """Train a random forest on all ground-truth files in a folder and pickle it.

    Every ``*.h5`` file directly inside ``gt_folder`` is passed to
    ``load_features_and_labels``. The returned features and labels are
    concatenated along axis 0 and used to fit a single
    ``RandomForestClassifier``. The feature names are attached to the fitted
    model as ``feature_names`` before it is pickled to ``save_path``.

    Args:
        gt_folder: Folder that is searched (non-recursively) for ``*.h5`` files.
        save_path: Destination file for the pickled classifier.
        n_trees: Number of trees, forwarded as ``n_estimators``.
        max_depth: Maximum tree depth, forwarded as ``max_depth``.
        n_threads: Number of parallel jobs, forwarded as ``n_jobs``.

    Raises:
        ValueError: If ``gt_folder`` contains no ``*.h5`` file, if two files
            report different feature names, or if the total number of feature
            rows differs from the total number of labels.
    """
    gt_files = glob(os.path.join(gt_folder, '*.h5'))
    if not gt_files:
        # Fail early with the folder name instead of the opaque
        # "need at least one array to concatenate" raised by numpy below.
        raise ValueError("No '*.h5' ground-truth files found in %s" % gt_folder)

    feat_names = None
    x, y = [], []
    for path in gt_files:
        feats, labs, this_feat_names = load_features_and_labels(path)

        if feat_names is None:
            # The first file defines the reference feature layout.
            feat_names = this_feat_names
        elif feat_names != this_feat_names:
            raise ValueError(
                "Feature names in %s differ from those of the other "
                "ground-truth files: expected %s, got %s"
                % (path, feat_names, this_feat_names)
            )

        x.append(feats)
        y.append(labs)

    x = np.concatenate(x, axis=0)
    y = np.concatenate(y, axis=0)
    if len(x) != len(y):
        raise ValueError(
            "Number of feature rows (%d) does not match number of labels (%d)"
            % (len(x), len(y))
        )

    rf = RandomForestClassifier(n_estimators=n_trees,
                                n_jobs=n_threads,
                                max_depth=max_depth)
    rf.fit(x, y)
    # monkey patch the feature names, so we can validate this later
    rf.feature_names = feat_names
    with open(save_path, 'wb') as f:
        pickle.dump(rf, f)
Пример #2
0
def train():
    """Train and save one random forest classifier per platform.

    Reads (Platform, Module, EmployeeDesignation) rows from the database
    through the module-level ``connection``. For each distinct platform the
    ``Module`` column is one-hot encoded, and a ``RandomForestClassifier`` is
    fitted to predict ``EmployeeDesignation`` from the module indicator
    columns. Each fitted model gets the list of module names attached as
    ``feature_names`` and is pickled to
    ``<settings.BASE_DIR>/models/rfcs/<platform>.sav``.

    Returns:
        None. The only results are the pickled model files.
    """
    query = '''SELECT
                    webapp_platform.platform AS Platform,
                    webapp_modules.module AS Module,
                    webapp_designation.designation AS EmployeeDesignation
                FROM
                    webapp_designation
                    JOIN webapp_employeemodules ON webapp_employeemodules.designation_id = webapp_designation.id
                    JOIN webapp_employeemodules_module ON webapp_employeemodules_module.employeemodules_id = webapp_employeemodules.id
                    JOIN webapp_employeemodules_platform ON webapp_employeemodules_platform.employeemodules_id = webapp_employeemodules_module.employeemodules_id
                    JOIN webapp_platform ON webapp_platform.id = webapp_employeemodules_platform.platform_id
                    JOIN webapp_modules ON webapp_modules.id = webapp_employeemodules_module.modules_id
                ORDER BY EmployeeDesignation, Platform'''
    train_data_df = pd.read_sql_query(query, connection)

    # NOTE: earlier revisions trained a DecisionTreeClassifier and an
    # SVC(gamma='auto', probability=True) in exactly the same way, saving them
    # under 'models/dts' and 'models/svcs'. Only the random forest is kept.
    for platform in train_data_df['Platform'].unique().tolist():
        platform_df = train_data_df[train_data_df['Platform'] == platform]

        # The modules seen for this platform are the model's input features.
        feature_names = platform_df['Module'].unique().tolist()

        # With an empty prefix the indicator columns are named after the
        # modules themselves, so they can be selected via feature_names.
        encoded_df = pd.get_dummies(platform_df, columns=['Module'],
                                    prefix="", prefix_sep="")

        rfc = RandomForestClassifier(n_estimators=100)
        rfc.fit(encoded_df[feature_names], encoded_df['EmployeeDesignation'])
        # Attach the feature order so it can be recovered from the saved model.
        rfc.feature_names = feature_names

        model_name = platform + '.sav'
        path = os.path.join(settings.BASE_DIR, 'models/rfcs', model_name)
        # Context manager guarantees the file is flushed and closed.
        with open(path, 'wb') as f:
            pickle.dump(rfc, f)