Example #1
import sklearn_pandas
from sklearn import pipeline, preprocessing
from sklearn.ensemble import RandomForestClassifier

def train_model(train_vector):
    mapper = sklearn_pandas.DataFrameMapper([
            ('op_type',preprocessing.LabelEncoder()),
            (['is_vip', 'reg_life',
              'pre7day_uid_ipcount', 'pre7day_uid_jobcount', 'pre7day_uid_citycount', 'pre7day_uid_infocount',
              'pre7day_uid_sys_delete_count', 'pre7day_uid_man_delete_count',
              'pre7day_uid_sys_backmodify_count', 'pre7day_uid_man_backmodify_count',
              'license_enterpriseid_count',
              'pre7day_ip_uidcount', 'pre7day_ip_jobcount', 'pre7day_ip_citycount', 'pre7day_ip_infocount',
              'pre7day_ip_sys_delete_count', 'pre7day_ip_man_delete_count',
              'pre7day_ip_sys_backmodify_count', 'pre7day_ip_man_backmodify_count',
              'pre7day_userip_reguid_count', 'pre7day_userip_login_count',
              'pre7day_phone_uidcount', 'pre7day_phone_jobcount', 'pre7day_phone_citycount', 'pre7day_phone_infocount',
              'pre24hour_ip_uid_count', 'pre24hour_ip_job_count', 'pre24hour_ip_city_count', 'pre24hour_ip_info_count',
              'pre24hour_uid_ip_count', 'pre24hour_uid_job_count', 'pre24hour_uid_city_count', 'pre24hour_uid_info_count',
              'pre24hour_phone_uid_count', 'pre24hour_phone_job_count', 'pre24hour_phone_city_count', 'pre24hour_phone_info_count',
              'pre24hour_ip_audit_pass_info_count', 'pre24hour_ip_audit_nopass_info_count', 'pre24hour_ip_audit_shuazuan_info_count',
              'pre24hour_uid_audit_pass_info_count', 'pre24hour_uid_audit_nopass_info_count', 'pre24hour_uid_audit_shuazuan_info_count',
              'pre24hour_phone_audit_pass_info_count', 'pre24hour_phone_audit_nopass_info_count', 'pre24hour_phone_audit_shuazuan_info_count',
              'pre24hour_ip_sys_delete_count', 'pre24hour_ip_sys_backmodify_count',
              'pre24hour_uid_sys_delete_count', 'pre24hour_uid_sys_backmodify_count',
              'pre1hour_ip_uid_count', 'pre1hour_ip_job_count', 'pre1hour_ip_city_count', 'pre1hour_ip_info_count',
              'pre1hour_uid_ip_count', 'pre1hour_uid_job_count', 'pre1hour_uid_city_count', 'pre1hour_uid_info_count',
              'pre1hour_phone_uid_count', 'pre1hour_phone_job_count', 'pre1hour_phone_city_count', 'pre1hour_phone_info_count',
              'pre1hour_title_uid_count', 'pre1hour_title_job_count', 'pre1hour_title_city_count', 'pre1hour_title_info_count', 'pre1hour_title_ip_count',
              'pre5min_ip_uid_count', 'pre5min_ip_job_count', 'pre5min_ip_city_count', 'pre5min_ip_info_count',
              'pre5min_uid_ip_count', 'pre5min_uid_job_count', 'pre5min_uid_city_count', 'pre5min_uid_info_count',
              'pre5min_phone_uid_count', 'pre5min_phone_job_count', 'pre5min_phone_city_count', 'pre5min_phone_info_count',
              'pre1min_ip_uid_count', 'pre1min_ip_job_count', 'pre1min_ip_city_count', 'pre1min_ip_info_count',
              'pre1min_uid_ip_count', 'pre1min_uid_job_count', 'pre1min_uid_city_count', 'pre1min_uid_info_count',
              'pre1min_phone_uid_count', 'pre1min_phone_job_count', 'pre1min_phone_city_count', 'pre1min_phone_info_count'], None),
            (['license'],[preprocessing.Imputer(strategy='most_frequent'),preprocessing.LabelEncoder()]),
            (['xingzhi'],[preprocessing.Imputer(strategy='most_frequent'),preprocessing.LabelEncoder()]),
            (['xinzi'],[preprocessing.Imputer(strategy='most_frequent'),preprocessing.LabelEncoder()]),    
            (['xueli'],[preprocessing.Imputer(strategy='most_frequent'),preprocessing.LabelEncoder()]),
            (['thirdcertificate'],[preprocessing.Imputer(strategy='most_frequent'),preprocessing.LabelEncoder()]),
            (['zhaopinrenshu'],preprocessing.Imputer(strategy='most_frequent')),
            (['gongzuonianxian'],[preprocessing.Imputer(strategy='most_frequent'),preprocessing.LabelEncoder()]),
            (['fulidaiyu_wuxian', 'fulidaiyu_canbu', 'fulidaiyu_huabu', 'fulidaiyu_fangbu', 'fulidaiyu_jiaotongbu',
              'fulidaiyu_zhoumoshuangxiu', 'fulidaiyu_jiabanbu', 'fulidaiyu_baozhu', 'fulidaiyu_niandishuangxin',
              'fulidaiyu_baochi', 'fulidaiyu_oversum', 'user_define_fulidaiyu_oversum',
              'title_punctuationcount', 'title_rarecharcount', 'title_iscontainabnormalnumber',
              'content_punctuationcount', 'content_suspectmaxlength', 'content_conpuncmaxsize',
              'content_suspectzerocount', 'content_transitsum', 'content_transitrate',
              'content_punctuationrate', 'content_rarecharcount',
              'fuli_suspectmaxlength', 'fuli_suspectzerocount', 'fuli_rarecharcount',
              'fuli_iscontainkeyword', 'fuli_iscontainabnormalnumber',
              'enterprisereg_name_rarecharcount', 'enterprisereg_address_rarecharcount',
              'enterprisereg_address_iscontain_local'], None),
            ('target',None)
    ])
    train = mapper.fit_transform(train_vector)
    pipeline_estimator = pipeline.Pipeline([
        ('estimator', RandomForestClassifier(n_estimators=300, max_depth=10, min_samples_split=2, n_jobs=3, random_state=0))
    ])
    #pipeline_params = dict(
        #selector__k=[90, 100, 110],
        #estimator__n_estimators=[200, 250, 300])
    #grid_search = model_selection.GridSearchCV(pipeline_estimator, param_grid=pipeline_params, n_jobs=3)
    #grid_search.fit(train[:,:-1],train[:,-1])
    #best_estimator = grid_search.best_estimator_
    pipeline_estimator.fit(train[:,:-1],train[:,-1])
    return pipeline_estimator,mapper
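The commented-out grid search above references a selector__k parameter, but the pipeline contains no 'selector' step, so that grid would raise an error. A minimal working sketch over the forest size alone (inside train_model, before the final fit) might look like this:

    from sklearn import model_selection

    pipeline_params = {'estimator__n_estimators': [200, 250, 300]}
    grid_search = model_selection.GridSearchCV(pipeline_estimator, param_grid=pipeline_params, n_jobs=3)
    grid_search.fit(train[:, :-1], train[:, -1])
    best_estimator = grid_search.best_estimator_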
Example #2
    def load_testset(self, shuffle):

        print("<==== ====", inspect.stack()[0][3], "==== ====>")

        df = pd.read_csv("competitionset.csv")

        continuous_features = ["1", "2", "6", "8", "10"]
        categorical_features = [
            "3", "4", "5", "7", "9", "11", "12", "13", "14", "15", "16", "17",
            "18"
        ]

        col_list = list(df.columns)
        col_list.remove('rowIndex')
        col_list.insert(0, 'rowIndex')
        df = df[col_list]

        df = pd.get_dummies(
            df,
            columns=["feature" + n for n in categorical_features],
            dtype=np.int64)

        transform_mapper = sklearn_pandas.DataFrameMapper(
            [
                ('rowIndex', None),
            ],
            default=sklearn.preprocessing.StandardScaler())
        standardized = transform_mapper.fit_transform(df.copy())
        df = pd.DataFrame(standardized, columns=df.columns)

        print("0. Prepare the Final Data Sets (Regression)")
        self.TESTSET_X = df.drop(['rowIndex'], axis=1)
Example #3
    def _gen_mapper(self):
        '''create a list of tuples for the DataFrameMapper for the moving window.'''
        window_mappings = [(self.target, None)]  # keep the target untouched
        for k in self.mappings.keys():
            for i in reversed(range(self.obs_window)):
                name = "{}_{}".format(k, i)
                window_mappings.append(([name], self.mappings[k]))

        self._mapper = sklearn_pandas.DataFrameMapper(window_mappings)
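For context, here is a minimal standalone sketch of the naming scheme _gen_mapper produces, assuming a hypothetical mappings dict of {'temp': StandardScaler()}, obs_window=3, and target 'y':

import sklearn_pandas
from sklearn.preprocessing import StandardScaler

mappings = {'temp': StandardScaler()}   # hypothetical feature -> transformer
window_mappings = [('y', None)]         # target passes through untouched
for k in mappings:
    for i in reversed(range(3)):        # obs_window = 3
        window_mappings.append((["{}_{}".format(k, i)], mappings[k]))
# columns temp_2, temp_1, temp_0 are each scaled; 'y' is kept as-is
mapper = sklearn_pandas.DataFrameMapper(window_mappings)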
Example #4
def extract_personal_history(personal_history_frame):
    personal_history_mapper = sklearn_pandas.DataFrameMapper([
        (['N103_a_Smoking (Present or Past)', 'N107_Tobacco Chewing',
          'N108_Alcohol Intake'],
         [sklearn.preprocessing.FunctionTransformer(impute2,validate=False), # Impute no
                                sklearn.preprocessing.FunctionTransformer(np.negative),
                                sklearn.preprocessing.Binarizer(threshold = -1.5)])]) # Flip order so 0 is no, 1 is yes
    # TODO include type of smoking, start date, stop date, number per day

    x = personal_history_mapper.fit_transform(personal_history_frame.copy())
    feature_names = ['has_smoked','has_chewed_tobacco','has_drunk_alcohol']
    x = pd.DataFrame(data=x,index = personal_history_frame.index,columns = feature_names)
    return x
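The negate-then-binarize chain is worth unpacking. Assuming the source columns code 1 = yes and 2 = no (an assumption; impute2 is defined elsewhere in the project), negation turns these into -1 and -2, and Binarizer(threshold=-1.5) maps values above -1.5 to 1, so yes becomes 1 and no becomes 0:

import numpy as np
from sklearn.preprocessing import Binarizer

codes = np.array([[1.0], [2.0], [1.0]])   # assumed coding: 1 = yes, 2 = no
negated = np.negative(codes)              # [-1, -2, -1]
flags = Binarizer(threshold=-1.5).fit_transform(negated)
print(flags.ravel())                      # [1. 0. 1.]: 1 = yes, 0 = no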
Example #5
def extract_current_symptoms(symptom_frame):
    
    # TODO Extract information other than just presence or absence of symptoms
    breathlessness_mapper = sklearn_pandas.DataFrameMapper([
        (['N18_Breathlessness'], [sklearn.preprocessing.FunctionTransformer(impute2,validate=False), # Impute no
                                sklearn.preprocessing.FunctionTransformer(np.negative),
                                sklearn.preprocessing.Binarizer(threshold = -1.5)])]) # Flip order so 0 is no, 1 is yes
    cough_mapper = sklearn_pandas.DataFrameMapper([
        (['N30_Cough'], [sklearn.preprocessing.FunctionTransformer(impute2,validate=False), # Impute no
                                sklearn.preprocessing.FunctionTransformer(np.negative),
                                sklearn.preprocessing.Binarizer(threshold = -1.5)])]) # Flip order so 0 is no, 1 is yes
    chest_pain_mapper = sklearn_pandas.DataFrameMapper([
        (['N50_Chest Pain'], [sklearn.preprocessing.FunctionTransformer(impute2,validate=False), # Impute no
                                sklearn.preprocessing.FunctionTransformer(np.negative),
                                sklearn.preprocessing.Binarizer(threshold = -1.5)])]) # Flip order so 0 is no, 1 is yes
    fever_mapper = sklearn_pandas.DataFrameMapper([
        (['N55_Fever'], [sklearn.preprocessing.FunctionTransformer(impute2,validate=False), # Impute no
                                sklearn.preprocessing.FunctionTransformer(np.negative),
                                sklearn.preprocessing.Binarizer(threshold = -1.5)])]) # Flip order so 0 is no, 1 is yes
    nasal_mapper = sklearn_pandas.DataFrameMapper([
        (['N64_Nasal Symptoms'], [sklearn.preprocessing.FunctionTransformer(impute2,validate=False), # Impute no
                                sklearn.preprocessing.FunctionTransformer(np.negative),
                                sklearn.preprocessing.Binarizer(threshold = -1.5)])]) # Flip order so 0 is no, 1 is yes

    x_breathlessness = breathlessness_mapper.fit_transform(symptom_frame.copy())
    x_cough = cough_mapper.fit_transform(symptom_frame.copy())
    x_chest_pain = chest_pain_mapper.fit_transform(symptom_frame.copy())
    x_fever = fever_mapper.fit_transform(symptom_frame.copy())
    x_nasal = nasal_mapper.fit_transform(symptom_frame.copy())


    x = pd.DataFrame(index=symptom_frame.index)
    x['has_breathlessness'] = x_breathlessness
    x['has_cough'] = x_cough
    x['has_chest_pain'] = x_chest_pain
    x['has_fever'] = x_fever
    x['has_nasal_symptoms'] = x_nasal
    return x
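Since the five mappers in extract_current_symptoms are identical apart from the column, a single DataFrameMapper with one tuple per symptom would do the same work in one fit_transform pass. A hedged sketch, reusing the project's impute2:

import numpy as np
import sklearn.preprocessing
import sklearn_pandas

symptom_cols = ['N18_Breathlessness', 'N30_Cough', 'N50_Chest Pain', 'N55_Fever', 'N64_Nasal Symptoms']
symptom_mapper = sklearn_pandas.DataFrameMapper([
    ([col], [sklearn.preprocessing.FunctionTransformer(impute2, validate=False),
             sklearn.preprocessing.FunctionTransformer(np.negative),
             sklearn.preprocessing.Binarizer(threshold=-1.5)])
    for col in symptom_cols])
# one call instead of five; output columns come back in symptom_cols order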
Example #6
def extract_risk_factors(risk_factor_frame):
    risk_factor_mapper = sklearn_pandas.DataFrameMapper([
        (['N83_Family history of COPD', 'N84_Family hostory of allergies',
          'N86_Personal History of allergies?','N87_Indoor cooking using Biomass?'],
         [sklearn.preprocessing.FunctionTransformer(impute2,validate=False), # Impute no
                                sklearn.preprocessing.FunctionTransformer(np.negative),
                                sklearn.preprocessing.Binarizer(threshold = -1.5)])]) # Flip order so 0 is no, 1 is yes
    # TODO include type of allergies, number of hours per day cooking, years cooking
    
    x_risk_factor = risk_factor_mapper.fit_transform(risk_factor_frame.copy())
    feature_names = ['has_copd_family_history','has_allergy_family_history','has_allergy_personal_history',
                     'has_biomass_cooking_history']
    x = pd.DataFrame(data=x_risk_factor,index = risk_factor_frame.index,columns = feature_names)
    return x
Example #7
import pandas
import sklearn.neural_network as nn
import sklearn_pandas
from mnist import MNIST  # python-mnist package
from sklearn.preprocessing import StandardScaler
from sklearn2pmml import sklearn2pmml

def main():
    data = MNIST('./data')
    col_names = ["x" + str(x) for x in range(784)]
    # Define a transform function that will be serialized with the model
    mnist_mapper = sklearn_pandas.DataFrameMapper([(col_names,
                                                    StandardScaler()),
                                                   ("digit", None)])

    # 60,000 train samples of 28x28 grid, domain 0-255
    mnist_train_data, mnist_train_label = data.load_training()
    mnist_train_df = pandas.concat(
        (pandas.DataFrame(mnist_train_data, columns=col_names),
         pandas.DataFrame(list(mnist_train_label), columns=["digit"])),
        axis=1)
    mnist_train_df_norm = mnist_mapper.fit_transform(mnist_train_df)

    mlp_config = {
        'hidden_layer_sizes': (1000, ),
        'activation': 'tanh',
        'solver': 'adam',  # renamed from 'algorithm' in scikit-learn 0.18+
        'max_iter': 20,
        'early_stopping': True,
        'validation_fraction': 0.1,
        'verbose': True
    }
    mnist_classifier = nn.MLPClassifier(**mlp_config)
    mnist_classifier.fit(X=mnist_train_df_norm[:, 0:28 * 28],
                         y=mnist_train_df_norm[:, 28 * 28])

    # 10,000 test samples
    mnist_test_data, mnist_test_label = data.load_testing()
    mnist_test_df = pandas.concat(
        (pandas.DataFrame(mnist_test_data, columns=col_names),
         pandas.DataFrame(list(mnist_test_label), columns=["digit"])),
        axis=1)
    mnist_test_df_norm = mnist_mapper.transform(mnist_test_df)  # reuse the scaler fitted on train; refitting here would leak

    prediction = mnist_classifier.predict_proba(mnist_test_df_norm[:,
                                                                   0:28 * 28])
    truth_array = [
        prediction[idx].argmax() == mnist_test_label[idx]
        for idx in range(len(prediction))
    ]
    accuracy = float(sum(truth_array)) / float(len(truth_array))
    print "out of sample model accuracy [%s]" % accuracy
    print "serializing to pmml"
    sklearn2pmml(mnist_classifier,
                 mnist_mapper,
                 "MLP_MNIST.pmml",
                 with_repr=True)
Example #8
    def load_trainingset(self, shuffle):
        print("<==== ====", inspect.stack()[0][3], "==== ====>")

        df = pd.read_csv("trainingset.csv")

        continuous_features = ["1", "2", "6", "8", "10"]
        categorical_features = ["3", "4", "5", "7", "9", "11", "12",
                                "13", "14", "15", "16", "17", "18"]

        df['Claimed'] = np.where(df['ClaimAmount'] > 0, 1, 0)

        col_list = list(df.columns)
        col_list.remove('rowIndex')
        col_list.remove('Claimed')
        col_list.remove('ClaimAmount')
        col_list.insert(0, 'ClaimAmount')
        col_list.insert(0, 'Claimed')
        col_list.insert(0, 'rowIndex')
        df = df[col_list]

        df = pd.get_dummies(
            df, columns=["feature" + n for n in categorical_features],
            dtype=np.int64
        )

        transform_mapper = sklearn_pandas.DataFrameMapper([
            ('rowIndex', None),
            ('Claimed', None),
            ('ClaimAmount', None),
        ], default=sklearn.preprocessing.StandardScaler())
        standardized = transform_mapper.fit_transform(df.copy())
        df = pd.DataFrame(standardized, columns=df.columns)

        print("0. Prepare the Final Data Sets (Classification)")
        self.c_X = df.drop(['rowIndex', 'Claimed', 'ClaimAmount'], axis=1)
        self.c_Y = df.Claimed
        self.c_x_train, self.c_x_test, self.c_y_train, self.c_y_test = sklearn.model_selection\
            .train_test_split(self.c_X, self.c_Y, test_size=0.30, shuffle=shuffle)

        print("0. Prepare the Final Data Sets (Regression)")
        self.r_X = df.drop(['rowIndex', 'Claimed', 'ClaimAmount'], axis=1)
        self.r_Y = df.ClaimAmount
        self.r_x_train, self.r_x_test, self.r_y_train, self.r_y_test = sklearn.model_selection\
            .train_test_split(self.r_X, self.r_Y, test_size=0.30, shuffle=shuffle)
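A side note on the transform_mapper pattern used here and in the neighboring examples: columns listed explicitly with None pass through untouched, while default=StandardScaler() is applied to every remaining column, and the mapper emits the listed columns first. That is why the col_list reordering above moves rowIndex/Claimed/ClaimAmount to the front before columns=df.columns is reused. A minimal sketch:

import pandas as pd
import sklearn.preprocessing
import sklearn_pandas

toy = pd.DataFrame({'rowIndex': [0, 1, 2], 'x': [1.0, 2.0, 3.0]})
m = sklearn_pandas.DataFrameMapper(
    [('rowIndex', None)],
    default=sklearn.preprocessing.StandardScaler())
print(m.fit_transform(toy))  # rowIndex passes through first; x is standardized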
Example #9
    def load_trainingset(self, shuffle, PARAMETER):
        self.log(
            "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@:" +
            str(PARAMETER))
        print("<==== ====", inspect.stack()[0][3], "==== ====>")

        df = pd.read_csv("trainingset.csv")

        continuous_features = ["1", "2", "6", "8", "10"]
        categorical_features = [
            "3", "4", "5", "7", "9", "11", "12", "13", "14", "15", "16", "17",
            "18"
        ]

        df['Claimed'] = np.where(df['ClaimAmount'] > 0, 1, 0)
        df['Outlier'] = np.where(df['ClaimAmount'] > PARAMETER, 1, 0)

        col_list = list(df.columns)
        col_list.remove('rowIndex')
        col_list.remove('Claimed')
        col_list.remove('ClaimAmount')
        col_list.remove('Outlier')
        col_list.insert(0, 'Outlier')
        col_list.insert(0, 'ClaimAmount')
        col_list.insert(0, 'Claimed')
        col_list.insert(0, 'rowIndex')
        df = df[col_list]

        df = pd.get_dummies(
            df,
            columns=["feature" + n for n in categorical_features],
            dtype=np.int64)

        transform_mapper = sklearn_pandas.DataFrameMapper(
            [
                ('rowIndex', None),
                ('Claimed', None),
                ('ClaimAmount', None),
                ('Outlier', None),
            ],
            default=sklearn.preprocessing.StandardScaler())
        standardized = transform_mapper.fit_transform(df.copy())
        df = pd.DataFrame(standardized, columns=df.columns)

        print("0. Prepare the Final Data Sets (Classification)")
        self.c_X = df.drop(['rowIndex', 'Claimed', 'ClaimAmount', 'Outlier'],
                           axis=1)
        self.c_Y = df.Claimed

        # <Polynomial Features>
        # poly = sklearn.preprocessing.PolynomialFeatures(2, include_bias=True)
        # self.c_X = poly.fit_transform(self.c_X)

        # <Power Transformer>
        # power = sklearn.preprocessing.PowerTransformer()
        # power.fit(self.c_X)
        # self.c_X = power.transform(self.c_X)

        # <Quantile Transform>
        # self.c_X = sklearn.preprocessing.quantile_transform(self.c_X, axis=0, n_quantiles=1000,
        #         output_distribution='normal', ignore_implicit_zeros=False,
        #         subsample=100000, random_state=None, copy=False)

        self.c_x_train, self.c_x_test, self.c_y_train, self.c_y_test = sklearn.model_selection\
            .train_test_split(self.c_X, self.c_Y, test_size=0.30, shuffle=shuffle)

        # self.c_x_train = self.c_x_train.values
        # self.c_x_test = self.c_x_test.values
        # self.c_y_train = self.c_y_train.values
        # self.c_y_test = self.c_y_test.values
        #
        # print("0. SMOTE")
        # self.c_x_train_SMOTE, self.c_y_train_SMOTE = SMOTE().fit_resample(self.c_x_train, self.c_y_train)
        #
        # print("0. ADASYN")
        # self.c_x_train_ADASYN, self.c_y_train_ADASYN = ADASYN().fit_resample(self.c_x_train, self.c_y_train)

        print("0. Prepare the Final Data Sets (Regression)")
        self.r_X = df.drop(['rowIndex', 'Claimed', 'ClaimAmount', 'Outlier'],
                           axis=1)
        self.r_Y = df.ClaimAmount
        self.r_x_train, self.r_x_test, self.r_y_train, self.r_y_test = sklearn.model_selection\
            .train_test_split(self.r_X, self.r_Y, test_size=0.30, shuffle=shuffle)

        print("0. Prepare the Final Data Sets (Outlier)")
        self.o_X = df.drop(['rowIndex', 'Claimed', 'ClaimAmount', 'Outlier'],
                           axis=1)
        self.o_Y = df.Outlier
        self.o_x_train, self.o_x_test, self.o_y_train, self.o_y_test = sklearn.model_selection\
            .train_test_split(self.o_X, self.o_Y, test_size=0.30, shuffle=shuffle)

        print("0. Aggressive Regression")
        df_aggressive_regression = df[:int(0.7 * df.shape[0])]
        df_aggressive_regression = df_aggressive_regression[
            df_aggressive_regression['ClaimAmount'] > 0]

        print(df_aggressive_regression.shape)
        OUTLIER_CUTOFF = 4647
        df_aggressive_regression = df_aggressive_regression[
            df_aggressive_regression['ClaimAmount'] < OUTLIER_CUTOFF]

        self.r_x_train_aggressive = df_aggressive_regression.drop(
            ['rowIndex', 'Claimed', 'ClaimAmount'], axis=1)
        self.r_y_train_aggressive = df_aggressive_regression.ClaimAmount
Example #10
import pandas
import sklearn.feature_extraction.text
import sklearn.linear_model
import sklearn_pandas
import sklearn2pmml

# `docs` is a search-result set loaded elsewhere (not shown)
data = []
targets = []
for doc in docs:
    doc_data = doc['fields']['analyzed_text'][0]
    if len(doc_data) > 0:
        data.append(' '.join(doc_data))  # join the analyzed tokens back into one string
        if doc['fields']['label'][0] == 'negative':
            targets.append(0)
        else:
            targets.append(1)
    else:
        print('found empty doc')

print(len(data))
mapper4 = sklearn_pandas.DataFrameMapper([
    ('text', sklearn.feature_extraction.text.CountVectorizer()),
],
                                         sparse=True)
dataframe = pandas.DataFrame({'text': data})
matrix = mapper4.fit_transform(dataframe)
clf = sklearn.linear_model.LogisticRegression()
clf.fit(matrix, targets)
print(matrix[2:3])
print(clf.predict(matrix[2:3]))
print(targets[2])
# this does not work because of https://github.com/jpmml/jpmml-sklearn/issues/4
sklearn2pmml.sklearn2pmml(clf, mapper4, "naive_bayes.pmml", with_repr=True)
Example #11
    "c4": num,
    "s5": cat,
    "c5": num,
    "hand": cat
}
d_in = pd.read_csv("..\\data\\poker-hand-training-true.data",
                   names=col_names,
                   dtype=col_types)

# Features can't be processed in parallel by different transformers within a single
# DataFrameMapper object, hence the two separate mappers below.
# pipeline1 also passes 'y' through (it comes back as the 3rd column).
engineered_feature_pipeline1 = skp.DataFrameMapper(
    [(['s1', 's2', 's3', 's4', 's5'], uf.Comparator(criteria=5), {
        'alias': 'suit_match'
    }),
     (['c1', 'c2', 'c3', 'c4', 'c5'], uf.Comparator(criteria=2), {
         'alias': 'no_pairs'
     })],
    input_df=True,
    df_out=True,
    default=None)

#temp = d_in[d_in['hand']=='5']
#engineered_feature_pipeline1.fit_transform(temp).head()

engineered_feature_pipeline2 = skp.DataFrameMapper(
    [(['c1', 'c2', 'c3', 'c4', 'c5'], uf.Comparator(criteria=3), {
        'alias': 'has_triplet'
    })],
    input_df=True,
    df_out=True,
    default=False)
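Because of that one-transformer-per-pass limitation, the two mappers have to run separately; one hedged way to combine their outputs (both return DataFrames thanks to df_out=True):

features = engineered_feature_pipeline1.fit_transform(d_in)   # suit_match, no_pairs + passthrough columns
features = features.join(engineered_feature_pipeline2.fit_transform(d_in)[['has_triplet']])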
Example #12
def make_mapper_from_transformations(transformations):
    return sklearn_pandas.DataFrameMapper(
        [t.as_input_transformer_tuple() for t in transformations],
        input_df=True)
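A hedged usage sketch, assuming each transformation object follows the as_input_transformer_tuple() contract above (ScaleColumn here is a hypothetical stand-in):

import sklearn.preprocessing
import sklearn_pandas

class ScaleColumn:
    """Hypothetical transformation: standard-scale a single column."""
    def __init__(self, col):
        self.col = col

    def as_input_transformer_tuple(self):
        return ([self.col], sklearn.preprocessing.StandardScaler())

mapper = make_mapper_from_transformations([ScaleColumn('age'), ScaleColumn('income')])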
Example #13
    def load_trainingset(self, shuffle):
        print("<==== ====", inspect.stack()[0][3], "==== ====>")

        df = pd.read_csv("trainingset.csv")

        continuous_features = ["1", "2", "6", "8", "10"]
        categorical_features = [
            "3", "4", "5", "7", "9", "11", "12", "13", "14", "15", "16", "17",
            "18"
        ]

        df['Claimed'] = np.where(df['ClaimAmount'] > 0, 1, 0)

        col_list = list(df.columns)
        col_list.remove('rowIndex')
        col_list.remove('Claimed')
        col_list.remove('ClaimAmount')
        col_list.insert(0, 'ClaimAmount')
        col_list.insert(0, 'Claimed')
        col_list.insert(0, 'rowIndex')
        df = df[col_list]

        df = pd.get_dummies(
            df,
            columns=["feature" + n for n in categorical_features],
            dtype=np.int64)

        transform_mapper = sklearn_pandas.DataFrameMapper(
            [
                ('rowIndex', None),
                ('Claimed', None),
                ('ClaimAmount', None),
            ],
            default=sklearn.preprocessing.StandardScaler())
        standardized = transform_mapper.fit_transform(df.copy())
        df = pd.DataFrame(standardized, columns=df.columns)

        print("0. Prepare the Final Data Sets (Classification)")
        self.c_X = df.drop(['rowIndex', 'Claimed', 'ClaimAmount'], axis=1)
        self.c_Y = df.Claimed

        # <Polynomial Features>
        # poly = sklearn.preprocessing.PolynomialFeatures(2, include_bias=True)
        # self.c_X = poly.fit_transform(self.c_X)

        # <Power Transformer>
        # power = sklearn.preprocessing.PowerTransformer()
        # power.fit(self.c_X)
        # self.c_X = power.transform(self.c_X)

        # <Quantile Transform>
        # self.c_X = sklearn.preprocessing.quantile_transform(self.c_X, axis=0, n_quantiles=1000,
        #         output_distribution='uniform', ignore_implicit_zeros=False,
        #         subsample=100000, random_state=None, copy=False)

        # <PCA>
        # pca = sklearn.decomposition.PCA(n_components=9, copy=True, whiten=False,
        #                                 svd_solver='auto', tol=0.0, iterated_power='auto',
        #                                 random_state=None)
        # pca = sklearn.decomposition.TruncatedSVD(n_components=300, algorithm='randomized',
        #                                          n_iter=100, random_state=None, tol=0.0)
        # pca.fit(self.c_X)
        # self.c_X = pca.transform(self.c_X)

        self.c_x_train, self.c_x_test, self.c_y_train, self.c_y_test = sklearn.model_selection\
            .train_test_split(self.c_X, self.c_Y, test_size=0.30, shuffle=shuffle)

        print("0. Prepare the Final Data Sets (Regression)")
        self.r_X = df.drop(['rowIndex', 'Claimed', 'ClaimAmount'], axis=1)
        self.r_Y = df.ClaimAmount
        self.r_x_train, self.r_x_test, self.r_y_train, self.r_y_test = sklearn.model_selection\
            .train_test_split(self.r_X, self.r_Y, test_size=0.30, shuffle=shuffle)

        print("0. Aggressive Regression")
        df_aggressive_regression = df[:int(0.7 * df.shape[0])]
        df_aggressive_regression = df_aggressive_regression[
            df_aggressive_regression['ClaimAmount'] > 0]

        print(df_aggressive_regression.shape)
        OUTLIER_CUTOFF = 4647
        df_aggressive_regression = df_aggressive_regression[
            df_aggressive_regression['ClaimAmount'] < OUTLIER_CUTOFF]

        self.r_x_train_aggressive = df_aggressive_regression.drop(
            ['rowIndex', 'Claimed', 'ClaimAmount'], axis=1)
        self.r_y_train_aggressive = df_aggressive_regression.ClaimAmount

        print(df_aggressive_regression.shape)
Example #14
    def step(self):
        import pandas as pd
        import sklearn.model_selection as model_selection
        import sklearn as skl
        import sklearn_pandas as skp
        import hpogrid
        import glob

        if self.run_mode == "grid":
            data_directory = hpogrid.get_datadir()
            print("inDS dir: {}".format(data_directory))
            data_files = glob.glob(data_directory + "*.csv")
            print("--> inDS file: {}".format(data_files))
            for f in data_files:
                all_data = pd.read_csv(f)
        elif self.run_mode == "local":
            all_data = pd.read_csv(
                '/afs/cern.ch/work/s/ssevova/public/dark-photon-atlas/plotting/trees/v08/tight-and-ph-skim/mc16d/dataLists/all_data'
            )

        # Remove negative weights for training
        all_data = all_data[all_data['w'] > 0]

        # Load the data & split by train/test
        X = all_data
        y = all_data['event']
        X_train, X_test, y_train, y_test = model_selection.train_test_split(
            X, y, test_size=0.3, random_state=0)
        X_train = X_train[self.varw]
        X_test = X_test[self.varw]
        cols = X_train.columns
        itrain = X_train.index
        itest = X_test.index
        wtest_unscaled = X_test['w']

        # Scaling
        mapper = skp.DataFrameMapper([(cols,
                                       skl.preprocessing.StandardScaler())])
        scaled_train = mapper.fit_transform(X_train.copy())
        scaled_test = mapper.transform(X_test.copy())  # reuse the scaler fitted on train; refitting would leak
        X_scaled_train = pd.DataFrame(scaled_train, index=itrain, columns=cols)
        X_scaled_test = pd.DataFrame(scaled_test, index=itest, columns=cols)
        wtest = X_scaled_test['w']
        wtrain = X_scaled_train['w']

        # Deal with weights
        self.varw.remove("w")
        X_train = X_scaled_train[self.varw]
        X_test = X_scaled_test[self.varw]

        history = self.model.fit(X_train,
                                 y_train,
                                 epochs=self.epochs,
                                 batch_size=self.batchsize,
                                 validation_data=(X_test, y_test))

        train_loss, train_acc = self.model.evaluate(X_train,
                                                    y_train,
                                                    verbose=2)
        test_loss, test_acc = self.model.evaluate(X_test, y_test, verbose=2)

        probs = self.model.predict(X_test)
        predictions = self.model.predict_classes(X_test)
        fpr, tpr, threshold = skl.metrics.roc_curve(
            y_test, probs, sample_weight=wtest_unscaled)
        auc = skl.metrics.auc(fpr, tpr)

        # It is important to return tf.Tensors as numpy objects.
        return {
            "epoch": self.iteration,
            "loss": train_loss,
            "accuracy": train_acc,
            "auc": auc,
            "test_loss": test_loss,
            "test_accuracy": test_acc
        }
Example #15
# -*- coding: utf-8 -*-

import pandas as pd
import sklearn_pandas
import sklearn2pmml
from sklearn import preprocessing
from sklearn import svm
from sklearn import pipeline

xxx = []
data = pd.read_csv("./01_16_ip_1_feature.txt")
mapper = sklearn_pandas.DataFrameMapper([([i], preprocessing.StandardScaler())
                                         for i in data.columns])
train = mapper.fit_transform(data)

pipeline_estimator = pipeline.Pipeline([('estimator',
                                         svm.OneClassSVM(nu=0.015,
                                                         kernel="rbf",
                                                         gamma=0.04))])
pipeline_estimator.fit(train)
data1 = pd.read_csv("./17_ip_1_feature.txt")
train1 = mapper.transform(data1)
pred_test = pipeline_estimator.predict(train1)
for i in range(len(pred_test)):
    if pred_test[i] != 1:
        xxx.append(i)

#xxx = pred_test[pred_test == -1]

#sklearn2pmml.sklearn2pmml(pipeline_estimator,mapper,"./track_model.pmml", with_repr=True,debug=True)
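As the commented line hints, the index-collecting loop can be vectorized; OneClassSVM labels inliers +1 and outliers -1, so np.where yields the outlier row indices directly:

import numpy as np
outlier_idx = np.where(pred_test == -1)[0].tolist()   # same indices as the loop above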
Example #16
        for i in range(X.shape[0]):
            x = url_regex.sub(r'\g<1>\g<3>', X[i])
            x = media_regex.sub(r'\g<1>\g<3>', x)
            result[i] = x
        return result


if __name__ == '__main__':
    df = pd.DataFrame({'bio': ["""
i am just %URL% copy pasting my answer to an already answered question
Shubham %URL% Bhardwaj's answer to What life lessons are counter-intuitive or go against common sense or wisdom?
%URL%
%MEDIA%
ACTIONS %MEDIA% LIE LOUDER THAN WORDS:
I have grown up listening to “ACTIONS SPEAK LOUDER THAN WORDS” whole my life. 

""", """
i am just copy pasting my answer to an already answered question
%MEDIA%
%MEDIA%
        """, ""]})

    extractor = UrlAndMediaTextExtractor()
    stripper = UrlAndMediaTextStripper()
    mapper = sklearn_pandas.DataFrameMapper([
        ('bio', extractor),
        ('bio', stripper, {'alias': 'bio_stripped'})
    ])
    print(mapper.fit_transform(df))  # the mapper must be fitted before it can transform
    print(mapper.transformed_names_)