Example #1
        data.append(items)

data = np.array(data)

# Convert string data to numerical data
# Encode non-numeric features into numeric form.
label_encoder = []
X_encoded = np.empty(
    data.shape
)  # => build an array with the same shape as data; try print(data.shape) => probably (time, whether a game is on)?

for i, item in enumerate(data[0]):  # => enumerate yields the index in i and the value in item
    if item.isdigit():
        X_encoded[:, i] = data[:, i]
    else:
        label_encoder.append(preprocessing.LabelEncoder())
        X_encoded[:, i] = label_encoder[-1].fit_transform(data[:, i])

X = X_encoded[:, :-1].astype(int)  # features used as input
y = X_encoded[:, -1].astype(int)  # number of motorcycles

# Split data into training and testing datasets
# (sklearn.cross_validation has been removed; use sklearn.model_selection instead)
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.25, random_state=5)

# Extremely Random Forests regressor
params = {'n_estimators': 100, 'max_depth': 4, 'random_state': 0}
regressor = ExtraTreesRegressor(**params)
regressor.fit(X_train, y_train)

# Compute the regressor performance on test data
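# The snippet ends before the evaluation itself; a minimal sketch of how the test
# performance could be computed (assuming sklearn.metrics is available):
from sklearn.metrics import mean_absolute_error, explained_variance_score

y_pred = regressor.predict(X_test)
print("Mean absolute error:", round(mean_absolute_error(y_test, y_pred), 2))
print("Explained variance score:", round(explained_variance_score(y_test, y_pred), 2))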
Example #2
           Y_data,
           eval_set=[(X_test, y_test)],
           eval_metric='l1',
           early_stopping_rounds=1511)
y_lgbmx1 = lgbmx1.predict(X_test)
trainrms = sqrt(mean_squared_error(y_test, y_lgbmx1))
print("lgbmreg trainrms {}".format(trainrms))

# In[75]:
from sklearn import utils

X_Cdata, X_Ctest, Y_Cdata, Y_Ctest = train_test_split(data,
                                                      Y,
                                                      test_size=0.20,
                                                      random_state=42)
lab_enc = preprocessing.LabelEncoder()

lab_enc.fit(Y)
Y_FULL_encoded = lab_enc.transform(Y)
Y_data_encoded = lab_enc.transform(Y_Cdata)
Y_test_encoded = lab_enc.transform(Y_Ctest)
Y_data_encoded.shape

print(utils.multiclass.type_of_target(Y_data_encoded))

lgbmClass1 = lgb.LGBMClassifier(n_estimators=171,
                                num_threads=6,
                                objective='multiclassova')

lgbmClass1.fit(data,
               Y_FULL_encoded,
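# The fit() call above is cut off; a plausible continuation, sketched on the
# assumption that it mirrors the eval_set pattern used for the regressor earlier:
lgbmClass1.fit(data,
               Y_FULL_encoded,
               eval_set=[(X_Ctest, Y_test_encoded)],
               eval_metric='multi_logloss')
y_pred_class = lgbmClass1.predict(X_Ctest)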
Example #3

Pclass1_mean_fare = test_data['Fare'].groupby(
    by=test_data['Pclass']).mean().get([1]).values[0]
Pclass2_mean_fare = test_data['Fare'].groupby(
    by=test_data['Pclass']).mean().get([2]).values[0]
Pclass3_mean_fare = test_data['Fare'].groupby(
    by=test_data['Pclass']).mean().get([3]).values[0]

# Create the Pclass_Fare_Category feature
test_data['Pclass_Fare_Category'] = test_data.apply(pclass_fare_category,
                                                    args=(Pclass1_mean_fare,
                                                          Pclass2_mean_fare,
                                                          Pclass3_mean_fare),
                                                    axis=1)
pclass_level = preprocessing.LabelEncoder()  # assign a label to each category
pclass_level.fit(
    np.array([
        'Pclass1_Low', 'Pclass1_High', 'Pclass2_Low', 'Pclass2_High',
        'Pclass3_Low', 'Pclass3_High'
    ]))
# Convert to numeric values
test_data['Pclass_Fare_Category'] = pclass_level.transform(
    test_data['Pclass_Fare_Category'])
# Dummy (one-hot) conversion
pclass_dummies_df = pd.get_dummies(test_data['Pclass_Fare_Category']).rename(
    columns=lambda x: 'Pclass_' + str(x))
test_data = pd.concat([test_data, pclass_dummies_df], axis=1)
# Factorize the Pclass feature:
test_data['Pclass'] = pd.factorize(test_data['Pclass'])[0]
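# pclass_fare_category is used above but not shown; a hypothetical sketch of what
# such a row-wise helper might look like (the author's exact thresholds are unknown):
def pclass_fare_category(df, pclass1_mean_fare, pclass2_mean_fare, pclass3_mean_fare):
    if df['Pclass'] == 1:
        return 'Pclass1_Low' if df['Fare'] <= pclass1_mean_fare else 'Pclass1_High'
    elif df['Pclass'] == 2:
        return 'Pclass2_Low' if df['Fare'] <= pclass2_mean_fare else 'Pclass2_High'
    else:
        return 'Pclass3_Low' if df['Fare'] <= pclass3_mean_fare else 'Pclass3_High'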
Example #4
def read_xy(PATH):
    dataset = pd.read_csv(PATH)  # read the raw data with pandas
    col = dataset.columns.values.tolist()  # take the column names
    col1 = col[1:]  # take the feature columns
    print(len(col1))  # number of feature dimensions
    X_train = np.array(dataset[col1])  # take the data
    y_train = preprocessing.LabelEncoder().fit_transform(
        dataset['class'])  # encode the labels
    print(len(y_train))
    # standardization
    scale = StandardScaler().fit(
        X_train)  # standardize the feature matrix (not needed for probabilistic or tree-based models that do not rely on distances)
    X_train = scale.transform(X_train)

    # Feature selection with SelectFromModel, using logistic regression with an L1/L2/L1+L2 penalty as the base model.
    # A small C leads to fewer features being selected; with Lasso, the larger alpha is, the fewer features are selected.
    ###################################### for clf.coef_ of shape 1*n_features #####################################
    '''
	#clf=Lasso(normalize=True,alpha=0.001,max_iter=5000,random_state=0)# Lasso regression
	#clf = LassoCV()
	#clf=Ridge(normalize=True,alpha=0.001,max_iter=5000,random_state=0)# ridge regression
	#clf=ElasticNet(normalize=True,alpha=0.001,l1_ratio=0.1,max_iter=5000,random_state=0)# elastic net regularization
	clf=LinearRegression(normalize=True)
	clf.fit(X_train, y_train)
	#print(clf.coef_)
	importance=np.abs(clf.coef_)
	#print(importance)
	'''
    ###################################### for clf.coef_ of shape n_classes*n_features #####################################

    # The 'newton-cg', 'sag', and 'lbfgs' solvers only support 'L2' regularization,
    # the 'liblinear' solver supports both 'L1' and 'L2' regularization,
    # and if dual=True, only the L2 penalty is supported.
    clf = LogisticRegression(penalty='l1',
                             C=0.1,
                             solver='liblinear',
                             random_state=0)  #clf.coef_:n_classes*n_features
    #clf=LogisticRegression(penalty='l2',C=0.1,random_state=0)
    #clf=LR(threshold=0.5, C=0.1)# the threshold parameter is the threshold on the difference of the weight coefficients
    #clf=LinearSVC(penalty='l1',C=0.1,dual=False,random_state=0)
    #clf=LinearSVC(penalty='l2',C=0.1,random_state=0)
    clf.fit(X_train, y_train)
    #print(clf.coef_)
    # Each class has one weight per attribute; summing the weights of the same attribute
    # across classes gives the importance score of that dimension.
    # Method 1:
    importance = np.linalg.norm(clf.coef_, axis=0, ord=1)
    # Method 2:
    #coef=np.abs(clf.coef_)
    #importance=np.sum(coef,axis=0)
    #print(importance)

    mean = np.mean(importance)
    #print(mean)
    #median=np.median(importance)
    #print(median)

    #model=SelectFromModel(clf,prefit=True)
    model = SelectFromModel(clf, prefit=True, threshold=2.0 * mean)
    '''
	model=SelectFromModel(estimator=clf).fit(X_train, y_train)
	importance=model.estimator_.coef_
	threshold=model.threshold_
	print(threshold)
	'''
    # threshold: string, float, optional, default None
    # Formats such as "median", "mean", or "1.25 * mean" can be used.
    # If the penalty is set to L1, a threshold of 1e-5 is used; otherwise the mean is used by default.
    X_train = model.transform(X_train)
    f_dim = X_train.shape[1]
    print(f_dim)
    y_train = np_utils.to_categorical(y_train)
    return X_train, y_train, f_dim
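# A minimal usage sketch (the file name and its 'class' column are assumptions that
# follow how read_xy reads them above):
X_train, y_train, f_dim = read_xy("train.csv")
print(X_train.shape, y_train.shape, f_dim)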
Example #5
def prepareData():
    target = pd.read_csv("dt-data.txt",
                         names=[
                             'Size', 'Occupied', 'Price', 'Music', 'Location',
                             'VIP', 'Favorite Beer', 'Enjoy'
                         ],
                         skipinitialspace=True,
                         skiprows=[0],
                         index_col=False)
    target['Size'] = target['Size'].str.replace(r'\d+:', '', regex=True)
    target['Enjoy'] = target['Enjoy'].str.replace(';', '', regex=False)
    from sklearn import preprocessing
    label_processor = preprocessing.LabelEncoder()
    target.VIP = label_processor.fit_transform(target.VIP)
    vipKeys = {}
    for val in target.VIP.unique():
        vipKeys[val] = label_processor.inverse_transform([val])[0]
    target.Enjoy = label_processor.fit_transform(target.Enjoy)
    target.Size = label_processor.fit_transform(target.Size)
    sizeKeys = {}
    for val in target.Size.unique():
        sizeKeys[val] = label_processor.inverse_transform([val])[0]
    target.Occupied = label_processor.fit_transform(target.Occupied)
    occKeys = {}
    for val in target.Occupied.unique():
        occKeys[val] = label_processor.inverse_transform([val])[0]
    target.Price = label_processor.fit_transform(target.Price)
    priceKeys = {}
    for val in target.Price.unique():
        priceKeys[val] = label_processor.inverse_transform([val])[0]
    target.Music = label_processor.fit_transform(target.Music)
    musicKeys = {}
    for val in target.Music.unique():
        musicKeys[val] = label_processor.inverse_transform([val])[0]
    target.Location = label_processor.fit_transform(target.Location)
    locKeys = {}
    for val in target.Location.unique():
        locKeys[val] = label_processor.inverse_transform([val])[0]
    target['Favorite Beer'] = label_processor.fit_transform(
        target['Favorite Beer'])
    beerKeys = {}
    for val in target['Favorite Beer'].unique():
        beerKeys[val] = label_processor.inverse_transform([val])[0]
    global inverseKeys
    inverseKeys = {
        'Size': sizeKeys,
        'Occupied': occKeys,
        'Price': priceKeys,
        'Music': musicKeys,
        'Location': locKeys,
        'VIP': vipKeys,
        'Favorite Beer': beerKeys
    }

    # -------------------------------------------Tennis Data----------------------------------------------------------------
    #     target = pd.read_csv("tennis.csv",
    #                          names=['outlook', 'temp', 'humidity', 'windy', 'play'],
    #                          skipinitialspace=True, skiprows=[0], index_col=False)
    #     from sklearn import preprocessing
    #     label_processor = preprocessing.LabelEncoder()
    #     target.outlook = label_processor.fit_transform(target.outlook)
    #     outlookKeys = {}
    #     for val in target.outlook.unique():
    #         outlookKeys[val]=label_processor.inverse_transform(val)
    #     target.temp = label_processor.fit_transform(target.temp)
    #     tempKeys = {}
    #     for val in target.temp.unique():
    #         tempKeys[val] = label_processor.inverse_transform(val)
    #     target.humidity = label_processor.fit_transform(target.humidity)
    #     humKeys = {}
    #     for val in target.humidity.unique():
    #         humKeys[val] = label_processor.inverse_transform(val)
    #     target.windy = label_processor.fit_transform(target.windy)
    #     winKeys = {}
    #     for val in target.humidity.unique():
    #         winKeys[val] = label_processor.inverse_transform(val)
    #     target.play = label_processor.fit_transform(target.play)
    #     global inverseKeys
    #     inverseKeys = {"outlook":outlookKeys, "temp":tempKeys, "humidity":humKeys, "windy":winKeys}
    return target
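# The repetitive encode-then-invert pattern above can be expressed more compactly;
# a sketch of an equivalent helper that keeps one LabelEncoder per column:
from sklearn import preprocessing

def encode_columns(df, columns):
    inverse_keys = {}
    for col in columns:
        le = preprocessing.LabelEncoder()
        df[col] = le.fit_transform(df[col])
        inverse_keys[col] = dict(enumerate(le.classes_))  # code -> original value
    return df, inverse_keys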
def fxn():
    #read in the data
    df = pd.read_csv('data.csv')

    #columns to drop
    df = df.drop(['id'], axis=1)

    df.sample(frac=1)
    #drop rows that contain '?' and keep at most the first 1000 clean rows
    index = []
    count = 0
    for val in range(len(df.iloc[:, 0])):
        flag = False
        for column in df:
            if df[column][val] == '?':
                flag = True
                break
        if flag:
            continue
        if count < 1000:
            index.append(val)
            count += 1
    df = df[df.index.isin(index)]

    #gets all columns which are not ints and integer encodes them
    obj_df = df.select_dtypes(include=['object']).copy()
    for column in obj_df:
        le = preprocessing.LabelEncoder()
        le.fit(df[column])
        df[column] = le.transform(df[column])

    #normalize all points between [0,1]
    x = df.values
    min_max_scaler = preprocessing.MinMaxScaler()
    x_scaled = min_max_scaler.fit_transform(x)
    df = pd.DataFrame(x_scaled)

    # In[589]:

    #make dataset only 1100
    #create 500/500 split between labelled on nonlablled array, 1000 semi-sup data set, and 100 validation dataset
    train, test = np.split(df.sample(frac=1), [int(.8 * len(df))])
    #print(train)
    train = train.values.tolist()
    test = test.values.tolist()

    df_unsupervised = []

    label_nolabels = {}
    for point in train:
        #unlablled 1000 points data
        df_unsupervised.append(point[1:])
        label_nolabels[tuple(point[1:])] = [point[0]]

    # In[590]:

    ##### #kmeans_forest 1-10, unsupervised learning adaboosting
    # kmeans1 = KMeans(n_clusters=2).fit(df_unsupervised)
    # # #kmeans2 = SpectralClustering(n_clusters = 2).fit_predict(df_unsupervised).tolist()
    # # kmeans3 = MeanShift().fit(df_unsupervised)
    # # #kmeans4 = AgglomerativeClustering(n_clusters=2).fit_predict(df_unsupervised).tolist()
    # # kmeans5 = DBSCAN().fit_predict(df_unsupervised).tolist()
    # # kmeans6 = GaussianMixture(n_components=2).fit(df_unsupervised)
    # # kmeans7 = Birch(n_clusters=2).fit(df_unsupervised)
    # # kmeans8 = BayesianGaussianMixture(n_components=2).fit(df_unsupervised)
    # classifiers = [kmeans1, kmeans3, kmeans5, kmeans6, kmeans7, kmeans8]
    #kmeans_forest 1-50: "unsupervised adaboosting" built from 50 independently
    #initialised 2-cluster KMeans runs
    classifiers = [
        KMeans(n_clusters=2, init='random',
               n_init=10).fit(np.asarray(df_unsupervised)) for _ in range(50)
    ]

    # In[591]:

    # make csv in form of rowNumber, clfNumber, clf prediction on that row
    answers = []
    for point in range(len(df_unsupervised)):
        for clf in range(len(classifiers)):
            answers.append([
                point, clf, classifiers[clf].predict([df_unsupervised[point]])
            ])

    count = 0
    f = open("answer_file.csv", "w")
    f.write('question,worker,answer;\n')
    for answer in answers:
        count += 1
        f.write(
            str(answer[0]) + ',' + str(answer[1]) + ',' + str(int(answer[2])) +
            '\n')
    f.close()
    p = open("result_file.csv", "w")
    p.close()

    # In[592]:

    #run VI BP
    import subprocess
    subprocess.call([
        "python", "run.py", "methods/c_EM/method.py", "answer_file.csv",
        "result_file.csv", "decision-making"
    ])

    # In[593]:

    #extract results and collect the noisy labels
    filepath = "result_file.csv"
    noisy_labels = []
    with open(filepath) as fp:
        for line in fp:
            questionAnswer = line.split(',')
            noisy_labels.append(questionAnswer)

    # In[594]:

    #assign noisy label to proper row
    df_noise_x = []
    df_noise_y = []
    for question in noisy_labels:
        if question[0].rstrip() == 'question':
            continue
        df_noise_x += [df_unsupervised[int(question[0].rstrip())]]
        df_noise_y.append(int(question[1].rstrip()))
    count_vi = 0
    for el in range(len(df_noise_x)):
        if label_nolabels[tuple(df_noise_x[el])][0] != df_noise_y[el]:
            count_vi += 1
    print(count_vi, len(df_noise_x))

    # In[595]:

    df_noise_y2 = []
    for el in df_noise_y:
        df_noise_y2.append(int(el))

    df_noise = []
    for el in range(len(df_noise_x)):
        new = df_noise_x[el]
        new.append(df_noise_y2[el])
        df_noise.append(new)

    #need to shuffle the data
    random.shuffle(df_noise)

    df_noise_x = []
    df_noise_y = []
    for row in df_noise:
        df_noise_x.append(row[:-1])
        df_noise_y.append(row[-1:][0])

    # In[596]:

    #run AdaBoost from Sklearn on noisy data
    bdt2 = AdaBoostClassifier(DecisionTreeClassifier(),
                              algorithm="SAMME",
                              n_estimators=20)
    bdt2.fit(df_noise_x, df_noise_y)

    # In[597]:

    #Ada boosting on noisy data error rate
    errors = []
    count1 = 0
    for point in test:
        est = bdt2.predict([point[:-1]])
        true = int(point[-1:][0])
        est = int(est[0])
        if est == true:
            errors.append([point[:-1], 0])
        else:
            count1 += 1
            errors.append([point[:-1], 1])

            # error rate, noisy -> baseline
    return (count1 / len(test))
# fragment of a preprocessing class method: encode every nominal column in place
def NominalToNumeric(self):
    l_pre = preprocessing.LabelEncoder()
    self.dataset = self.dataset.apply(l_pre.fit_transform)
Example #8
def process_reference_log(parameters, verbose):
    output_path = os.path.join(parameters['output_folder'],
                               parameters['event_log'])

    if not os.path.exists(output_path):
        os.mkdir(
            os.path.join(parameters['output_folder'], parameters['event_log']))
    else:
        print("The directory for the event log already exists!")

    parameters['output_folder'] = output_path

    reference_log_df = analyzer.load_reference_log(parameters['reference_log'])
    max_trace_length, n_caseid, n_activity, activities = analyzer.prescriptive_analysis(
        reference_log_df)

    parameters['max_trace_length'] = max_trace_length
    parameters['n_caseid'] = n_caseid
    parameters['n_activities'] = n_activity
    parameters['activities'] = activities

    if verbose > 1:
        # Print the distribution of the activities and store the plot in the output folder
        activities_counted = analyzer.get_activity_distribution(
            reference_log_df, activities)
        plotting.plot_barchart_from_dictionary(
            activities_counted,
            "Activity Distribution Reference Log (" + parameters['event_log'] +
            ")",
            "Activity",
            "Number of Occurrence",
            save=True,
            output_file=parameters['output_folder'])

    # Extract labels (i.e. names of activities that occur as labels) for the data set and encode them
    reference_y = example_creator.get_label(
        reference_log_df.groupby('CaseID').agg({'Activity':
                                                lambda x: list(x)}))

    # Calculate imbalance degree
    imbalance.calculate_imbalance_degree(reference_y)

    if verbose > 1:
        # Print the distribution of the labels of the reference log
        labels_counted = analyzer.get_label_distribution(
            reference_y, set(reference_y))
        plotting.plot_barchart_from_dictionary(
            labels_counted,
            "Label Distribution Reference Log (" + parameters['event_log'] +
            ")",
            "Label",
            "Number of Occurrence",
            save=True,
            output_file=parameters['output_folder'])

    # Encode the labels extracted from the reference log and export them to the output folder
    label_encoder = preprocessing.LabelEncoder()
    label_encoder.fit(reference_y)
    support.export_encoding(parameters['output_folder'], label_encoder)
    le_name_mapping = dict(
        zip(label_encoder.classes_,
            label_encoder.transform(label_encoder.classes_)))
    parameters['encoding'] = le_name_mapping
    if verbose > 1:
        print("Encoding Mapping: ")
        print(parameters['encoding'])

    # Encode the the reference training samples
    reference_y_enc = label_encoder.transform(reference_y)

    # Depending on the variant calculate a cost matrix
    if parameters['cost'] == 'COST_SUM' or parameters[
            'cost'] == 'OPTIMIZED_COST' or parameters[
                'cost'] == 'APPROXIMATE_COST':
        cost_matrix = cm.calculate_cost_matrix(reference_y_enc)
        print(cost_matrix)
    else:
        cost_matrix = None

    reference_y_enc = np.asarray(reference_y_enc)
    reference_y_one_hot = np_utils.to_categorical(reference_y_enc,
                                                  label_encoder.classes_.size)

    all_labels_enc = set(reference_y_enc)
    parameters['labels_enc'] = all_labels_enc
    parameters['labels'] = set(reference_y)

    return reference_y_one_hot, cost_matrix, parameters
Example #9
def label_X(X_train, X_dev, X_test):
    le = preprocessing.LabelEncoder()
    X_train = label(le, X_train)
    X_dev = label(le, X_dev)
    X_test = label(le, X_test)
    return X_train, X_dev, X_test
    data1['STARTING_LATITUDE'].fillna(data1['STARTING_LATITUDE'].mean(),
                                      inplace=True)
    data1['TIMESTAMP'] = pd.to_datetime(data1['TIMESTAMP'])
    data1['TIMESTAMP'] = (data1['TIMESTAMP'] -
                          data1['TIMESTAMP'].min()) / np.timedelta64(1, 'D')
    data1['STARTING_LONGITUDE'].fillna(data1['STARTING_LONGITUDE'].mean(),
                                       inplace=True)
    data1['DESTINATION_LATITUDE'].fillna(data1['DESTINATION_LATITUDE'].mean(),
                                         inplace=True)
    data1['DESTINATION_LONGITUDE'].fillna(
        data1['DESTINATION_LONGITUDE'].mean(), inplace=True)
    data1['TOTAL_LUGGAGE_WEIGHT'].fillna(0.0, inplace=True)
    data1['WAIT_TIME'].fillna(0.0, inplace=True)

    lbl1 = preprocessing.LabelEncoder()
    lbl1.fit(list(data1['VEHICLE_TYPE'].values))
    data1['VEHICLE_TYPE'] = lbl1.transform(list(data1['VEHICLE_TYPE'].values))

    #data1.hist()
    #plt.show()
    fancy = data1.corr()
    fancy.to_csv('correlation1.csv')

    y1 = data1['FARE']
    del data1['FARE']
    del data1['ID']
    X1 = data1

    data2['STARTING_LATITUDE'].fillna(data2['STARTING_LATITUDE'].mean(),
                                      inplace=True)
def classify(algorithm, fname, input_data, label_name, n_cores, random_state):
    train_y = np.array(input_data[label_name])
    input_data = input_data.drop('ID', axis=1)
    training_x = input_data.drop(label_name, axis=1)

    le = preprocessing.LabelEncoder()
    le.fit(train_y)
    train_y = le.transform(train_y)

    cv_metrics = pd.DataFrame()

    # 10-fold cross validation
    predicted_n_actual_pd = pd.DataFrame(
        columns=['ID', 'predicted', 'actual', 'fold'])

    kf = KFold(n_splits=10, shuffle=True, random_state=random_state)
    fold = 1

    for train, test in kf.split(training_x):
        # number of train and test instances is based on training_x.

        train_cv_features, test_cv_features, train_cv_label, test_cv_label = training_x.iloc[
            train], training_x.iloc[test], train_y[train], train_y[test]

        if algorithm == 'GB':
            temp_classifier = GradientBoostingClassifier(n_estimators=300,
                                                         random_state=1)

        elif (algorithm == 'RF'):
            temp_classifier = RandomForestClassifier(n_estimators=300,
                                                     random_state=1,
                                                     n_jobs=n_cores)

        elif (algorithm == 'M5P'):
            temp_classifier = ExtraTreesClassifier(n_estimators=300,
                                                   random_state=1,
                                                   n_jobs=n_cores)

        elif (algorithm == 'KNN'):
            temp_classifier = KNeighborsClassifier(n_neighbors=3,
                                                   n_jobs=n_cores)

        elif (algorithm == 'NEURAL'):
            temp_classifier = MLPClassifier(random_state=1)

        temp_classifier.fit(train_cv_features, train_cv_label)
        temp_prediction = temp_classifier.predict(test_cv_features)

        predicted_n_actual_pd = predicted_n_actual_pd.append(pd.DataFrame({
            'ID':
            test,
            'actual':
            test_cv_label,
            'predicted':
            temp_prediction,
            'fold':
            fold
        }),
                                                             ignore_index=True,
                                                             sort=True)

        fold += 1

    try:
        roc_auc = round(
            roc_auc_score(predicted_n_actual_pd['actual'].to_list(),
                          predicted_n_actual_pd['predicted'].to_list()), 3)

    except ValueError:
        roc_auc = 0.0

    matthews = round(
        matthews_corrcoef(predicted_n_actual_pd['actual'].to_list(),
                          predicted_n_actual_pd['predicted'].to_list()), 3)
    balanced_accuracy = round(
        balanced_accuracy_score(predicted_n_actual_pd['actual'].to_list(),
                                predicted_n_actual_pd['predicted'].to_list()),
        3)
    f1 = round(
        f1_score(predicted_n_actual_pd['actual'].to_list(),
                 predicted_n_actual_pd['predicted'].to_list()), 3)

    try:
        tn, fp, fn, tp = confusion_matrix(
            predicted_n_actual_pd['actual'].to_list(),
            predicted_n_actual_pd['predicted'].to_list()).ravel()

    except:
        tn, fp, fn, tp = 0, 0, 0, 0

    cv_metrics = cv_metrics.append(pd.DataFrame(np.column_stack(['cv',roc_auc, matthews,\
        balanced_accuracy, f1, tn, fp, fn, tp]),\
        columns=['type','roc_auc','matthew','bacc','f1','TN','FP','FN','TP']), ignore_index=True, sort=True)

    cv_metrics = cv_metrics.round(3)
    cv_metrics = cv_metrics.astype({
        'TP': 'int64',
        'TN': 'int64',
        'FP': 'int64',
        'FN': 'int64'
    })
    cv_metrics = cv_metrics[[
        'type', 'matthew', 'f1', 'bacc', 'roc_auc', 'TP', 'TN', 'FP', 'FN'
    ]]

    predicted_n_actual_pd['predicted'] = le.inverse_transform(
        predicted_n_actual_pd['predicted'].to_list())
    predicted_n_actual_pd['actual'] = le.inverse_transform(
        predicted_n_actual_pd['actual'].to_list())
    fname_predicted_n_actual_pd = os.path.join(
        output_result_dir, 'cv_{}_predited_data.csv'.format(algorithm))
    predicted_n_actual_pd['ID'] = predicted_n_actual_pd['ID'] + 1
    predicted_n_actual_pd = predicted_n_actual_pd.sort_values(by=['ID'])
    predicted_n_actual_pd.to_csv(fname_predicted_n_actual_pd, index=False)

    return cv_metrics
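# A minimal usage sketch (the CSV name, the 'label' column and the output_result_dir
# global are assumptions; classify() expects an 'ID' column in the input frame):
output_result_dir = "results"
os.makedirs(output_result_dir, exist_ok=True)
input_df = pd.read_csv("features.csv")
cv_metrics = classify('RF', 'features.csv', input_df, 'label', n_cores=4, random_state=1)
print(cv_metrics)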
Example #12
def read_files(tarfname):
    """Read the training and development data from the sentiment tar file.
    The returned object contains various fields that store sentiment data, such as:

    train_data,dev_data: array of documents (array of words)
    train_fnames,dev_fnames: list of filenames of the doccuments (same length as data)
    train_labels,dev_labels: the true string label for each document (same length as data)

    The data is also preprocessed for use with scikit-learn, as:

    count_vec: CountVectorizer used to process the data (for reapplication on new data)
    trainX,devX: array of vectors representing Bags of Words, i.e. documents processed through the vectorizer
    le: LabelEncoder, i.e. a mapper from string labels to ints (stored for reapplication)
    target_labels: List of labels (same order as used in le)
    trainy,devy: array of int labels, one for each document
    """
    import tarfile
    tar = tarfile.open(tarfname, "r:gz")
    trainname = "train.tsv"
    devname = "dev.tsv"
    for member in tar.getmembers():
        if 'train.tsv' in member.name:
            trainname = member.name
            print("trainname: ", trainname)
        elif 'dev.tsv' in member.name:
            devname = member.name

    class Data:
        pass

    sentiment = Data()
    print("-- train data")
    sentiment.train_data, sentiment.train_labels = read_tsv(tar, trainname)
    print(len(sentiment.train_data))

    print("-- dev data")
    sentiment.dev_data, sentiment.dev_labels = read_tsv(tar, devname)
    print(len(sentiment.dev_data))
    print("-- transforming data and labels")
    ### without any vectorizer
    sentiment.trainX = sentiment.train_data
    sentiment.devX = sentiment.dev_data
    from sklearn import preprocessing
    sentiment.le = preprocessing.LabelEncoder()
    sentiment.le.fit(sentiment.train_labels)
    sentiment.target_labels = sentiment.le.classes_
    sentiment.trainy = sentiment.le.transform(sentiment.train_labels)
    sentiment.devy = sentiment.le.transform(sentiment.dev_labels)

    ## feature generation
    sentiment.train_posX, sentiment.train_negX = splitPosNegData(
        sentiment.trainX, sentiment.trainy)
    '''tfidf vectorizer'''

    # sentiment.pos_vec = TfidfVectorizer(ngram_range = (1,2))
    # sentiment.pos_vocab = sentiment.pos_vec.fit(sentiment.train_posX).vocabulary_
    # # print(sentiment.pos_vocab)
    # sentiment.neg_vec = TfidfVectorizer(ngram_range = (1, 2))
    # sentiment.neg_vocab =  sentiment.neg_vec.fit(sentiment.train_negX).vocabulary_
    # print(sentiment.neg_vocab)

    ### get pos, neg vector on train
    import pickle
    from sklearn.feature_extraction.text import TfidfVectorizer
    sentiment.tfidf_vect = TfidfVectorizer(ngram_range=(1, 2))
    print("train_data type: ", type(sentiment.train_data))
    sentiment.trainX = sentiment.tfidf_vect.fit_transform(sentiment.train_data)
    sentiment.pos_matrix = sentiment.tfidf_vect.transform(sentiment.train_posX)
    sentiment.neg_matrix = sentiment.tfidf_vect.transform(sentiment.train_negX)
    print("feature names:")
    print(len(sentiment.tfidf_vect.get_feature_names()))
    output = open('pos_neg_matrix.pkl', 'wb')
    pickle.dump([
        sentiment.pos_matrix, sentiment.neg_matrix,
        sentiment.tfidf_vect.get_feature_names()
    ], output)
    print("dump matrix done...")
    output.close()
    sentiment.devX = sentiment.tfidf_vect.transform(sentiment.dev_data)
    tar.close()
    return sentiment
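# read_tsv is called above but not shown; a hypothetical sketch of what it might do,
# assuming each line of the TSV holds "<label>\t<text>":
def read_tsv(tar, fname):
    data, labels = [], []
    for line in tar.extractfile(fname):
        label, text = line.decode("utf-8").strip().split("\t", 1)
        labels.append(label)
        data.append(text)
    return data, labels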
Example #13
df.columns

# In[5]:

X = df.drop(['default payment next month', 'target'], axis=1)

# In[6]:

encoders = {}
X_num = X.copy()

label_cols = ['sex', 'education', 'marriage', 'age', 'target']

for col in X_num.columns.tolist():
    if col in label_cols:
        encoders[col] = preprocessing.LabelEncoder().fit(X_num[col])
        X_num[col] = encoders[col].transform(X_num[col])
X_num = X_num.drop(['sex'], axis=1)

y = df['target'].copy()
s = encoders['sex'].transform(X['sex'])

# ## Create Tensors

# In[7]:

X_tensor = torch.tensor(X_num.values, device=device).double()
noise = torch.randn([X_tensor.shape[0], 5], device=device).double()
X_noised = torch.cat((X_tensor, noise), 1)

s_tensor = torch.tensor(s, device=device).double().unsqueeze(1)
Example #14
from sklearn import preprocessing, metrics
from sklearn.metrics import average_precision_score
from sklearn.neighbors import KNeighborsClassifier
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

df = pd.read_csv("student-prf.csv", sep=';', header=0)

df = df.apply(preprocessing.LabelEncoder().fit_transform)
df = np.array(df)

selected_column = np.arange(25)
selected_column = np.append(selected_column, [28, 29])
X = df[:400, selected_column]
y = df[:400, 26]
K = np.arange(1, 23, 2)
K = np.append(K, 30)

for i in range(12):
    knn_clf = KNeighborsClassifier(n_neighbors=K[i])
    knn_clf.fit(X, y)
    predicted = knn_clf.predict(df[400:, selected_column])
    expected = df[400:, 26]
    report = metrics.classification_report(expected, predicted)
    print(report)
    print()
# average is manually collected from the print statement
precision = [
    0.54, 0.53, 0.53, 0.53, 0.49, 0.50, 0.54, 0.63, 0.63, 0.63, 0.45, 0.45
]
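# Instead of collecting the averages by hand from the printed reports, the weighted
# precision could be gathered programmatically; a sketch assuming the same split:
from sklearn.metrics import precision_score

precision_auto = []
for k in K:
    knn_clf = KNeighborsClassifier(n_neighbors=k)
    knn_clf.fit(X, y)
    predicted = knn_clf.predict(df[400:, selected_column])
    expected = df[400:, 26]
    precision_auto.append(round(precision_score(expected, predicted, average='weighted'), 2))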
Example #15
def load_data(p=100, type="doc2vec"):
    sources = {'./negative.txt': 'DOC_NEG', './positive.txt': 'DOC_POS'}
    sentences = LabeledLineSentence(sources)

    pos_lst = np.genfromtxt('pos_lst', dtype='str')
    neg_lst = np.genfromtxt('neg_lst', dtype='str')
    pos_lst, neg_lst = set(pos_lst), set(neg_lst)
    senword_lst = pos_lst.union(neg_lst)

    X, y, words, vocab = [], [], [], []
    if type == "doc2vec":
        model = Doc2Vec(min_count=1,
                        window=10,
                        vector_size=p,
                        sample=1e-4,
                        negative=5,
                        workers=8)
        model.build_vocab(sentences.to_array())
        model.train(sentences.sentences_perm(),
                    epochs=20,
                    total_examples=model.corpus_count)
        for line in sentences.to_array():
            sen_tmp = [s for s in line[0] if s in senword_lst]
            words.append(sen_tmp)
            vocab.extend(sen_tmp)
            vocab = list(set(vocab))
            prefix_tmp = line[1]
            X.append(model[prefix_tmp][0])
            if 'POS' in prefix_tmp[0]:
                y.append(1)
            if 'NEG' in prefix_tmp[0]:
                y.append(-1)

    if type == "word2vec":
        words_word2vec = []
        for line in sentences.to_array():
            words_word2vec.append(line[0])
        model_ug_cbow = Word2Vec(sg=0,
                                 size=p,
                                 negative=5,
                                 window=2,
                                 min_count=2,
                                 workers=2,
                                 alpha=0.065,
                                 min_alpha=0.065)
        model_ug_cbow.build_vocab(words_word2vec)
        vocab_lst = set(model_ug_cbow.wv.vocab)

        for epoch in range(30):
            model_ug_cbow.train(words_word2vec,
                                total_examples=len(words_word2vec),
                                epochs=1)
            model_ug_cbow.alpha -= 0.002
            model_ug_cbow.min_alpha = model_ug_cbow.alpha

        for line in sentences.to_array():
            sen_tmp = set(line[0])
            sen_tmp = set(line[0]).intersection(vocab_lst)
            word_ave = np.array([model_ug_cbow.wv[wd] for wd in sen_tmp])
            if len(word_ave) > 0:
                word_ave = np.mean(word_ave, axis=0)
            else:
                word_ave = np.zeros(p)
            prefix_tmp = line[1]
            X.append(word_ave)
            if 'POS' in prefix_tmp[0]:
                y.append(1)
            if 'NEG' in prefix_tmp[0]:
                y.append(-1)

    if type == "googlenews":
        googlenews = KeyedVectors.load_word2vec_format(
            '../GoogleNews-vectors-negative300.bin', binary=True)
        vocab_lst = set(googlenews.vocab).intersection(senword_lst)
        for line in sentences.to_array():
            sen_tmp = [s for s in line[0] if s in vocab_lst]
            word_ave = np.array([googlenews[wd] for wd in sen_tmp])
            words.append(sen_tmp)
            vocab.extend(sen_tmp)
            vocab = list(set(vocab))
            # if len(word_ave) > 0:
            #     word_ave = np.mean(word_ave, axis=0)
            # else:
            #     word_ave = np.zeros(300)
            prefix_tmp = line[1]
            # X.append(word_ave)
            if 'POS' in prefix_tmp[0]:
                y.append(1)
            if 'NEG' in prefix_tmp[0]:
                y.append(-1)
        le = preprocessing.LabelEncoder()
        le.fit(vocab)
        vocab_num = le.transform(vocab)
        dict_emb = []
        for i in range(len(vocab_num)):
            wd = le.inverse_transform([i])
            dict_emb.append(googlenews[wd][0])

    # NOTE: dict_emb and le are only created in the "googlenews" branch above, so the
    # remaining lines assume type == "googlenews".
    dict_emb, y, words, vocab = np.array(dict_emb), np.array(y), np.array(
        words), np.array(vocab)
    le_lst = []
    for i in range(len(words)):
        le_lst.append(le.transform(words[i]))
    le_lst = np.array(le_lst)

    return dict_emb, y, le_lst, vocab
Example #16
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier

# step 2 : downloading the data : !wget -O drug200.csv https://s3-api.us-geo.objectstorage.softlayer.net/cf-courses-data/CognitiveClass/ML0101ENv3/labs/drug200.csv
# step 3 : reading data using Pandas data frame
my_data = pd.read_csv("drug200.csv", delimiter=",")
my_data[0:5]

# --> Pre-processing

X = my_data[['Age', 'Sex', 'BP', 'Cholesterol', 'Na_to_K']].values
X[0:5]

from sklearn import preprocessing
le_sex = preprocessing.LabelEncoder()
le_sex.fit(['F','M'])
X[:,1] = le_sex.transform(X[:,1])


le_BP = preprocessing.LabelEncoder()
le_BP.fit([ 'LOW', 'NORMAL', 'HIGH'])
X[:,2] = le_BP.transform(X[:,2])


le_Chol = preprocessing.LabelEncoder()
le_Chol.fit([ 'NORMAL', 'HIGH'])
X[:,3] = le_Chol.transform(X[:,3])

X[0:5]
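# The snippet stops after encoding; a minimal sketch of how the DecisionTreeClassifier
# imported above might be trained (the 'Drug' target column name is an assumption
# about drug200.csv):
from sklearn.model_selection import train_test_split

y = my_data["Drug"]
X_trainset, X_testset, y_trainset, y_testset = train_test_split(
    X, y, test_size=0.3, random_state=3)
drugTree = DecisionTreeClassifier(criterion="entropy", max_depth=4)
drugTree.fit(X_trainset, y_trainset)
print(drugTree.predict(X_testset)[0:5])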
Example #17
titanic_test.loc[titanic_test['Fare'].isnull(), 'Fare'] = titanic_test['Fare'].mean()
### Type casting means converting object variables to categorical variables.
### If we look at the data type of Sex, as of now it is an object (sometimes an int),
## and it should be converted to a categorical variable.
titanic_test['Sex'] = titanic_test['Sex'].astype('category')
### We can check with the info() method whether Sex changed to categorical or not
titanic_test.info()
#### Change all categorical variables in the same way
titanic_test['Pclass'] = titanic_test['Pclass'].astype('category')
titanic_test['Embarked'] = titanic_test['Embarked'].astype('category')

##### Label Encoding: all input categorical variables should be converted to
##### numerics because the decision tree algorithm cannot understand categorical variables

titanic_test1 = titanic_test.copy()
## A label encoder already created on the training data could be reused here
le = preprocessing.LabelEncoder()  ### le is a label encoder
titanic_test1.Pclass = le.fit_transform(titanic_test1.Pclass)
titanic_test1.Sex = le.fit_transform(titanic_test1.Sex)
titanic_test1.Embarked = le.fit_transform(titanic_test1.Embarked)

x_test = titanic_test1[['Pclass', 'Sex', 'Embarked', 'Fare']]
#x_train = titanic_train1[['Fare']]

dt = joblib.load("dt_fit2.pkl")
titanic_test1['Survived'] = dt.predict(x_test)

titanic_test1.to_csv("submission.csv",
                     columns=['PassengerId', 'Survived'],
                     index=False)
Example #18
def MLP(name, input_dir, best_dir, output):

    if not os.path.exists(best_dir):
        os.makedirs(best_dir)
    best_dir_dat = "/".join((best_dir, name))
    if not os.path.exists(best_dir_dat):
        os.makedirs(best_dir_dat)

    colnames = "HType,ABType,dimension,learnFac,margin,constr,LType,MLP_acc,MLP_wF1,MLP_epoch"
    with open(output, "w") as file:
        file.write(colnames)
        file.write("\n")

    models = sorted(os.listdir(input_dir))
    for model in models:
        modelpath = "/".join((input_dir, model))
        files = sorted(os.listdir(modelpath))

        # create model subdir to store best MLP models
        best_subdir = "/".join((best_dir_dat, model))
        if not os.path.exists(best_subdir):
            os.makedirs(best_subdir)

        for i, file in enumerate(files):
            print(i)

            # embedding datasets
            labelpath = "/".join((modelpath, file))
            dataset = pd.read_csv(labelpath, index_col=0)

            # specify file path to store best MLP model [for later]
            filepath = best_subdir + "/" + file[:-4] + ".hdf5"

            ################################################################################
            ############################# DATA SPLIT ##############################
            ################################################################################

            lb = preprocessing.LabelBinarizer()
            lb.fit(list(dataset["class"]))

            X_train = dataset[dataset["split"] == "LRN"].iloc[:, 1:-2].values
            y_train = dataset[dataset["split"] == "LRN"].iloc[:, -1].values
            # get class weights first (keyword arguments; Keras expects a dict of
            # class index -> weight)
            weights = dict(enumerate(
                compute_class_weight(class_weight="balanced",
                                     classes=np.unique(y_train),
                                     y=y_train)))
            # then transform
            y_train = lb.transform(y_train)

            X_valid = dataset[dataset["split"] == "VLD"].iloc[:, 1:-2].values
            y_valid = dataset[dataset["split"] == "VLD"].iloc[:, -1].values
            y_valid = lb.transform(y_valid)

            X_test = dataset[dataset["split"] == "TST"].iloc[:, 1:-2].values
            y_test = dataset[dataset["split"] == "TST"].iloc[:, -1].values
            y_test = lb.transform(y_test)

            ################################################################################
            ############################# CLASSIFIER STRUCTURE ##############################
            ################################################################################

            classifier = Sequential()

            dim = len(dataset.iloc[0, 1:-2])
            nodes = dim * 2

            # Hidden layer
            classifier.add(
                Dense(nodes,
                      activation="sigmoid",
                      kernel_initializer="uniform",
                      input_dim=dim))

            # Output layer
            classifier.add(
                Dense(9, activation="softmax", kernel_initializer="uniform"))

            # compile the model
            sgd = optimizers.SGD(lr=0.01,
                                 decay=0.0,
                                 momentum=0.0,
                                 nesterov=False)
            classifier.compile(optimizer=sgd,
                               loss="categorical_crossentropy",
                               metrics=["accuracy"])

            ################################################################################
            ############################# MODEL FITTING ##############################
            ################################################################################

            # checkpoint best model
            checkpoint = ModelCheckpoint(filepath,
                                         monitor="val_acc",
                                         verbose=0,
                                         save_best_only=True,
                                         mode="auto")

            # model settings and fit
            history = classifier.fit(X_train, y_train, validation_data=(X_valid, \
            y_valid), epochs=5000, verbose=0, callbacks=[checkpoint], \
            class_weight=weights)

            ################################################################################
            ############################# MAKE PREDICTIONS ##############################
            ################################################################################

            #load best model
            final_model = load_model(filepath)

            # get accuracy
            scores = final_model.evaluate(X_test, y_test, verbose=0)

            # get weighted F1-by-class
            le = preprocessing.LabelEncoder()
            le.fit(list(dataset["class"]))
            y_test2 = dataset[dataset["split"] == "TST"].iloc[:, -1].values
            y_test2 = le.transform(y_test2)
            y_pred = final_model.predict_classes(X_test, verbose=0)
            weighted_f1 = f1_score(y_test2, y_pred, average="weighted")

            # get best epoch
            acc_history = history.history["val_acc"]
            best_epoch = acc_history.index(max(acc_history)) + 1

            K.clear_session()  # destroy TF graph to avoid loop slowing down

            ################################################################################
            ############################# ASSEMBLE W/ CONFIG ##############################
            ################################################################################

            # get model type (H1-4, A/B)
            modelType = model.split("-")[1]  # ["H1A"]
            HType = modelType[0:2]
            ABType = modelType[-1]
            # get dimension
            filenamesplit = file.split("-")
            dimension = int([s for s in filenamesplit if "D00" in s][0][1:])
            # get learnFac
            learnFac = int([s for s in filenamesplit if "LF0" in s][0][3:])
            # get margin
            margin = float([s for s in filenamesplit if "LM" in s][0][2:])
            # get constraint
            constr = [s for s in filenamesplit
                      if "_VALUE" in s][0][:-6].lower()
            # get LType
            LType = filenamesplit[-1][:2]

            with open(output, "a") as file:
                file.write("%s,%s,%d,%d,%.1f,%s,%s,%.17f,%.17f,%d" %
                           (HType, ABType, dimension, learnFac, margin, constr,
                            LType, scores[1], weighted_f1, best_epoch))
                file.write("\n")
Example #19
train['price_doc'] = train['price_doc'] * mult
y_train = train["price_doc"]

#########################################################################################################
print('Running Model 1...')
x_train = train.drop(["id", "timestamp", "price_doc", "average_q_price"],
                     axis=1)
#x_test = test.drop(["id", "timestamp", "average_q_price"], axis=1)
x_test = test.drop(["id", "timestamp"], axis=1)

num_train = len(x_train)
x_all = pd.concat([x_train, x_test])

for c in x_all.columns:
    if x_all[c].dtype == 'object':
        lbl = preprocessing.LabelEncoder()
        lbl.fit(list(x_all[c].values))
        x_all[c] = lbl.transform(list(x_all[c].values))

x_train = x_all[:num_train]
x_test = x_all[num_train:]

xgb_params = {
    'eta': 0.05,
    'max_depth': 6,
    'subsample': 0.6,
    'colsample_bytree': 1,
    'objective': 'reg:linear',
    'eval_metric': 'rmse',
    'silent': 1
}
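# The snippet stops at the parameter dict; a minimal sketch of how training might
# continue from here (assuming xgboost is imported as xgb and the number of boosting
# rounds is tuned elsewhere):
import xgboost as xgb

dtrain = xgb.DMatrix(x_train, label=y_train)
dtest = xgb.DMatrix(x_test)
model_1 = xgb.train(xgb_params, dtrain, num_boost_round=400)
y_pred_1 = model_1.predict(dtest)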
Example #20
def main():
    nRowsRead = 10  # specify 'None' if want to read whole file
    # KDD training data 125974
    df = pd.read_csv(r'NIDS\KDDAll.txt', delimiter=',', nrows=125974)
    df = df[[
        'duration', 'protocol_type', 'service', 'src_bytes', 'dst_bytes',
        'num_failed_logins', 'serror_rate', 'srv_serror_rate', 'rerror_rate',
        'srv_rerror_rate', 'dst_host_serror_rate', 'dst_host_srv_serror_rate',
        'dst_host_rerror_rate', 'dst_host_srv_rerror_rate', 'label'
    ]]

    le = preprocessing.LabelEncoder()
    df = df.apply(le.fit_transform)

    #print(df.head(10))
    x = df.drop('label', axis=1)
    y = df['label']

    train_X, test_X, train_Y, test_Y = train_test_split(x,
                                                        y,
                                                        test_size=0.25,
                                                        random_state=40)

    #Naive Bayes
    gnb = GaussianNB()
    #random forest
    rf = RandomForestClassifier(n_estimators=100, bootstrap=True)
    #KNN
    knn = KNeighborsClassifier(n_neighbors=1)
    #dtree
    dt = DecisionTreeClassifier()
    #voting classifer
    #hard voting - Majority
    #vclf =VotingClassifier(estimators=[('gnb', gnb), ('rf', rf), ('knn', knn),('dt',dt)], voting='hard')
    #vclf = vclf.fit(x,y)
    #pred_Y = vclf.predict(test_X)
    #print(classification_report(test_Y,pred_Y))
    #CM = confusion_matrix(test_Y,pred_Y)
    #TN = CM[0][0]
    #FN = CM[1][0]
    #TP = CM[1][1]
    #FP = CM[0][1]
    #FPR = FP/(FP + TN)
    #print("FPR:",FPR)
    #plot_confusion_matrix(vclf,test_X,test_Y)
    #plt.show()

    #soft voting - average of predicted class probabilities
    vclf = VotingClassifier(estimators=[('gnb', gnb), ('rf', rf), ('knn', knn),
                                        ('dt', dt)],
                            voting='soft')
    vclf = vclf.fit(x, y)  # note: fitted on the full dataset, not only train_X/train_Y
    pred_Y = vclf.predict(test_X)
    print(classification_report(test_Y, pred_Y))
    CM = confusion_matrix(test_Y, pred_Y)
    TN = CM[0][0]
    FN = CM[1][0]
    TP = CM[1][1]
    FP = CM[0][1]
    FPR = FP / (FP + TN)
    print("FPR:", FPR)
    plot_confusion_matrix(vclf, test_X, test_Y)
    plt.show()
Example #21
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn import metrics
salarydata_train.columns
salarydata_test.columns
salarydata_train.shape
salarydata_test.shape
salarydata_train.isnull().sum()
salarydata_test.isnull().sum()
salarydata_train.head()
salarydata_test.head()
salary_columns=['workclass', 'education', 'maritalstatus', 'occupation', 'relationship', 'race', 'sex', 'native']
from sklearn import preprocessing
number = preprocessing.LabelEncoder()
for i in salary_columns:
    salarydata_train[i] = number.fit_transform(salarydata_train[i])
    salarydata_test[i] = number.fit_transform(salarydata_test[i])
colnames = salarydata_train.columns
len(colnames[0:13])
trainX=salarydata_train[colnames[0:13]]
trainY=salarydata_train[colnames[13]]
testX=salarydata_test[colnames[0:13]]
testY=salarydata_test[colnames[13]]

ignb = GaussianNB() # normal distribution
pred_gnb = ignb.fit(trainX,trainY).predict(testX)

confusion_matrix(testY,pred_gnb)
#([[10759,   601],
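# Fitting the encoder separately on the train and test frames (as above) can assign
# different codes to the same category; a sketch of a more consistent alternative,
# assuming pandas is imported as pd:
import pandas as pd

for i in salary_columns:
    enc = preprocessing.LabelEncoder()
    enc.fit(pd.concat([salarydata_train[i], salarydata_test[i]], axis=0))
    salarydata_train[i] = enc.transform(salarydata_train[i])
    salarydata_test[i] = enc.transform(salarydata_test[i])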
Example #22
def create_synthetic_dataset(n_samples=500,
                             cube_size=32,
                             template_size=7,
                             density_min=.1,
                             density_max=.5,
                             proportions=[0.3, 0.7]):
    '''
    Creates a basic 3D texture synthetic dataset.
    Returns the volumes X and labels y.
    n_samples: number of samples per class (default 500)
    cube_size: size of the cubes, i.e. training and test volumes (default 32)
    template_size: size of the templates rotated and pasted in the volumes (default 7)
    density_min: minimum density of patterns (default 0.1)
    density_max: maximum density of patterns (default 0.5)
    proportions: proportion of template 1 for the two classes (the proportion of template 2 is 1-p) (default= [0.3,0.7])
    '''
    np.random.seed(seed=0)
    # number of classes (only designed for 2 classes here)
    n_class = 2
    # Rotation range
    rot = 360
    range_rot = [0, rot]
    # Generate empty templates
    template = np.zeros((2, template_size, template_size, template_size))
    # Fill the templates
    # For now a simple line for t1
    template[0,
             int(template_size / 2) - 1:int(template_size / 2) + 1,
             int(template_size / 2) - 1:int(template_size / 2) + 1, :] = 1
    # And a cross for t2
    template[1,
             int(template_size / 2) - 1:int(template_size / 2) + 1,
             int(template_size / 2) - 1:int(template_size / 2) + 1,
             int(template_size / 4):int(3 * template_size / 4) + 1] = 1
    template[1,
             int(template_size / 2) - 1:int(template_size / 2) + 1,
             int(template_size / 4):int(3 * template_size / 4),
             int(template_size / 2) - 1:int(template_size / 2) + 1] = 1
    # Initialize dataset lists
    X = []
    y = []

    for c in range(n_class):
        for s in range(n_samples):
            # Generate an empty cube of side cube_size
            cube = np.zeros((cube_size, cube_size, cube_size))
            # Generate random density
            density = np.random.uniform(density_min, density_max)
            # Number of patterns in volume based on the density
            n_templates = int((cube_size**3) / (template_size**3) * density)
            # Crop size after rotation:
            crop_size = int(template_size * np.sqrt(3))
            # place the rotated patterns in the cube
            for t in range(n_templates):
                # random position
                position = np.array([
                    np.random.choice(cube_size),
                    np.random.choice(cube_size),
                    np.random.choice(cube_size)
                ])
                # is it template 1 or 2:
                template_type = np.random.choice(
                    2, p=[proportions[c], 1 - proportions[c]])
                # Rotate the template 1 or 2
                random_angles = [
                    np.random.uniform(range_rot[0], range_rot[1])
                    for i in range(3)
                ]
                rotated_template = apply_affine_transform_fixed(
                    template[template_type], random_angles)
                # copy the rotated template in the cube
                cube = copy_template(cube, rotated_template, position)
            X.append(cube)
            y.append(c)
    X = np.expand_dims(np.asarray(X), axis=-1)
    y = np.asarray(y)
    le = preprocessing.LabelEncoder()
    le.fit(np.unique(y))
    y = le.transform(y)
    return X, y
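# A minimal usage sketch for the generator above (assuming the function and the helpers
# it needs are available in the current scope): build a small dataset and split it.
from sklearn.model_selection import train_test_split

X, y = create_synthetic_dataset(n_samples=50, cube_size=32, template_size=7)
print(X.shape, y.shape)  # e.g. (100, 32, 32, 32, 1) and (100,)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=0, stratify=y)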
Пример #23
0
user_file = "../../Data/user_list.csv"
test_pred_file = "test_predictions_xgb_dep14_child18_eta05_round450_seed0_trainseed1234.csv"

# train_file and the unique_* category lists are defined earlier in the original script (not shown in this excerpt)
train = pd.read_csv(train_file)
users_list = np.array(pd.read_csv(user_file)["USER_ID_hash"]).astype('str')
print(train.shape)

print("Label encoding..")
#col_names = ["UserPrefName", "CouponCapsuleText", "CouponGenreName", "CouponLargeAreaName", "CouponSmallAreaName", "CouponKenName"]
#train = train.drop(col_names, axis=1)

#le_UserIDHash = preprocessing.LabelEncoder()
#le_UserIDHash.fit(users_list)
#train["USER_ID_hash"] = le_UserIDHash.transform(train["USER_ID_hash"].astype("str"))

le_UserPrefName = preprocessing.LabelEncoder()
le_UserPrefName.fit(unique_pref_name)
train["UserPrefName"] = le_UserPrefName.transform(train["UserPrefName"].astype('str'))

le_CouponCapsuleText = preprocessing.LabelEncoder()
le_CouponCapsuleText.fit(unique_capsule_text)
train["CouponCapsuleText"] = le_CouponCapsuleText.transform(train["CouponCapsuleText"].astype('str'))

le_CouponGenreName = preprocessing.LabelEncoder()
le_CouponGenreName.fit(unique_genre_name)
train["CouponGenreName"] = le_CouponGenreName.transform(train["CouponGenreName"].astype('str'))

le_CouponLargeAreaName = preprocessing.LabelEncoder()
le_CouponLargeAreaName.fit(unique_large_area_name)
train["CouponLargeAreaName"] = le_CouponLargeAreaName.transform(train["CouponLargeAreaName"].astype('str'))
Пример #24
0
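# This example assumes the usual TF1 layer helpers from the MNIST deep tutorial
# (weight_variable, bias_variable, conv2d, max_pool_2x2) plus project-specific
# data-loading helpers (prepare_data, read_image_array, read_single_image) defined
# elsewhere. A sketch of the four layer helpers, as they are conventionally written,
# is shown here; the data-loading helpers are omitted because their behaviour is
# specific to this project.
import tensorflow as tf


def weight_variable(shape):
    # Small random weights to break symmetry.
    return tf.Variable(tf.truncated_normal(shape, stddev=0.1))


def bias_variable(shape):
    # Slightly positive bias to avoid dead ReLUs.
    return tf.Variable(tf.constant(0.1, shape=shape))


def conv2d(x, W):
    # Stride-1 convolution with zero padding; output keeps the input's spatial size.
    return tf.nn.conv2d(x, W, strides=[1, 1, 1, 1], padding='SAME')


def max_pool_2x2(x):
    # 2x2 max pooling; halves the spatial dimensions.
    return tf.nn.max_pool(x, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME')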
def main(_):

    x = tf.placeholder(tf.float32, shape=[None, 2352])
    y_ = tf.placeholder(tf.float32, shape=[None, 2])

    # First Convolution and Pooling Layer

    conv_weight_1 = weight_variable([5, 5, 3, 31])
    conv_bias_1 = bias_variable([31])

    x_image = tf.reshape(x, [-1, 28, 28, 3])
    conv_1 = tf.nn.relu(conv2d(x_image, conv_weight_1) + conv_bias_1)
    pool_1 = max_pool_2x2(conv_1)

    # Second Convolution and Pooling layer

    conv_weight_2 = weight_variable([5, 5, 31, 64])
    conv_bias_2 = bias_variable([64])

    conv_2 = tf.nn.relu(conv2d(pool_1, conv_weight_2) + conv_bias_2)
    pool_2 = max_pool_2x2(conv_2)

    # First fully connected layer

    fc_weight_1 = weight_variable([7 * 7 * 64, 1024])
    fc_bias_1 = bias_variable([1024])

    pool_2_flat = tf.reshape(pool_2, [-1, 7 * 7 * 64])
    fc_1 = tf.nn.relu(tf.matmul(pool_2_flat, fc_weight_1) + fc_bias_1)

    # A drop out layer
    keep_prob = tf.placeholder(tf.float32)
    custom_fc1_drop = tf.nn.dropout(fc_1, keep_prob)

    # Second custom fully connected layer
    fc_weights_2 = weight_variable([1024, 2])
    fc_bias_2 = bias_variable([2])
    fc_2 = tf.matmul(custom_fc1_drop, fc_weights_2) + fc_bias_2

    y_conv = fc_2

    cross_entropy = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=y_conv))

    train_step = tf.train.AdamOptimizer(1e-4).minimize(cross_entropy)

    correct_prediction = tf.equal(tf.argmax(y_conv, 1), tf.argmax(y_, 1))

    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

    with tf.Session() as sess:

        sess.run(tf.global_variables_initializer())
        file_list, y_image_label = prepare_data(FLAGS.image_dir)

        le = preprocessing.LabelEncoder()
        y_one_hot = tf.one_hot(le.fit_transform(y_image_label), depth=2)

        x_feed = sess.run(read_image_array(file_list))
        y_feed = sess.run(y_one_hot)

        for i in range(200):
            if i % 10 == 0:
                train_accuracy = accuracy.eval(feed_dict={
                    x: x_feed,
                    y_: y_feed,
                    keep_prob: 1.0
                })
                print('step %d, training accuracy %g' % (i, train_accuracy))
            train_step.run(feed_dict={x: x_feed, y_: y_feed, keep_prob: 0.8})
        predicted = tf.argmax(y_conv, 1)

        if FLAGS.predict_image != "":
            x_single_img = sess.run(read_single_image(FLAGS.predict_image))
            print(
                "You got ",
                le.inverse_transform(
                    sess.run(predicted, feed_dict={x: x_single_img})))
Пример #25
0
def test_CNN(model, X_train, y_train, X_valid, y_valid, w_id, batch_size, num_epochs, preprocessed=False):
    num_samples = X_train.shape[0]
    num_batches = int(np.ceil(num_samples / float(batch_size)))
    l1 = preprocessing.LabelEncoder()
    t1 = l1.fit_transform(y_train)
    l2 = preprocessing.LabelEncoder()
    t2 = l2.fit_transform(y_valid)

    num_test_samples = X_valid.shape[0]
    num_test_batches = int(np.ceil(num_test_samples / float(batch_size)))

    # setting up lists for handling loss/accuracy
    train_loss, val_loss = [], []
    train_cost, val_cost = [], []
    for epoch in range(num_epochs):
        # Forward -> Backprob -> Update params
        ## Train
        correct = 0
        model.train()
        for i in range(num_batches):
            if i % 10 == 0:
                print("\n {}, still training...".format(i), end='')
            idx = range(i * batch_size, np.minimum((i + 1) * batch_size, num_samples))
            index = idx[-1]-idx[0]+1
            if preprocessed==False:
                batch_image = np.zeros((index,224,224))
                for j in range(index):
                    image_resized = resize(X_train[idx[j]], (224, 224), anti_aliasing=True)
                    batch_image[j,:,:] = image_resized
                X_batch_tr = Variable(torch.from_numpy(batch_image))
                y_batch_tr = Variable(torch.from_numpy(t1[idx]).long())
                optimizer.zero_grad()
                output = model(X_batch_tr.unsqueeze(1).float())
            else:
                X_batch_tr = X_train[idx,:,:,:]
                y_batch_tr = Variable(torch.from_numpy(t1[idx]).long())
                optimizer.zero_grad()
                output = model(X_batch_tr.float())

            batch_loss = criterion(output, y_batch_tr)
            train_loss.append(batch_loss.data.numpy())

            batch_loss.backward()
            optimizer.step()

            preds = np.argmax(output.data.numpy(), axis=-1)
            correct += np.sum(y_batch_tr.data.numpy() == preds)

        train_acc = correct / float(num_samples)
        train_cost.append(np.mean(train_loss))

        correct2 = 0
        model.eval()
        wrong_guesses = []
        wrong_predictions = []
        all_predictions = []
        for i in range(num_test_batches):
            if i % 10 == 0:
                print("\n {}, now validation...".format(i), end='')
            idx = range(i * batch_size, np.minimum((i + 1) * batch_size, num_test_samples))
            index = idx[-1] - idx[0] + 1
            if preprocessed==False:
                batch_image = np.zeros((index,224,224))
                for j in range(index):
                    image_resized = resize(X_valid[idx[j]], (224, 224), anti_aliasing=True)
                    batch_image[j,:,:] = image_resized
                X_batch_v = Variable(torch.from_numpy(batch_image))
                y_batch_v = Variable(torch.from_numpy(t2[idx]).long())
                output = model(X_batch_v.unsqueeze(1).float())
            else:
                X_batch_v = X_valid[idx,:,:,:]
                y_batch_v = Variable(torch.from_numpy(t2[idx]).long())
                output = model(X_batch_v.float())

            batch_loss = criterion(output, y_batch_v)

            val_loss.append(batch_loss.data.numpy())
            preds = np.argmax(output.data.numpy(), axis=-1)
            eval_preds = y_batch_v.data.numpy() == preds
            for k in range(index):
                if eval_preds[k] == False:
                    wrong_guesses.append(w_id[idx[k]])
                    wrong_predictions.append(preds[k])
                else:
                    correct2 += 1
                all_predictions.append(preds[k])

        val_acc = correct2 / float(num_test_samples)
        val_cost.append(np.mean(val_loss))

        if epoch % 10 == 0:
            print("\n Epoch %2i : Train Loss %f , Train acc %f, Valid acc %f" % (
                epoch + 1, train_cost[-1], train_acc, val_acc))

    return train_acc,train_cost,val_acc,val_cost, wrong_guesses, wrong_predictions, all_predictions, model
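# test_CNN above reads `optimizer` and `criterion` from the enclosing scope rather than
# taking them as arguments. A minimal sketch of the setup it expects (assumed names, not
# part of the original snippet); the toy CNN only illustrates the expected shapes
# (1-channel 224x224 input, two classes).
import torch.nn as nn
import torch.optim as optim

model = nn.Sequential(
    nn.Conv2d(1, 8, kernel_size=3, padding=1), nn.ReLU(), nn.MaxPool2d(2),
    nn.Flatten(),
    nn.Linear(8 * 112 * 112, 2))
criterion = nn.CrossEntropyLoss()                # matches the integer labels from LabelEncoder
optimizer = optim.Adam(model.parameters(), lr=1e-4)
# train_acc, train_cost, val_acc, val_cost, wrong, wrong_preds, all_preds, model = \
#     test_CNN(model, X_train, y_train, X_valid, y_valid, w_id, batch_size=16, num_epochs=10)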
Пример #26
0
    def get_df(self):
        if self.name == 'adult':
            header = [
                'age', 'workclass', 'fnlwgt', 'education', 'education-num',
                'marital-status', 'occupation', 'relationship', 'race', 'sex',
                'capital-gain', 'capital-loss', 'hours-per-week',
                'native-country', 'income'
            ]
            df = pd.read_csv(self.file, names=header)
            df = df[df['occupation'] != ' ?']
            df = df.reset_index()
            df['income'] = (df['income'] == ' >50K')
            col_action = {
                'age': 'num',
                'workclass': 'ohe',
                'fnlwgt': 'del',
                'education': 'ohe',
                'education-num': 'num',
                'marital-status': 'ohe',
                'occupation': 'se',
                'relationship': 'ohe',
                'race': 'ohe',
                'sex': 'ohe',
                'capital-gain': 'num',
                'capital-loss': 'num',
                'hours-per-week': 'num',
                'native-country': 'ohe',
                'income': 'y'
            }
            self.clf_type = 'binary_clf'

        if self.name == 'beer_reviews':
            df = pd.read_csv(self.file)
            df.shape
            df = df.dropna(axis=0, how='any')

            # print_unique_values(df)
            col_action = {
                'brewery_id': 'del',
                'brewery_name': 'del',
                'review_time': 'del',
                'review_overall': 'del',
                'review_aroma': 'num',
                'review_appearance': 'num',
                'review_profilename': 'del',
                'beer_style': 'y',
                'review_palate': 'num',
                'review_taste': 'num',
                'beer_name': 'se',
                'beer_abv': 'del',
                'beer_beerid': 'del'
            }
            self.clf_type = 'multiclass_clf'

        if self.name == 'midwest_survey':
            df = pd.read_csv(self.file)
            # print_unique_values(df)
            col_action = {
                'RespondentID':
                'del',
                'In your own words, what would you call the part ' + 'of the country you live in now?':
                'se',
                'Personally identification as a Midwesterner?':
                'ohe',
                'Illinois in MW?':
                'ohe-1',
                'Indiana in MW?':
                'ohe-1',
                'Iowa in MW?':
                'ohe-1',
                'Kansas in MW?':
                'ohe-1',
                'Michigan in MW?':
                'ohe-1',
                'Minnesota in MW?':
                'ohe-1',
                'Missouri in MW?':
                'ohe-1',
                'Nebraska in MW?':
                'ohe-1',
                'North Dakota in MW?':
                'ohe-1',
                'Ohio in MW?':
                'ohe-1',
                'South Dakota in MW?':
                'ohe-1',
                'Wisconsin in MW?':
                'ohe-1',
                'Arkansas in MW?':
                'ohe-1',
                'Colorado in MW?':
                'ohe-1',
                'Kentucky in MW?':
                'ohe-1',
                'Oklahoma in MW?':
                'ohe-1',
                'Pennsylvania in MW?':
                'ohe-1',
                'West Virginia in MW?':
                'ohe-1',
                'Montana in MW?':
                'ohe-1',
                'Wyoming in MW?':
                'ohe-1',
                'ZIP Code':
                'del',
                'Gender':
                'ohe',
                'Age':
                'ohe',
                'Household Income':
                'ohe',
                'Education':
                'ohe',
                'Location (Census Region)':
                'y'
            }

            le = preprocessing.LabelEncoder()
            ycol = [col for col in col_action if col_action[col] == 'y']
            df[ycol] = le.fit_transform(df[ycol[0]].astype(str))
            self.clf_type = 'multiclass_clf'

        if self.name == 'indultos_espana':
            df = pd.read_csv(self.file)
            col_action = {
                'Fecha BOE': 'del',
                'Ministerio': 'ohe-1',
                'Ministro': 'ohe',
                'Partido en el Gobierno': 'ohe-1',
                'Género': 'ohe-1',
                'Tribunal': 'ohe',
                'Región': 'ohe',
                'Fecha Condena': 'del',
                'Rol en el delito': 'se',
                'Delito': 'se',
                'Año Inicio Delito': 'num',
                'Año Fin Delito': 'num',
                'Tipo de Indulto': 'y',
                'Fecha Indulto': 'del',
                'Categoría Cod.Penal': 'se',
                'Subcategoría Cod.Penal': 'se',
                'Fecha BOE.año': 'num',
                'Fecha BOE.mes': 'num',
                'Fecha BOE.día del mes': 'num',
                'Fecha BOE.día de la semana': 'num',
                'Fecha Condena.año': 'num',
                'Fecha Condena.mes': 'num',
                'Fecha Condena.día del mes': 'num',
                'Fecha Condena.día de la semana': 'num',
                'Fecha Indulto.año': 'num',
                'Fecha Indulto.mes': 'num',
                'Fecha Indulto.día del mes': 'num',
                'Fecha Indulto.día de la semana': 'num'
            }
            df['Tipo de Indulto'] = (df['Tipo de Indulto'] == 'indultar')
            self.clf_type = 'binary_clf'

        if self.name == 'docs_payments':
            # Variable names in Dollars for Docs dataset ######################
            pi_specialty = ['Physician_Specialty']
            drug_nm = ['Name_of_Associated_Covered_Drug_or_Biological1']
            #    'Name_of_Associated_Covered_Drug_or_Biological2',
            #    'Name_of_Associated_Covered_Drug_or_Biological3',
            #    'Name_of_Associated_Covered_Drug_or_Biological4',
            #    'Name_of_Associated_Covered_Drug_or_Biological5']
            dev_nm = ['Name_of_Associated_Covered_Device_or_Medical_Supply1']
            #  'Name_of_Associated_Covered_Device_or_Medical_Supply2',
            #  'Name_of_Associated_Covered_Device_or_Medical_Supply3',
            #  'Name_of_Associated_Covered_Device_or_Medical_Supply4',
            #  'Name_of_Associated_Covered_Device_or_Medical_Supply5']
            corp = [
                'Applicable_Manufacturer_or_Applicable_GPO_Making_' +
                'Payment_Name'
            ]
            amount = ['Total_Amount_of_Payment_USDollars']
            dispute = ['Dispute_Status_for_Publication']
            ###################################################################

            if os.path.exists(self.file):
                df = pd.read_hdf(self.file)
                # print('Loading DataFrame from:\n\t%s' % self.file)
            else:
                hdf_files = glob.glob(os.path.join(self.path, 'hdf', '*.h5'))
                hdf_files_ = []
                for file_ in hdf_files:
                    if 'RSRCH_PGYR2013' in file_:
                        hdf_files_.append(file_)
                    if 'GNRL_PGYR2013' in file_:
                        hdf_files_.append(file_)

                dfd_cols = pi_specialty + drug_nm + dev_nm + corp + amount + dispute
                df_dfd = pd.DataFrame(columns=dfd_cols)
                for hdf_file in hdf_files_:
                    if 'RSRCH' in hdf_file:
                        with pd.HDFStore(hdf_file) as hdf:
                            for key in hdf.keys():
                                df = pd.read_hdf(hdf_file, key)
                                df = df[dfd_cols]
                                df['status'] = 'allowed'
                                df = df.drop_duplicates(keep='first')
                                df_dfd = pd.concat([df_dfd, df],
                                                   ignore_index=True)
                                print('size: %d, %d' % tuple(df_dfd.shape))
                unique_vals = {}
                for col in df_dfd.columns:
                    unique_vals[col] = set(list(df_dfd[col].unique()))

                for hdf_file in hdf_files_:
                    if 'GNRL' in hdf_file:
                        with pd.HDFStore(hdf_file) as hdf:
                            for key in hdf.keys():
                                df = pd.read_hdf(hdf_file, key)
                                df = df[dfd_cols]
                                df['status'] = 'disallowed'
                                df = df.drop_duplicates(keep='first')
                                # remove all value thats are not in RSRCH
                                # for col in pi_specialty+drug_nm+dev_nm+corp:
                                #     print(col)
                                #     s1 = set(list(df[col].unique()))
                                #     s2 = unique_vals[col]
                                #     df = df.set_index(col).drop(labels=s1-s2)
                                #            .reset_index()
                                df_dfd = pd.concat([df_dfd, df],
                                                   ignore_index=True)
                                print('size: %d, %d' % tuple(df_dfd.shape))
                df_dfd = df_dfd.drop_duplicates(keep='first')
                df_dfd.to_hdf(self.file, 't1')
                df = df_dfd
            df['status'] = (df['status'] == 'allowed')
            # print_unique_values(df)
            col_action = {
                pi_specialty[0]: 'del',
                drug_nm[0]: 'del',
                dev_nm[0]: 'del',
                corp[0]: 'se',
                amount[0]: 'num',
                dispute[0]: 'ohe-1',
                'status': 'y'
            }
            self.clf_type = 'binary_clf'

        if self.name == 'medical_charge':
            df = pd.read_csv(self.file)
            # print_unique_values(df)
            col_action = {
                'State': 'ohe',
                'Total population': 'del',
                'Median age': 'del',
                '% BachelorsDeg or higher': 'del',
                'Unemployment rate': 'del',
                'Per capita income': 'del',
                'Total households': 'del',
                'Average household size': 'del',
                '% Owner occupied housing': 'del',
                '% Renter occupied housing': 'del',
                '% Vacant housing': 'del',
                'Median home value': 'del',
                'Population growth 2010 to 2015 annual': 'del',
                'House hold growth 2010 to 2015 annual': 'del',
                'Per capita income growth 2010 to 2015 annual': 'del',
                '2012 state winner': 'del',
                'Medical procedure': 'se',
                'Total Discharges': 'del',
                'Average Covered Charges': 'num',
                'Average Total Payments': 'y'
            }
            self.clf_type = 'regression'  # opts: 'regression',
            # 'binary_clf', 'multiclass_clf'

        if self.name == 'road_safety':
            files = self.file
            for filename in files:
                if filename.split('/')[-1] == '2015_Make_Model.csv':
                    df_mod = pd.read_csv(filename)
                    df_mod['Vehicle_Reference'] = (
                        df_mod['Vehicle_Reference'].map(str))
                    df_mod['Vehicle_Index'] = (df_mod['Accident_Index'] +
                                               df_mod['Vehicle_Reference'])
                    df_mod = df_mod.set_index('Vehicle_Index')
                    df_mod = df_mod.dropna(axis=0, how='any', subset=['make'])
            for filename in files:
                if filename.split('/')[-1] == 'Accidents_2015.csv':
                    df_acc = pd.read_csv(filename).set_index('Accident_Index')
            for filename in files:
                if filename.split('/')[-1] == 'Vehicles_2015.csv':
                    df_veh = pd.read_csv(filename)
                    df_veh['Vehicle_Reference'] = (
                        df_veh['Vehicle_Reference'].map(str))
                    df_veh['Vehicle_Index'] = (df_veh['Accident_Index'] +
                                               df_veh['Vehicle_Reference'])
                    df_veh = df_veh.set_index('Vehicle_Index')
            for filename in files:
                if filename.split('/')[-1] == 'Casualties_2015.csv':
                    df_cas = pd.read_csv(filename)
                    df_cas['Vehicle_Reference'] = (
                        df_cas['Vehicle_Reference'].map(str))
                    df_cas['Vehicle_Index'] = (df_cas['Accident_Index'] +
                                               df_cas['Vehicle_Reference'])
                    df_cas = df_cas.set_index('Vehicle_Index')

            df = df_cas.join(df_mod,
                             how='left',
                             lsuffix='_cas',
                             rsuffix='_model')
            df = df.dropna(axis=0, how='any', subset=['make'])
            df = df[df['Sex_of_Driver'] != 3]
            df = df[df['Sex_of_Driver'] != -1]
            df['Sex_of_Driver'] = df['Sex_of_Driver'] - 1
            # print_unique_values(df)
            # col_action = {'Casualty_Severity': 'y',
            #               'Casualty_Class': 'num',
            #               'make': 'ohe',
            #               'model': 'se'}
            col_action = {'Sex_of_Driver': 'y', 'model': 'se', 'make': 'ohe'}
            df = df.dropna(axis=0, how='any', subset=list(col_action.keys()))
            self.clf_type = 'binary_clf'  # opts: 'regression',
            # 'binary_clf', 'multiclass_clf'
            self.file = self.file[0]

        if self.name == 'consumer_complaints':
            df = pd.read_csv(self.file)
            # print_unique_values(df)
            col_action = {
                'Date received': 'del',
                'Product': 'ohe',
                'Sub-product': 'ohe',
                'Issue': 'ohe',
                'Sub-issue': 'ohe',
                'Consumer complaint narrative': 'se',  # too long
                'Company public response': 'ohe',
                'Company': 'se',
                'State': 'del',
                'ZIP code': 'del',
                'Tags': 'del',
                'Consumer consent provided?': 'del',
                'Submitted via': 'ohe',
                'Date sent to company': 'del',
                'Company response to consumer': 'ohe',
                'Timely response?': 'ohe-1',
                'Consumer disputed?': 'y',
                'Complaint ID': 'del'
            }
            for col in col_action:
                if col_action[col] in ['ohe', 'se']:
                    df = df.fillna(value={col: 'nan'})
            df = df.dropna(axis=0, how='any', subset=['Consumer disputed?'])
            df.loc[:,
                   'Consumer disputed?'] = (df['Consumer disputed?'] == 'Yes')
            self.clf_type = 'binary_clf'  # opts: 'regression',
            # 'binary_clf', 'multiclass_clf'

        if self.name == 'traffic_violations':
            df = pd.read_csv(self.file)
            # print_unique_values(df)
            col_action = {
                'Date Of Stop': 'del',
                'Time Of Stop': 'del',
                'Agency': 'del',
                'SubAgency': 'del',  # 'ohe'
                'Description': 'se',
                'Location': 'del',
                'Latitude': 'del',
                'Longitude': 'del',
                'Accident': 'del',
                'Belts': 'ohe-1',
                'Personal Injury': 'del',
                'Property Damage': 'ohe-1',
                'Fatal': 'ohe-1',
                'Commercial License': 'ohe-1',
                'HAZMAT': 'ohe',
                'Commercial Vehicle': 'ohe-1',
                'Alcohol': 'ohe-1',
                'Work Zone': 'ohe-1',
                'State': 'del',  #
                'VehicleType': 'del',  # 'ohe'
                'Year': 'num',
                'Make': 'del',
                'Model': 'del',
                'Color': 'del',
                'Violation Type': 'y',
                'Charge': 'del',  # 'y'
                'Article': 'del',  # 'y'
                'Contributed To Accident': 'del',  # 'y'
                'Race': 'ohe',
                'Gender': 'ohe',
                'Driver City': 'del',
                'Driver State': 'del',
                'DL State': 'del',
                'Arrest Type': 'ohe',
                'Geolocation': 'del'
            }
            for col in col_action:
                if col_action[col] in ['ohe', 'se']:
                    df = df.fillna(value={col: 'nan'})
            self.clf_type = 'multiclass_clf'  # opts: 'regression',
            # 'binary_clf', 'multiclass_clf'

        if self.name == 'crime_data':
            df = pd.read_csv(self.file)
            # print_unique_values(df)
            col_action = {
                'DR Number': 'del',
                'Date Reported': 'del',
                'Date Occurred': 'del',
                'Time Occurred': 'del',
                'Area ID': 'del',
                'Area Name': 'del',
                'Reporting District': 'del',
                'Crime Code': 'del',
                'Crime Code Description': 'y',
                'MO Codes': 'del',  # 'se'
                'Victim Age': 'num',
                'Victim Sex': 'ohe',
                'Victim Descent': 'ohe',
                'Premise Code': 'del',
                'Premise Description': 'ohe',
                'Weapon Used Code': 'del',
                'Weapon Description': 'ohe',
                'Status Code': 'del',
                'Status Description': 'del',
                'Crime Code 1': 'del',
                'Crime Code 2': 'del',
                'Crime Code 3': 'del',
                'Crime Code 4': 'del',
                'Address': 'del',
                'Cross Street': 'se',  # 'se'
                'Location ': 'del'
            }
            for col in col_action:
                if col_action[col] in ['ohe', 'se']:
                    df = df.fillna(value={col: 'nan'})
            self.clf_type = 'multiclass_clf'  # opts: 'regression',
            # 'binary_clf', 'multiclass_clf'

        if self.name == 'employee_salaries':
            df = pd.read_csv(self.file)
            col_action = {
                'Full Name': 'del',
                'Gender': 'ohe',
                'Current Annual Salary': 'y',
                '2016 Gross Pay Received': 'del',
                '2016 Overtime Pay': 'del',
                'Department': 'del',
                'Department Name': 'ohe',
                'Division': 'ohe',  # 'se'
                'Assignment Category': 'ohe-1',
                'Employee Position Title': 'se',
                'Underfilled Job Title': 'del',
                'Date First Hired': 'num'
            }
            df['Current Annual Salary'] = [
                float(s[1:]) for s in df['Current Annual Salary']
            ]
            df['Date First Hired'] = [
                datetime.datetime.strptime(d, '%m/%d/%Y').year
                for d in df['Date First Hired']
            ]
            for col in col_action:
                if col_action[col] in ['ohe', 'se']:
                    df = df.fillna(value={col: 'nan'})
            self.clf_type = 'regression'  # opts: 'regression',
            # 'binary_clf', 'multiclass_clf'

        # add here info about the dataset #####################################
        if self.name == 'new_dataset':
            df = pd.read_csv(self.file)
            col_action = {}
            for col in col_action:
                if col_action[col] in ['ohe', 'se']:
                    df = df.fillna(value={col: 'nan'})
            self.clf_type = 'multiclass_clf'  # opts: 'regression',
            # 'binary_clf', 'multiclass_clf'
        #######################################################################

        self.df = df
        self.col_action = {
            k: col_action[k]
            for k in col_action if col_action[k] != 'del'
        }  # works, but not consistent with the rest of the code; arguably this filtering belongs in self.preprocess
        return self
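# get_df only tags each column with an action code; the actual encoding happens elsewhere
# (self.preprocess, not shown). A rough sketch of how the codes could be mapped onto an
# sklearn ColumnTransformer: 'num' -> passthrough, 'ohe'/'ohe-1' -> one-hot, 'y' -> target,
# 'del' -> dropped. 'se' marks the high-cardinality string column that gets a dedicated
# encoder in the original experiments; a one-hot encoder is used below only as a placeholder.
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder


def build_transformer(col_action):
    num_cols = [c for c, a in col_action.items() if a == 'num']
    ohe_cols = [c for c, a in col_action.items() if a in ('ohe', 'ohe-1')]
    se_cols = [c for c, a in col_action.items() if a == 'se']
    return ColumnTransformer(
        [('num', 'passthrough', num_cols),
         ('ohe', OneHotEncoder(handle_unknown='ignore'), ohe_cols),
         ('se', OneHotEncoder(handle_unknown='ignore'), se_cols)],  # placeholder for the special encoder
        remainder='drop')

# e.g. build_transformer(self.col_action).fit_transform(df)  -- the 'y' column is dropped by remainder='drop'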
Пример #27
0
# KNN Classification
from pandas import read_csv
from sklearn import preprocessing
from sklearn.neighbors import KNeighborsClassifier
filename = '../../datasets/iris_classification_train.csv'
names = [
    'sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'flower_name'
]
df = read_csv(filename, names=names)
# label_encoder object knows how to understand word labels.
label_encoder = preprocessing.LabelEncoder()
df['flower_name'] = label_encoder.fit_transform(df['flower_name'])
df['flower_name'].unique()
array = df.values
inputx = array[:, 0:4]
outputy = array[:, 4]
model = KNeighborsClassifier()
print(model.fit(inputx, outputy))
filename = '../../datasets/iris_classification_test.csv'
names = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
newdataframe = read_csv(filename, names=names)
array = newdataframe.values
z = array[:, 0:4]
print("\n", newdataframe, "\n")
res = model.predict(z)
print(res, "\n")
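# The predictions above are the encoded integer labels; the fitted label_encoder can
# map them back to flower names (only names already defined in this snippet are used):
print(label_encoder.inverse_transform(res.astype(int)))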
Пример #28
0
def encode_text_index(df, name):
    le = preprocessing.LabelEncoder()
    df[name] = le.fit_transform(df[name])
    return le.classes_
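# A quick usage sketch of encode_text_index on a toy frame (hypothetical data, just to
# show that the column is replaced in place and the original class names are returned):
from sklearn import preprocessing  # needed by encode_text_index above
import pandas as pd

toy = pd.DataFrame({'color': ['red', 'green', 'red', 'blue']})
classes = encode_text_index(toy, 'color')
print(toy['color'].tolist())   # [2, 1, 2, 0]
print(list(classes))           # ['blue', 'green', 'red']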

Пример #29
0
import os
import numpy as np
import pandas as pd
from pandas import read_csv
import sklearn
from sklearn import linear_model
from sklearn.utils import shuffle
from sklearn import preprocessing

data = pd.read_csv("Placement.csv")

data = data[["status", "mba_p","etest_p", "specialisation","gender", "ssc_p", "ssc_b", "hsc_p", "hsc_b", "hsc_s", "degree_p", "degree_t", "workex"]]

le = preprocessing.LabelEncoder()
data.gender = le.fit_transform(list(data["gender"]))
data.ssc_b = le.fit_transform(list(data["ssc_b"]))
data.hsc_b = le.fit_transform(list(data["hsc_b"]))
data.hsc_s = le.fit_transform(list(data["hsc_s"]))
data.degree_t = le.fit_transform(list(data["degree_t"]))
data.workex = le.fit_transform(list(data["workex"]))
data.specialisation = le.fit_transform(list(data["specialisation"]))
data.status = le.fit_transform(list(data["status"]))

predict = "status"
print (data.head())


X = np.array(data.drop([predict], axis=1))
print(X)
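# The snippet stops after building X; a plausible continuation (a sketch, not the
# original code) would build y from the 'status' column and fit the linear_model that
# was imported above. Note that the single `le` is re-fitted for every column, so it
# finally only remembers the 'status' classes.
from sklearn.model_selection import train_test_split

y = np.array(data[predict])
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=0)
clf = linear_model.LogisticRegression(max_iter=1000)
clf.fit(x_train, y_train)
print(clf.score(x_test, y_test))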
Пример #30
0
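# model_selection below references AlwaysFunctionalClassifier, a baseline that is not
# defined in this snippet. A minimal sketch of what such a "dumb" baseline presumably
# looks like (assumption: it always predicts the same class, here labelled 'functional'
# as in the status_group target):
import numpy as np
from sklearn.base import BaseEstimator, ClassifierMixin


class AlwaysFunctionalClassifier(BaseEstimator, ClassifierMixin):
    def fit(self, X, y=None):
        return self

    def predict(self, X):
        # Predict the same class for every sample.
        return np.array(['functional'] * len(X))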
def model_selection(X_train, X_test, df_labels):
    y_train = df_labels.status_group.values

    # Compare models without optimization
    models = {
        "Dumb Model":
        AlwaysFunctionalClassifier(),
        "SGD Classifier":
        SGDClassifier(),
        "Random Forests":
        RandomForestClassifier(),
        "k-Nearest Neighbors":
        KNeighborsClassifier(),
        "Softmax Regression":
        LogisticRegression(multi_class="multinomial", solver="lbfgs"),
        "SVM":
        SVC(decision_function_shape="ovr"),
        "Decission Trees":
        DecisionTreeClassifier(),
        "AdaBoost":
        AdaBoostClassifier(algorithm="SAMME.R"),
        "Gradient Boost":
        GradientBoostingClassifier()
    }

    results = []
    names = []

    for k, v in models.items():
        cv_scores = cross_val_score(estimator=v,
                                    X=X_train,
                                    y=y_train,
                                    cv=10,
                                    n_jobs=1,
                                    scoring='accuracy')

        results.append(cv_scores)
        names.append(k)

        print(k)
        print('CV accuracy: %.3f +/- %.3f' %
              (np.mean(cv_scores), np.std(cv_scores)))
        print('----------------')

    fig = plt.figure(figsize=(16, 12))
    fig.suptitle('Algorithm Comparison')
    ax = fig.add_subplot(111)
    plt.boxplot(results)
    ax.set_xticklabels(names)
    plt.show()

    # Let's try to optimize some of this models
    # Random Forests

    # Initial performance
    forest_clf = RandomForestClassifier()
    cross_val_score(forest_clf, X_train, y_train, cv=3, scoring="accuracy")

    # Random Forests Confusion Matrix
    y_train_pred = cross_val_predict(forest_clf, X_train, y_train, cv=3)
    conf_mx = confusion_matrix(y_train, y_train_pred)

    fig, ax = plt.subplots(figsize=(8, 8))
    ax.matshow(conf_mx, cmap=plt.cm.Blues, alpha=0.3)
    for i in range(conf_mx.shape[0]):
        for j in range(conf_mx.shape[1]):
            perc = str(round((conf_mx[i, j] / conf_mx.sum()) * 100, 2)) + "%"
            ax.text(x=j,
                    y=i,
                    s=str(conf_mx[i, j]) + "\n\n" + perc,
                    va='center',
                    ha='center')

    plt.xlabel('predicted label')
    plt.ylabel('true label')

    plt.tight_layout()
    plt.show()

    param_grid = [{
        'max_depth': [30, 60],
        'n_estimators': [80, 300],
        'max_features': [5, 10],
        'min_samples_leaf': [1, 10],
        'n_jobs': [-1]
    }]

    grid_search_rf = GridSearchCV(forest_clf,
                                  param_grid,
                                  cv=3,
                                  scoring='accuracy',
                                  verbose=2,
                                  n_jobs=-1)
    grid_search_rf.fit(X_train, y_train)

    cvres = grid_search_rf.cv_results_
    for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
        print(mean_score, params)

    print(grid_search_rf.best_params_)

    cv_results = cross_validate(RandomForestClassifier(**grid_search_rf.best_params_), \
                                X_train, y_train, cv = 3, scoring="accuracy")

    print(cv_results['test_score'].mean())

    # SGD Classifier
    # Initial performance
    sgd_clf = SGDClassifier()
    cross_val_score(sgd_clf, X_train, y_train, cv=3, scoring="accuracy")

    # SGD Confusion Matrix
    y_train_pred = cross_val_predict(sgd_clf, X_train, y_train, cv=3)
    conf_mx = confusion_matrix(y_train, y_train_pred)

    fig, ax = plt.subplots(figsize=(8, 8))
    ax.matshow(conf_mx, cmap=plt.cm.Blues, alpha=0.3)
    for i in range(conf_mx.shape[0]):
        for j in range(conf_mx.shape[1]):
            perc = str(round((conf_mx[i, j] / conf_mx.sum()) * 100, 2)) + "%"
            ax.text(x=j,
                    y=i,
                    s=str(conf_mx[i, j]) + "\n\n" + perc,
                    va='center',
                    ha='center')

    plt.xlabel('predicted label')
    plt.ylabel('true label')

    plt.tight_layout()
    plt.show()

    param_grid = [{
        'penalty': ['none', 'l2', 'l1', 'elasticnet'],
        'alpha': [0.00001, 0.0001, 0.001, 0.01],
        'loss': ['log'],
        'n_jobs': [-1]
    }]

    grid_search_sgd = GridSearchCV(sgd_clf,
                                   param_grid,
                                   cv=3,
                                   scoring='accuracy',
                                   verbose=2,
                                   n_jobs=-1)
    grid_search_sgd.fit(X_train, y_train)

    cvres = grid_search_sgd.cv_results_
    for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
        print(mean_score, params)

    print(grid_search_sgd.best_params_)

    cv_results = cross_validate(SGDClassifier(**grid_search_sgd.best_params_), \
                                X_train, y_train, cv = 3, scoring="accuracy")

    print(cv_results['test_score'].mean())

    # K Nearest Neighbors
    # Initial performance

    knn_clf = KNeighborsClassifier()
    cross_val_score(knn_clf, X_train, y_train, cv=3, scoring="accuracy")

    # KNN Confusion Matrix
    y_train_pred = cross_val_predict(knn_clf, X_train, y_train, cv=3)
    conf_mx = confusion_matrix(y_train, y_train_pred)

    fig, ax = plt.subplots(figsize=(8, 8))
    ax.matshow(conf_mx, cmap=plt.cm.Blues, alpha=0.3)
    for i in range(conf_mx.shape[0]):
        for j in range(conf_mx.shape[1]):
            perc = str(round((conf_mx[i, j] / conf_mx.sum()) * 100, 2)) + "%"
            ax.text(x=j,
                    y=i,
                    s=str(conf_mx[i, j]) + "\n\n" + perc,
                    va='center',
                    ha='center')

    plt.xlabel('predicted label')
    plt.ylabel('true label')

    plt.tight_layout()
    plt.show()

    param_grid = [{
        'n_neighbors': [3, 5, 10],
        'weights': ['uniform', 'distance'],
        'n_jobs': [-1]
    }]

    grid_search_knn = GridSearchCV(knn_clf,
                                   param_grid,
                                   cv=3,
                                   scoring='accuracy',
                                   verbose=2,
                                   n_jobs=-1)
    grid_search_knn.fit(X_train, y_train)

    cvres = grid_search_knn.cv_results_
    for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
        print(mean_score, params)

    print(grid_search_knn.best_params_)

    cv_results = cross_validate(KNeighborsClassifier(**grid_search_knn.best_params_), \
                                X_train, y_train, cv = 3, scoring="accuracy")

    print(cv_results['test_score'].mean())

    # Classification with XGBoost

    param_grid = [{
        'max_depth': [3, 10],
        'n_estimators': [80, 300],
        'learning_rate': [0.01, 0.1, 0.3]
    }]

    gbm = xgb.XGBClassifier()
    grid_search_xgb = GridSearchCV(gbm,
                                   param_grid,
                                   cv=3,
                                   scoring='accuracy',
                                   verbose=2,
                                   n_jobs=-1)
    grid_search_xgb.fit(X_train, y_train)

    cvres = grid_search_xgb.cv_results_
    for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
        print(mean_score, params)

    print(grid_search_xgb.best_params_)

    cv_results = cross_validate(xgb.XGBClassifier(**grid_search_xgb.best_params_), \
                                X_train, y_train, cv = 3, scoring="accuracy")

    print(cv_results['test_score'].mean())

    # Just a bit better than Random Forests, but the best so far nevertheless.

    # Ensembling
    # Let's put together all the models shown above to see if we get a better result.
    sgd_clf = SGDClassifier(**grid_search_sgd.best_params_)
    rnd_clf = RandomForestClassifier(**grid_search_rf.best_params_)
    knn_clf = KNeighborsClassifier(**grid_search_knn.best_params_)
    log_clf = LogisticRegression(multi_class="multinomial",
                                 solver="lbfgs",
                                 C=30,
                                 n_jobs=-1)
    # We'll skip SVM as they slow down too much the modelling times
    # svm_clf = SVC(C= 1, gamma= 0.1, decision_function_shape="ovr", n_jobs=-1)
    dtr_clf = DecisionTreeClassifier(max_depth=20, min_samples_split=10)
    ada_clf = AdaBoostClassifier(DecisionTreeClassifier(max_depth=5),
                                 n_estimators=200,
                                 algorithm="SAMME.R",
                                 learning_rate=0.5)
    gbrt_clf = GradientBoostingClassifier(max_depth=5,
                                          n_estimators=500,
                                          learning_rate=0.5)
    xgb_clf = xgb.XGBClassifier(**grid_search_xgb.best_params_)

    clfs = [
        sgd_clf, rnd_clf, knn_clf, log_clf, dtr_clf, ada_clf, gbrt_clf, xgb_clf
    ]

    voting_clf_ens_soft = VotingClassifier(estimators=[
        ('SGD Classifier', clfs[0]), ('Random Forests', clfs[1]),
        ('k-Nearest Neighbors', clfs[2]), ('Softmax Regression', clfs[3]),
        ('Decision Trees', clfs[4]), ('AdaBoost', clfs[5]),
        ('Gradient Boost', clfs[6]), ('XGBoost', clfs[7])
    ],
                                           voting='soft',
                                           n_jobs=-1)
    voting_clf_ens_soft.fit(X_train, y_train)

    cv_results = cross_validate(voting_clf_ens_soft,
                                X_train,
                                y_train,
                                cv=3,
                                scoring="accuracy")
    print(cv_results['test_score'].mean())

    # Although slower, it doesn't seem to be a better model than the optimized Random Forests alone. Could it be the soft voting? Let's see.
    voting_clf_ens_hard = VotingClassifier(estimators=[
        ('SGD Classifier', clfs[0]), ('Random Forests', clfs[1]),
        ('k-Nearest Neighbors', clfs[2]), ('Softmax Regression', clfs[3]),
        ('Decision Trees', clfs[4]), ('AdaBoost', clfs[5]),
        ('Gradient Boost', clfs[6]), ('XGBoost', clfs[7])
    ],
                                           voting='hard',
                                           n_jobs=-1)
    voting_clf_ens_hard.fit(X_train, y_train)

    cv_results = cross_validate(voting_clf_ens_hard,
                                X_train,
                                y_train,
                                cv=3,
                                scoring="accuracy")
    print(cv_results['test_score'].mean())
    # Doesn't change much.

    # Stacking
    # Let's create a new model that decides the final label in a new second layer, taking as input the results of all the previous models.
    print(X_train.shape)
    idx = np.random.permutation(len(X_train))  # create shuffle index

    ## split into three sets
    # training set
    Xtr = X_train[idx[:33000]]
    ytr = y_train[idx[:33000]]
    # validation set
    Xvl = X_train[idx[33000:46200]]
    yvl = y_train[idx[33000:46200]]
    # test set
    Xts = X_train[idx[46200:]]
    yts = y_train[idx[46200:]]

    print(Xtr.shape, Xvl.shape, Xts.shape)
    for i, clf in enumerate(clfs):
        clf.fit(Xtr, ytr)
        print("Fitted {}/{}".format(i + 1, len(clfs)))

    # run individual classifiers on val set
    yhat = {}
    for i, clf in enumerate(clfs):
        yhat[i] = clf.predict(Xvl)
        print("Predicted {}/{}".format(i + 1, len(clfs)))

    # create new training set from predictions
    # combine the predictions into vectors using a horizontal stacking
    Xblend = np.c_[[preds for preds in yhat.values()]].T

    #Transform labels into codes
    le = preprocessing.LabelEncoder()
    Xblend = le.fit_transform(Xblend.reshape(13200 * 8)).reshape(13200, 8)

    # train a random forest classifier on Xblend using yvl for target labels
    rf_blend = RandomForestClassifier(n_estimators=100, n_jobs=-1)
    rf_blend.fit(Xblend, yvl)

    cv_results = cross_validate(rf_blend,
                                Xblend,
                                yvl,
                                cv=3,
                                scoring="accuracy")
    print(cv_results['test_score'].mean())

    # Let's see how this behaves with an unseen dataset
    # run individual classifiers on test set
    yhatts = {}
    for i, clf in enumerate(clfs):
        yhatts[i] = clf.predict(Xts)
        print("Predicted {}/{}".format(i + 1, len(clfs)))

    Xblendts = np.c_[[preds for preds in yhatts.values()]].T

    Xblendts = le.transform(Xblendts.reshape(13200 * 8)).reshape(13200, 8)

    cv_results = cross_validate(rf_blend,
                                Xblendts,
                                yts,
                                cv=3,
                                scoring="accuracy")
    print(cv_results['test_score'].mean())

    # Finally, in this exercise, nothing beats Random Forests and XGBoost.

    # Ensembling RF and XGB
    rnd_clf = RandomForestClassifier(**grid_search_rf.best_params_)
    xgb_clf = xgb.XGBClassifier(**grid_search_xgb.best_params_)

    clfs = [rnd_clf, xgb_clf]
    voting_clf_ens_rfxgb = VotingClassifier(estimators=[('Random Forests',
                                                         clfs[0]),
                                                        ('XGBoost', clfs[1])],
                                            voting='soft',
                                            n_jobs=-1)
    voting_clf_ens_rfxgb.fit(X_train, y_train)

    cv_results = cross_validate(voting_clf_ens_rfxgb,
                                X_train,
                                y_train,
                                cv=3,
                                scoring="accuracy")
    print(cv_results['test_score'].mean())
    # This is the best result so far!

    # Stacking RF and XGB
    # We have to be especially careful here not to overfit the RF classifier.
    idx = np.random.permutation(len(X_train))  # create shuffle index

    ## split into three sets
    # training set
    Xtr = X_train[idx[:33000]]
    ytr = y_train[idx[:33000]]
    # validation set
    Xvl = X_train[idx[33000:46200]]
    yvl = y_train[idx[33000:46200]]
    # test set
    Xts = X_train[idx[46200:]]
    yts = y_train[idx[46200:]]

    print(Xtr.shape, Xvl.shape, Xts.shape)

    for i, clf in enumerate(clfs):
        clf.fit(Xtr, ytr)
        print("Fitted {}/{}".format(i + 1, len(clfs)))

    # run individual classifiers on val set
    yhat = {}
    for i, clf in enumerate(clfs):
        yhat[i] = clf.predict(Xvl)
        print("Predicted {}/{}".format(i + 1, len(clfs)))

    # create new training set from predictions
    # combine the predictions into vectors using a horizontal stacking
    Xblend = np.c_[[preds for preds in yhat.values()]].T

    #Transform labels into codes
    le = preprocessing.LabelEncoder()
    Xblend = le.fit_transform(Xblend.reshape(13200 * 2)).reshape(13200, 2)

    # train a random forest classifier on Xblend using yvl for target labels
    rf_blend = RandomForestClassifier(n_estimators=300, n_jobs=-1)
    rf_blend.fit(Xblend, yvl)

    cv_results = cross_validate(rf_blend,
                                Xblend,
                                yvl,
                                cv=3,
                                scoring="accuracy")
    print(cv_results['test_score'].mean())

    # Let's see how this behaves with an unseen dataset
    # run individual classifiers on test set
    yhatts = {}
    for i, clf in enumerate(clfs):
        yhatts[i] = clf.predict(Xts)
        print("Predicted {}/{}".format(i + 1, len(clfs)))

    Xblendts = np.c_[[preds for preds in yhatts.values()]].T

    Xblendts = le.transform(Xblendts.reshape(13200 * 2)).reshape(13200, 2)

    cv_results = cross_validate(rf_blend,
                                Xblendts,
                                yts,
                                cv=3,
                                scoring="accuracy")
    print(cv_results['test_score'].mean())

    # Finally, it seems that the best results were obtained with the RF and XGBoost ensemble. Let's use this model to make the final predictions and create the submission file.
    return voting_clf_ens_rfxgb
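# A sketch of the submission step hinted at in the closing comment (assumed names:
# X_train, X_test, df_labels and df_test_ids would come from preprocessing steps that
# are not shown here; 'id'/'status_group' follow the usual submission layout for this
# status_group target):
import pandas as pd

best_model = model_selection(X_train, X_test, df_labels)
predictions = best_model.predict(X_test)
submission = pd.DataFrame({'id': df_test_ids, 'status_group': predictions})
submission.to_csv('submission.csv', index=False)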