Example #1
def get_test(dim=128,maxlen=500,name='test.csv',events=None):
    X_train = pd.read_csv(path+name,
                    dtype={'device_id': str})
    X_train["app_lab"] = X_train["device_id"].map(events)
    X_train.fillna('0 ',inplace=True)
    x_train = X_train["app_lab"].values

    phone_brand_device_model = pd.read_csv(path+'phone_brand_device_model.csv',
                    dtype={'device_id': str})
    phone_brand_device_model.drop_duplicates('device_id', keep='first', inplace=True)

    phone_brand_le = LabelEncoder()
    phone_brand_device_model['phone_brand'] = phone_brand_le.fit_transform(phone_brand_device_model['phone_brand'])

    device_model_le = LabelEncoder()
    phone_brand_device_model['device_model'] = device_model_le.fit_transform(phone_brand_device_model['device_model'])


    X_train = pd.merge(X_train,phone_brand_device_model,how='left',on='device_id', left_index=True)
    X_train.fillna(0,inplace=True)
    phone_brand = X_train['phone_brand'].values
    device_model = X_train['device_model'].values

    x_train = [ x.split(' ') for x in  x_train]
    for i in range(len(x_train)):
        x_train[i] = [ np.int8(idx) for idx in x_train[i] if (idx!='nan' and idx!='')]

    x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
    x_train = [x_train,phone_brand,device_model]
    return x_train
Example #2
def load_data():
    train_list = []
    for line in open('../data/train_clean.json', 'r'):
        train_list.append(json.loads(line))
    train = pd.DataFrame(train_list)
    
    #train_work = train[names[-1]]
    test_list = []
    for line in open('../data/test_clean.json', 'r'):
        test_list.append(json.loads(line))
    test = pd.DataFrame(test_list)
    
    print('--- NLP on major, simply cut the first word')
    le = LabelEncoder()
    print(len(set(train['major'])))
    train['major'] = train['major'].apply(lambda x :  " ".join(jieba.cut(x, cut_all = False)).split()[0] if x is not None and len(" ".join(jieba.cut(x)).split()) > 0 else 'none')
    test['major']  = test['major'].apply(lambda x :  " ".join(jieba.cut(x,  cut_all = False)).split()[0] if x is not None  and len(" ".join(jieba.cut(x)).split()) > 0 else 'none')

    print(len(set(train['major'])))
    le.fit(list(train['major']) + list(test['major']))
    train['major'] = le.transform(train['major'])
    test['major'] = le.transform(test['major'])
 
    le = LabelEncoder()
    train['gender'] = le.fit_transform(train['gender'])
    names =  train.columns
    
    le = LabelEncoder()
    test['gender'] = le.fit_transform(test['gender'])
    del train['_id']
    del test['_id']
    train = train.fillna(0)
    test = test.fillna(0)
    #test['age'] = test['age'].apply(lambda x : int(x.replace(u'岁','').encode('ascii')))
    return train, test
Example #3
def label_encode_train_test_sets (train, test) :
	" Label encode 'supplier' and 'bracket_pricing' features for both train and test set "
	test_suppliers = np.sort(pd.unique(test.supplier.ravel()))
	print ("Test suppliers shape & elements: ", test_suppliers.shape, test_suppliers)
	train_suppliers = np.sort(pd.unique(train.supplier.ravel()))
	print ("Train suppliers shape & elements: ", train_suppliers.shape, train_suppliers)
	
	## Merge 'supplier' for both datasets first because we want encoding to be consistent across both
	# http://docs.scipy.org/doc/numpy/reference/generated/numpy.sort.html
	supplier_ids = []
	supplier_ids.extend(train_suppliers)
	supplier_ids.extend(test_suppliers)
	supplier_ids = np.sort(np.unique(supplier_ids))
	print ("Merged supplier_ids.shape: ", supplier_ids.shape)
	# print ("supplier_ids.elements: ", supplier_ids)

	## Perform label encoding fit on the merged array and then individually transform for train and test sets
	print ("Performing label encoding on supplier column...")
	label_e = LabelEncoder()
	label_e.fit(supplier_ids)
	train['supplier'] = label_e.transform(train['supplier'])
	test['supplier'] = label_e.transform(test['supplier'])

	## Perform label encoding on 'bracket_pricing'
	print ("Performing label encoding on bracket_pricing column...")
	train['bracket_pricing'] = label_e.fit_transform(train['bracket_pricing'])
	test['bracket_pricing'] = label_e.fit_transform(test['bracket_pricing'])

	return train, test
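A minimal standalone sketch (with hypothetical supplier codes) of the idea behind the merge above: fitting the encoder on the union of train and test categories keeps the integer codes consistent and avoids errors on suppliers that appear only in the test set.

import numpy as np
from sklearn.preprocessing import LabelEncoder

train_suppliers = np.array(['S-0041', 'S-0066', 'S-0072'])  # hypothetical values
test_suppliers = np.array(['S-0041', 'S-0104'])             # 'S-0104' never occurs in train

le = LabelEncoder()
le.fit(np.unique(np.concatenate([train_suppliers, test_suppliers])))
print(le.transform(train_suppliers))  # [0 1 2]
print(le.transform(test_suppliers))   # [0 3] -- no unseen-label error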
Example #4
def prepare_items_features(user_items_csv, out_dir):
    array = np.loadtxt(user_items_csv, delimiter='|',
            dtype=np.dtype(np.uint64))

    le = LabelEncoder()
    col1 = le.fit_transform(array[:, 1].T)
    col2 = le.fit_transform(array[:, 2].T)
    col3 = le.fit_transform(array[:, 3].T)
    col4 = le.fit_transform(array[:, 4].T)

    columns = np.array([col1, col2, col3, col4]).T
    enc = OneHotEncoder()
    print(array[:10])
    encoded = np.c_[array[:, 0], enc.fit_transform(columns).toarray()]
    print(encoded[:10])
    print(encoded.shape)

    user_id = encoded[0][0]
    rows = []
    current = np.zeros(encoded.shape[1]-1)
    for i in range(encoded.shape[0]):
        if encoded[i][0] != user_id:
            rows.append(np.concatenate([[user_id], current]))
            user_id = encoded[i][0]
            current = np.zeros(encoded.shape[1]-1)
        else:
            current = np.sum([current, encoded[i, 1:]], axis=0)
    rows.append(np.concatenate([[user_id], current]))

    array = np.array(rows)
    print(array.shape)

    # let's serialize array
    np.save(os.path.join(out_dir, "user_items"), array)
def process_raw_label():
    df = pd.DataFrame([
        ['green', 'M', 10.1, 'class1'],
        ['red', 'L', 13.5, 'class2'],
        ['blue', 'XL', 15.3, 'class1']
    ])
    df.columns = ['color', 'size', 'price', 'classlabel']
    print(df)
    size_mapping = {
        'XL': 3,
        'L': 2,
        'M': 1
    }
    df['size'] = df['size'].map(size_mapping)
    print(df)
    class_mapping = {label: idx for idx, label in enumerate(np.unique(df['classlabel']))}
    print(class_mapping)
    df['classlabel'] = df['classlabel'].map(class_mapping)
    print(df)
    # inv
    inv_class_mapping = {v: k for k, v in class_mapping.items()}
    df['classlabel'] = df['classlabel'].map(inv_class_mapping)
    print(df)
    class_le = LabelEncoder()
    y = class_le.fit_transform(df['classlabel'].values)
    print(y)
    x = df[['color', 'size', 'price']].values
    print(x)
    color_le = LabelEncoder()
    x[:, 0] = color_le.fit_transform(x[:, 0])
    print('label encoder\n', x)
    ohe = OneHotEncoder(categorical_features=[0], sparse=False)
    x = ohe.fit_transform(x)
    print(x)
    print(pd.get_dummies(df[['price', 'color', 'size']]))
    def train_test(self, X, y, X_test):
        """
        """
        le = LabelEncoder()
        id_123 = np.logical_or(np.logical_or(y==1, y==2), y==3)  
        y0 = np.zeros(len(y), dtype=np.int32)
        y0[id_123] = 1
        X0 = np.copy(X) 
        y0 = le.fit_transform(y0).astype(np.int32)
    
        X1 = X[id_123]
        y1 = y[id_123]
        y1 = le.fit_transform(y1).astype(np.int32)
    
        X2 = X[np.logical_not(id_123)]
        y2 = y[np.logical_not(id_123)]    
        y2 = le.fit_transform(y2).astype(np.int32)
        
        print('working on nn0...')
        self.nn0.max_epochs = self.early_stopping0.best_valid_epoch
        self.nn0.verbose=0
        self.nn0.fit(X0, y0)
        y0_pred = self.nn0.predict_proba(X_test)
        
        print('working on nn1...')
        self.nn1.max_epochs = self.early_stopping1.best_valid_epoch
        self.nn1.verbose=0
        self.nn1.fit(X1, y1)
        y1_pred = self.nn1.predict_proba(X_test)   
        
        print('working on nn2...')
        self.nn2.max_epochs = self.early_stopping2.best_valid_epoch
        self.nn2.verbose=0        
        self.nn2.fit(X2, y2)
        y2_pred = self.nn2.predict_proba(X_test)
           
        y_pred = np.zeros((y0_pred.shape[0], 9))
        y_pred[:,0] = y0_pred[:,0]*y2_pred[:,0]
        y_pred[:,1] = y0_pred[:,1]*y1_pred[:,0]
        y_pred[:,2] = y0_pred[:,1]*y1_pred[:,1]
        y_pred[:,3] = y0_pred[:,1]*y1_pred[:,2]
        y_pred[:,4] = y0_pred[:,0]*y2_pred[:,1]
        y_pred[:,5] = y0_pred[:,0]*y2_pred[:,2]
        y_pred[:,6] = y0_pred[:,0]*y2_pred[:,3]
        y_pred[:,7] = y0_pred[:,0]*y2_pred[:,4]
        y_pred[:,8] = y0_pred[:,0]*y2_pred[:,5]
        yp0 = y_pred
        
        self.cal_clf.fit(X, y)        
        yp1 = self.cal_clf.predict_proba(X_test)
        y_pred = (yp0 + yp1)/2.        
        
        return y_pred       
Example #7
File: kdc.py Project: qianFX/final_project
 def label_encoding(self, x: pd.DataFrame, y: pd.DataFrame, services: list) -> (pd.DataFrame, pd.DataFrame):
     le = LabelEncoder()
     le = le.fit(services)
     x['service'] = le.transform(x['service'])
     for feature in ["protocol_type", "flag"]:
         x[feature] = le.fit_transform(x[feature])
     y = le.fit_transform(y)
     print(le.classes_)
     return x, y
Example #8
def test_label_encoder_fit_transform():
    """Test fit_transform"""
    le = LabelEncoder()
    ret = le.fit_transform([1, 1, 4, 5, -1, 0])
    assert_array_equal(ret, [2, 2, 3, 4, 0, 1])

    le = LabelEncoder()
    ret = le.fit_transform(["paris", "paris", "tokyo", "amsterdam"])
    assert_array_equal(ret, [1, 1, 2, 0])
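As a reference point (not part of the test above), a short sketch showing that the codes returned by fit_transform follow the sorted order of classes_ and can be mapped back with inverse_transform:

from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
codes = le.fit_transform(["paris", "paris", "tokyo", "amsterdam"])
print(le.classes_)                  # ['amsterdam' 'paris' 'tokyo'] -- sorted unique labels
print(codes)                        # [1 1 2 0]
print(le.inverse_transform(codes))  # back to the original strings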
Example #9
def X_train_generatetor_infinite(dim=128,maxlen=500,batch_size=128,name="X_train.csv",events=None):
    X_train = pd.read_csv(path+name)
    group_le = LabelEncoder()
    group_lb = LabelBinarizer()
    labels = group_le.fit_transform(X_train['group'].values)
    labels = group_lb.fit_transform(labels)
    del labels
    
    ##################
    #   Phone Brand
    ##################
    # print("# Read Phone Brand")
    phone_brand_device_model = pd.read_csv(path+'phone_brand_device_model.csv',
                    dtype={'device_id': str})
    phone_brand_device_model.drop_duplicates('device_id', keep='first', inplace=True)
    phone_brand_le = LabelEncoder()
    phone_brand_device_model['phone_brand'] = phone_brand_le.fit_transform(phone_brand_device_model['phone_brand'])

    device_model_le = LabelEncoder()
    phone_brand_device_model['device_model'] = device_model_le.fit_transform(phone_brand_device_model['device_model'])


    while 1:
        data = pd.read_csv(path+name,iterator=True,chunksize=batch_size,
                    dtype={'device_id': str})
        for X_train in data:
            X_train = pd.merge(X_train,phone_brand_device_model,how='left',on='device_id', left_index=True)
            phone_brand = X_train['phone_brand'].values
            device_model = X_train['device_model'].values


            X_train["app_lab"] = X_train["device_id"].map(events)
            y_train = X_train['group'].values
            
            X_train['gender'][X_train['gender']=='M']=1
            X_train['gender'][X_train['gender']=='F']=0

            y_train_gender = X_train['gender'].values
            y_train_age = X_train['age'].values
            # take log transformation
            y_train_age = np.log(y_train_age)

            X_train.fillna('0 ',inplace=True)
            y_train = group_le.transform(y_train)
            y_train = group_lb.transform(y_train)
            x_train = X_train["app_lab"].values
            x_train = [ x.split(' ') for x in  x_train]
            for i in range(len(x_train)):
                x_train[i] = [ np.int8(idx) for idx in x_train[i] if (idx!='nan' and idx!='')]

            x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
            
            x_train = [x_train,phone_brand,device_model]
            y_train = [y_train,y_train_gender,y_train_age]

            yield (x_train,y_train)
def process_data(trainDF, testDF):
	# Drop the unused columns from train/test and concatenate the two sets
	trainDF.drop(['Descript', 'Resolution'], axis=1, inplace=True)
	testDF.drop(['Id'], axis=1, inplace=True)
	labels = trainDF['Category'].copy()
	y = trainDF['Category'].copy()	
	combi = pd.concat([trainDF.drop(['Category'], axis=1),  testDF])
	
	combi['Month'], combi['Day'], combi['Hour'] = zip(*combi['Dates'].apply(extract_time))
	combi.drop(['Dates'], axis=1, inplace=True)
	combi['intesect'] = combi['Address'].apply(lambda x: 1 if '/' in x else 0)
	combi['Wake'] = combi['Hour'].apply(lambda x: 1 if (int(x)>=8 and int(x)<=23) else 0)
	addresses = sorted(combi['Address'].unique())
	categories = sorted(trainDF['Category'].unique())
	addr_counts = combi.groupby('Address').size()
	cat_counts = trainDF.groupby('Category').size()
	addr_cat_counts = trainDF.groupby(['Address', 'Category']).size()
	# Use the counts-learning approach to extract features from the address and category information, see https://msdn.microsoft.com/en-us/library/azure/dn913056.aspx
	logoddsPA = {}
	logodds = {}
	PA = cat_counts/float(len(trainDF))
	default_logodds = np.log(PA/(1-PA))
	for addr in addresses:
		PA = addr_counts[addr]/float(len(combi))
		logoddsPA[addr] = np.log(PA/(1.0-PA))
		logodds[addr] = deepcopy(default_logodds)
		if addr in addr_cat_counts.keys():
			for cat in addr_cat_counts[addr].keys():
				if addr_cat_counts[addr][cat] >= 2 and addr_cat_counts[addr][cat] < addr_counts[addr]:
					PA = addr_cat_counts[addr][cat] / float(addr_counts[addr])
					logodds[addr][categories.index(cat)] = np.log(PA/(1.0-PA))
		logodds[addr] = pd.Series(logodds[addr])
		logodds[addr].index = range(len(categories))
	combi['LogoddsPA'] = combi['Address'].apply(lambda x: logoddsPA[x])
	logodds_features = combi['Address'].apply(lambda x: logodds[x])
	logodds_features.columns = ["logodds"+str(x) for x in range(len(categories))]
	combi_full = pd.concat([combi, logodds_features], axis=1)
	xy_scaler = StandardScaler()
	combi_full[['X', 'Y']] = xy_scaler.fit_transform(combi_full[['X', 'Y']])
	# Apply label encoding
	lbe = LabelEncoder()
	combi_full['DayOfWeek'] = lbe.fit_transform(combi_full['DayOfWeek'])
	combi_full['PdDistrict'] = lbe.fit_transform(combi_full['PdDistrict'])
	combi_full['Wake'] = combi_full['Hour'].apply(lambda x: 1 if (int(x)>=8 and int(x)<=23) else 0)
	combi_full["IsDup"]=pd.Series(combi_full.duplicated()|combi_full.duplicated(take_last=True)).apply(int)
	combi_full.drop(['Address'], axis=1, inplace=True)
	y = lbe.fit_transform(y)
	# Since xgboost is used, the features do not need dummy (one-hot) encoding
	#ohe = OneHotEncoder(categorical_features=[0, 1,4,5,6])
	#data = ohe.fit_transform(combi_full.values)
	train = combi_full.values[:878049, :]
	test = combi_full.values[878049:, :]

	return train, test, y, lbe.classes_
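A small worked sketch of the log-odds ('counts learning') feature built above, with hypothetical counts: for an address seen n_addr times, of which n_cat rows belong to a given category, the feature is log(p / (1 - p)) with p = n_cat / n_addr.

import numpy as np

n_addr, n_cat = 40, 10            # hypothetical counts for one address/category pair
p = n_cat / float(n_addr)         # 0.25
print(np.log(p / (1.0 - p)))      # log-odds of that category at that address, about -1.099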
 def execute(self,data):
     print('started label encoding step')
     le = LabelEncoder()
     output_array = le.fit_transform(data[self.column_list[0]])
     for i in range(1,len(self.column_list)):
         output_array=np.column_stack([output_array,le.fit_transform(data[self.column_list[i]])])
     otherCols = set(data.columns).difference(set(self.column_list))
     df1 = data[list(otherCols)]
     df2 = pd.DataFrame(output_array,columns=self.column_list)
     df1 = df1.join(df2,how='left')
     print('finished label encoding step')
     return df1
Example #12
def data_preprocess(df):
    dropLst = ['Unnamed: 0', 'STATION_NAME',
               'STATISTICAL_CODE_DESCRIPTION', 'CrimeCat']
    df['STREET'] = df['STREET'].apply(get_rid_num)
    df['ZIP'] = df['ZIP'].apply(int)
    le = LabelEncoder()
    df['STREET'] = le.fit_transform(df['STREET'])
    df['CITY'] = le.fit_transform(df['CITY'])
    feature_names = df.drop(dropLst, axis=1).columns
    X = df.drop(dropLst, axis=1).values
    y = df['CrimeCat'].values
    lb = LabelBinarizer()
    y = lb.fit_transform(y)
    return X, y, feature_names
Example #13
    def fit_transform(self, dframe):
        """
        Fit label encoder and return encoded labels.

        Access individual column classes via indexing
        `self.all_classes_`

        Access individual column encoders via indexing
        `self.all_encoders_`

        Access individual column encoded labels via indexing
        `self.all_labels_`
        """
        df = dframe.copy()
        # if columns are provided, iterate through and get `classes_`
        if self.columns is not None:
            # ndarray to hold LabelEncoder().classes_ for each
            # column; should match the shape of specified `columns`
            self.all_classes_ = np.ndarray(shape=self.columns.shape,
                                           dtype=object)
            self.all_encoders_ = np.ndarray(shape=self.columns.shape,
                                            dtype=object)
            self.all_labels_ = np.ndarray(shape=self.columns.shape,
                                          dtype=object)
            for idx, column in enumerate(self.columns):
                # instantiate LabelEncoder
                le = LabelEncoder()
                # fit and transform labels in the column
                df.loc[:, column] =\
                    le.fit_transform(df.loc[:, column].values)
                # append the `classes_` to our ndarray container
                self.all_classes_[idx] = (column,
                                          np.array(le.classes_.tolist(),
                                                  dtype=object))
                self.all_encoders_[idx] = le
                self.all_labels_[idx] = le
        else:
            # no columns specified; assume all are to be encoded
            self.columns = df.iloc[:, :].columns
            self.all_classes_ = np.ndarray(shape=self.columns.shape,
                                           dtype=object)
            # also allocate the per-column encoder container used in the loop below
            self.all_encoders_ = np.ndarray(shape=self.columns.shape,
                                            dtype=object)
            for idx, column in enumerate(self.columns):
                le = LabelEncoder()
                df.loc[:, column] = le.fit_transform(
                        df.loc[:, column].values)
                self.all_classes_[idx] = (column,
                                          np.array(le.classes_.tolist(),
                                                  dtype=object))
                self.all_encoders_[idx] = le
        return df
Example #14
    def transformTestData(self, train_data, test_data):
        #Select the right features for both training and testing data
        X_train, y_train = self.__selectRelevantFeatures(train_data)
        X_test, y_test = self.__selectRelevantFeatures(test_data)

        #Transform categorical variables into integer labels
        martial_le = LabelEncoder()
        occupation_le = LabelEncoder()
        relationship_le = LabelEncoder()
        race_le = LabelEncoder()
        sex_le = LabelEncoder()
        transformers = [martial_le, occupation_le, relationship_le, race_le, sex_le]

        for i in range(len(transformers)):
            X_train[:,i] = transformers[i].fit_transform(X_train[:,i])
            X_test[:,i] = transformers[i].transform(X_test[:,i])

        #Dummy code categorical variables
        dummy_code = OneHotEncoder(categorical_features = range(5))
        X_train = dummy_code.fit_transform(X_train).toarray()
        X_test = dummy_code.transform(X_test).toarray()

        #Normalize all features
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

        #Encode y
        class_le = LabelEncoder()
        y_train = class_le.fit_transform(y_train)
        y_test = class_le.transform(y_test)
        #print class_le.transform(["<=50K", ">50K"])

        return X_train, X_test, y_train, y_test
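A minimal standalone sketch (hypothetical values) of the fit/transform split used above: the encoders and the scaler learn their mapping on the training split only and are then reused unchanged on the test split.

import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler

train_col = np.array(['Male', 'Female', 'Male'])
test_col = np.array(['Female', 'Male'])

sex_le = LabelEncoder()
train_enc = sex_le.fit_transform(train_col)   # learn the label mapping on train
test_enc = sex_le.transform(test_col)         # reuse the same mapping on test

scaler = StandardScaler()
train_scaled = scaler.fit_transform(train_enc.reshape(-1, 1).astype(float))
test_scaled = scaler.transform(test_enc.reshape(-1, 1).astype(float))
print(train_scaled.ravel(), test_scaled.ravel())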
Example #15
def load_otto_group():
    """
    Loads and returns several variables for the data set from Kaggle's Otto Group Product Classification competition.
    Link: https://www.kaggle.com/c/otto-group-product-classification-challenge

    Returns
    ----------
    data : array-like
        Pandas data frame containing the entire data set.

    X : array-like
        Training input samples.

    y : array-like
        Target values.
    """
    file_location = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data', 'otto_group.zip')
    z = ZipFile(file_location)
    data = pd.read_csv(z.open('train.csv'))
    data = data.set_index('id')

    # move the label to the first position
    cols = data.columns.tolist()
    cols = cols[-1:] + cols[0:-1]
    data = data[cols]

    X = data.iloc[:, 1:].values

    y = data.iloc[:, 0].values

    # transform the labels from strings to integers
    encoder = LabelEncoder()
    y = encoder.fit_transform(y)

    return data, X, y
Example #16
def buildTreeClassifier(predictorColumns, structurestable = 'structures.csv',  targetcolumn = 'pointGroup', md = None):
    """
    Build a random forest classifier model to predict some structure feature from compositional data. Returns the model trained on all data, a confusion matrix, an average cross-validated accuracy score, and a label encoder object.
    """
    df = pd.read_csv(structurestable)
    df = df.dropna()
    if('fracNobleGas' in df.columns):
        df = df[df['fracNobleGas'] <= 0]
    
    s = StandardScaler()
    le = LabelEncoder()
    
    X = s.fit_transform(df[predictorColumns].astype('float64'))
    y = le.fit_transform(df[targetcolumn].values)

    rfc = RandomForestClassifier(max_depth = md)
    acc = mean(cross_val_score(rfc, X, y))

    X_train, X_test, y_train, y_test = train_test_split(X,y)
    rfc.fit(X_train,y_train)
    y_predict = rfc.predict(X_test)
    cm = confusion_matrix(y_test, y_predict)
    
    cm = pd.DataFrame(cm, columns=le.classes_, index=le.classes_)

    rfc.fit(X, y)

    return rfc, cm, round(acc,2), le
Example #17
def process_one_cell(df_train, df_test, grid_id, th):
    """
    Classification inside one grid cell.
    """
    # Working on df_train
    df_cell_train = df_train.loc[df_train.grid_cell == grid_id]
    place_counts = df_cell_train.place_id.value_counts()
    mask = (place_counts[df_cell_train.place_id.values] >= th).values
    df_cell_train = df_cell_train.loc[mask]

    # Working on df_test
    df_cell_test = df_test.loc[df_test.grid_cell == grid_id]
    row_ids = df_cell_test.index

    # Preparing data
    le = LabelEncoder()
    y = le.fit_transform(df_cell_train.place_id.values)
    X = df_cell_train.drop(['place_id', 'grid_cell'], axis=1).values.astype(int)
    X_test = df_cell_test.drop(['grid_cell'], axis=1).values.astype(int)

    # Applying the classifier
    clf = KNeighborsClassifier(n_neighbors=conf['neighbours'], weights='distance',
                               metric='manhattan')
    clf.fit(X, y)
    y_pred = clf.predict_proba(X_test)
    pred_labels = le.inverse_transform(np.argsort(y_pred, axis=1)[:, ::-1][:, :3])
    return pred_labels, row_ids
def process_one_cell(df_cell_train, df_cell_test):
    
    #Working on df_train
    place_counts = df_cell_train.place_id.value_counts()
    mask = (place_counts[df_cell_train.place_id.values] >= 8).values
    df_cell_train = df_cell_train.loc[mask]
    
    #Working on df_test
    row_ids = df_cell_test.index
    
    #Feature engineering on x and y
    df_cell_train.loc[:,'x'] *= 500.0
    df_cell_train.loc[:,'y'] *= 1000.0
    df_cell_test.loc[:,'x'] *= 500.0
    df_cell_test.loc[:,'y'] *= 1000.0
    
    #Preparing data
    le = LabelEncoder()
    y = le.fit_transform(df_cell_train.place_id.values)
    X = df_cell_train.drop(['place_id'], axis=1).values
    X_test = df_cell_test.values

    #Applying the classifier
    clf = KNeighborsClassifier(n_neighbors=36, weights=calculate_distance, 
                               metric='manhattan')
    clf.fit(X, y)
    y_pred = clf.predict_proba(X_test)
    pred_labels = le.inverse_transform(np.argsort(y_pred, axis=1)[:,::-1][:,:3]) 
    
    return pred_labels, row_ids
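The indexing np.argsort(y_pred, axis=1)[:, ::-1][:, :3] used in both process_one_cell variants above picks, per row, the column indices of the three largest predicted probabilities; a tiny numeric check with made-up probabilities:

import numpy as np

y_pred = np.array([[0.1, 0.6, 0.3],
                   [0.5, 0.2, 0.3]])
print(np.argsort(y_pred, axis=1)[:, ::-1][:, :3])  # [[1 2 0]
                                                   #  [0 2 1]]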
Example #19
def load_kernel_matrix(data_path='data', study='wl_kernel', verbose=True):
    """Loading already computed kernel matrix.
    Parameters:
    ---------
    data_path: string
        Path to the data folder.
    study: string
        Name of the folder containing the study, e.g. 'wl_kernel', which
        contains the WL kernel matrix.
    verbose: bool
    """
    path_k_matrix = os.path.join(data_path, 'precomputed_kernels',
                                 study, 'k_matrix.csv')
    path_cls = os.path.join(data_path, 'precomputed_kernels', study,
                            'class_labels.csv')

    K = np.loadtxt(path_k_matrix)
    y = np.loadtxt(path_cls)

    le = LabelEncoder()
    y = le.fit_transform(y)

    if verbose:
        print('n_samples: %s, n_samples_by_class: (%s - %s)' % (len(y),
                                                                 len(y[y == 0]),
                                                                 len(y[y == 1])))

    return K, y
def load_train_data(path):
    print("Loading Train Data")
    df = pd.read_csv(path)
    
    
    # Remove line below to run locally - Be careful you need more than 8GB RAM 
    rows = np.random.choice(df.index.values, 40000)
    df = df.ix[rows]
    # df = df.sample(n=40000)
    # df = df.loc[df.index]
    
    labels = df.target

    df = df.drop('target',1)
    df = df.drop('ID',1)
    
    # Junk cols - Some feature engineering needed here
    df = df.fillna(-1)

    X = df.values.copy()
    labels = labels.values.copy()

    # shuffle features and labels together so they stay aligned
    shuffle_idx = np.random.permutation(len(X))
    X = X[shuffle_idx]
    labels = labels[shuffle_idx]

    X = X.astype(np.float32)
    encoder = LabelEncoder()
    y = encoder.fit_transform(labels).astype(np.int32)
    scaler = StandardScaler()
    X = scaler.fit_transform(X)
    return X, y, encoder, scaler
Example #21
class Classifier(BaseEstimator):
    def __init__(self):
        self.label_encoder = LabelEncoder()
        self.scaler = StandardScaler()
        self.clf = None        
 
    def fit(self, X, y):        
        X = self.scaler.fit_transform(X.astype(np.float32))              
        y = self.label_encoder.fit_transform(y).astype(np.int32)
        dtrain = xgb.DMatrix( X, label=y.astype(np.float32))
        
        param = {'objective':'multi:softprob', 'eval_metric':'mlogloss'}
        param['nthread'] = 4
        param['num_class'] = 9
        param['colsample_bytree'] = 0.55
        param['subsample'] = 0.85
        param['gamma'] = 0.95
        param['min_child_weight'] = 3.0
        param['eta'] = 0.05
        param['max_depth'] = 12
        num_round = 400 # to be faster ??  
        #num_round = 820
        
        self.clf = xgb.train(param, dtrain, num_round)  
 
    def predict(self, X):
        X = self.scaler.transform(X.astype(np.float32))
        dtest = xgb.DMatrix(X)       
        label_index_array = np.argmax(self.clf.predict(dtest), axis=1)
        return self.label_encoder.inverse_transform(label_index_array)
 
    def predict_proba(self, X):
        X = self.scaler.transform(X.astype(np.float32))
        dtest = xgb.DMatrix(X)
        return self.clf.predict(dtest)
Example #22
def main(X_fname, Y_fname, result_fname=None): 
    le = LabelEncoder()
    moves = pandas.read_csv(Y_fname, index_col=0)
    Y = moves.values.ravel()
    Y = le.fit_transform(Y)
    X = io.mmread(X_fname)
    print(X.shape, Y.shape, len(le.classes_))

    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.33)

    xg_train = xgboost.DMatrix( X_train, label=y_train)
    xg_test = xgboost.DMatrix(X_test, label=y_test)

    param = {}
    # use softmax multi-class classification
    param['objective'] = 'multi:softprob'
    param['eta'] = 0.002
    param['max_depth'] = 7
    param['nthread'] = 7
    param['num_class'] = len(le.classes_)
    param['eval_metric'] = 'merror'
    evals = [ (xg_train, 'train'), (xg_test, 'eval') ]

    # Train xgboost
    print "Training"
    t1 = time.time()
    bst = xgboost.train(param, xg_train, 500, evals, early_stopping_rounds=3)
    t2 = time.time()
    print(t2 - t1)

    if result_fname is None:
        result_fname = str(datetime.now())

    bst.save_model("%s.bst"%result_fname)
def plot_model_decision_surface(clf, train_features, train_labels,
                                plot_step=0.02, cmap=plt.cm.RdYlBu,
                                markers=None, alphas=None, colors=None):
    
    if train_features.shape[1] != 2:
        raise ValueError("X_train should have exactly 2 columnns!")
    
    x_min, x_max = train_features[:, 0].min() - plot_step, train_features[:, 0].max() + plot_step
    y_min, y_max = train_features[:, 1].min() - plot_step, train_features[:, 1].max() + plot_step
    xx, yy = np.meshgrid(np.arange(x_min, x_max, plot_step),
                         np.arange(y_min, y_max, plot_step))

    clf_est = clone(clf)
    clf_est.fit(train_features,train_labels)
    if hasattr(clf_est, 'predict_proba'):
        Z = clf_est.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:,1]
    else:
        Z = clf_est.predict(np.c_[xx.ravel(), yy.ravel()])    
    Z = Z.reshape(xx.shape)
    cs = plt.contourf(xx, yy, Z, cmap=cmap)
    
    le = LabelEncoder()
    y_enc = le.fit_transform(train_labels)
    n_classes = len(le.classes_)
    plot_colors = ''.join(colors) if colors else [None] * n_classes
    label_names = le.classes_
    markers = markers if markers else [None] * n_classes
    alphas = alphas if alphas else [None] * n_classes
    for i, color in zip(range(n_classes), plot_colors):
        idx = np.where(y_enc == i)
        plt.scatter(train_features[idx, 0], train_features[idx, 1], c=color,
                    label=label_names[i], cmap=cmap, edgecolors='black', 
                    marker=markers[i], alpha=alphas[i])
    plt.legend()
    plt.show()
Example #24
    def load_input_files(self, **kwargs):
        
        """
        Loads both files containing training data and data for prediction. 
        
        Encodes the target labels to integers. 
        
        If it is training data, it will return the LabelEncoder used to encode the
        target labels to integers in the output args. We return it instead of storing
        it directly, because it should only be saved if the training ends without errors.
        
        Inputs:
        - files_paths (string): path to the input files.
        - training_data (bool): specifies whether the files contain training
        data or data for making predictions.
        
        Outputs:
        - LabelEncoder (LabelEncoder) (optional): Encodes the labels of the target
        variables to integers.
        
        """
        
        input_data = kwargs['input_data']
        input_files_dir = kwargs['input_files_dir']

        input_file_path = input_files_dir + input_data['database']
        df = pd.read_csv(input_file_path)
        
        training_data = kwargs.pop('training_data', False)
        
        # if we are loading training data, we have to assign an integer to each possible
        # target label in the dataset. We do it by fitting a LabelEncoder,
        if training_data:

            le = LabelEncoder()
            col_name = df.columns[4]
            df[col_name] = le.fit_transform(df[col_name])

            data = {}
            data['features'] = df[df.columns[0:4]].values
            data['targets'] = df[df.columns[4]].values

            self.feature_names = list(df.columns[0:4])
            self.target_name = df.columns[4]    

            out_args = {}
            out_args['LabelEncoder'] = le
            
            return data, out_args

        # if the data is for making predictions
        else:
            data = {}
            # ensure that the columns are in the correct order
            data['features'] = df[self.feature_names].values
            
            out_args = {}
        
            return data, out_args
Example #25
def ml_target(dataset):
    """ Takes a dataset and retuns the target in a numpy.array ready for
    machine learning.
    Mainly transforms non-numerical variables(columns) to numbers.

    Parameters
    ----------
    copper.Dataset

    Returns
    -------
    (label_encoder, np.array)

    Notes
    -----
    If dataset has more than one variable with role=TARGET then the first one
    is selected.
    """
    cols = dataset.filter_cols(role=dataset.TARGET)
    assert len(cols) > 0, 'No target variables on Dataset'
    if len(cols) > 1:
        import warnings
        warnings.warn("Dataset contains more than one target, %s was choosed" % cols[0])

    if dataset[cols[0]].dtype in (np.int, np.float):
        return None, dataset[cols[0]].values
    else:
        le = LabelEncoder()
        encoded = le.fit_transform(dataset[cols[0]].values)
        return le, encoded
Example #26
    def __call__(self, X_train, X_test, y_train, y_test):
        X = np.vstack([X_train, X_test])
        y = np.hstack([y_train, y_test])
        le = LabelEncoder()
        y = le.fit_transform(y)

        kmeans = KMeans(
            n_clusters=len(np.unique(y)),
            n_init=self.kmeans__n_init,
            random_state=self.random_state,
        )
        kmeans.fit(X)

        r = distance.cdist(kmeans.cluster_centers_, kmeans.cluster_centers_)
        h = np.exp(-r / (self.sig**2))

        N = confusion_matrix(y, kmeans.labels_)

        wN = np.zeros(h.shape)
        for l in range(wN.shape[0]):  # label
            for c in range(wN.shape[0]):  # cluster
                for j in range(wN.shape[0]):
                    wN[l, c] += h[l, c] * N[l, j]

        return wN.max(axis=0).sum() / wN.sum()
Example #27
def test_multiclass_classifier_class_weight():
    """tests multiclass with classweights for each class"""
    alpha = .1
    n_samples = 20
    tol = .00001
    max_iter = 50
    class_weight = {0: .45, 1: .55, 2: .75}
    fit_intercept = True
    X, y = make_blobs(n_samples=n_samples, centers=3, random_state=0,
                      cluster_std=0.1)
    step_size = get_step_size(X, alpha, fit_intercept, classification=True)
    classes = np.unique(y)

    clf1 = LogisticRegression(solver='sag', C=1. / alpha / n_samples,
                              max_iter=max_iter, tol=tol, random_state=77,
                              fit_intercept=fit_intercept,
                              class_weight=class_weight)
    clf2 = clone(clf1)
    clf1.fit(X, y)
    clf2.fit(sp.csr_matrix(X), y)

    le = LabelEncoder()
    class_weight_ = compute_class_weight(class_weight, np.unique(y), y)
    sample_weight = class_weight_[le.fit_transform(y)]

    coef1 = []
    intercept1 = []
    coef2 = []
    intercept2 = []
    for cl in classes:
        y_encoded = np.ones(n_samples)
        y_encoded[y != cl] = -1

        spweights1, spintercept1 = sag_sparse(X, y_encoded, step_size, alpha,
                                              n_iter=max_iter, dloss=log_dloss,
                                              sample_weight=sample_weight)
        spweights2, spintercept2 = sag_sparse(X, y_encoded, step_size, alpha,
                                              n_iter=max_iter, dloss=log_dloss,
                                              sample_weight=sample_weight,
                                              sparse=True)
        coef1.append(spweights1)
        intercept1.append(spintercept1)
        coef2.append(spweights2)
        intercept2.append(spintercept2)

    coef1 = np.vstack(coef1)
    intercept1 = np.array(intercept1)
    coef2 = np.vstack(coef2)
    intercept2 = np.array(intercept2)

    for i, cl in enumerate(classes):
        assert_array_almost_equal(clf1.coef_[i].ravel(),
                                  coef1[i].ravel(),
                                  decimal=2)
        assert_almost_equal(clf1.intercept_[i], intercept1[i], decimal=1)

        assert_array_almost_equal(clf2.coef_[i].ravel(),
                                  coef2[i].ravel(),
                                  decimal=2)
        assert_almost_equal(clf2.intercept_[i], intercept2[i], decimal=1)
Example #28
 def auto_alpha2num(self, col):
     from sklearn.preprocessing import LabelEncoder
     
     le = LabelEncoder()
     for i in col:
         self.df[i] = le.fit_transform(self.df[i])
     return 
Example #29
def load_data(filename="Feat_normalized.csv") :
    '''
    Load training data from csv file.  Load labels from it.
    Return matrix, training labels, encoder for labels.
    http://blog.yhathq.com/posts/predicting-customer-churn-with-sklearn.html
    http://stackoverflow.com/questions/21589177/using-multiple-features-with-scikit-learn?rq=1

    Labels could just be the names? : http://stackoverflow.com/questions/13300160/non-integer-class-labels-scikit-learn?rq=1
    '''
    df = pd.read_csv(filename, index_col=0)
    lb = LabelEncoder()
    labels = lb.fit_transform((df.index.values))

    print ("labels: %s %s" %(type(labels),labels))
    features = df.values
    # labels = LabelEncoder.transform(np.asarray(df['labels'].values))
    'This could be done more elegantly. Check index num for later filtering!!'
    'TODO: Is pop needed? List of col.values??  '

    feature_names=df.columns.values  #No pop. (nd array, no labels index here)
    print("%s features: " % (len(feature_names)))

    # classes = label_encoder.transform(np.asarray(df['labels']))
    print('encoded labels: %s' % (set(labels)))
    # print("feature_names: %s" %(feature_names))
    return (features, labels, lb,feature_names)
Example #30
def multicol_fit_transform(dframe, columns):

	if isinstance(columns, list):
		columns = np.array(columns)
	else:
		columns = columns

	encoder_dict = {}
	# columns are provided, iterate through and get `classes_`
	# ndarray to hold LabelEncoder().classes_ for each
	# column; should match the shape of specified `columns`
	all_classes_ = np.ndarray(shape=columns.shape, dtype=object)
	all_encoders_ = np.ndarray(shape=columns.shape, dtype=object)
	all_labels_ = np.ndarray(shape=columns.shape, dtype=object)
	for idx, column in enumerate(columns):
		# instantiate LabelEncoder
		le = LabelEncoder()
		# fit and transform labels in the column
		dframe.loc[:, column] = le.fit_transform(dframe.loc[:, column].values)
		encoder_dict[column] = le
		# append the `classes_` to our ndarray container
		all_classes_[idx] = (column, np.array(le.classes_.tolist(), dtype=object))
		all_encoders_[idx] = le
		all_labels_[idx] = le

	multicol_dict = {"encoder_dict":encoder_dict, "all_classes_":all_classes_,"all_encoders_":all_encoders_,"columns": columns}
	return dframe, multicol_dict
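A short usage sketch for the helper above on a hypothetical toy frame (it assumes numpy, pandas and LabelEncoder are imported as in the snippet); the returned multicol_dict keeps one fitted encoder per column, so columns can later be decoded again.

import pandas as pd

toy = pd.DataFrame({'color': ['green', 'red', 'blue'],
                    'size':  ['M', 'L', 'XL'],
                    'price': [10.1, 13.5, 15.3]})
encoded, info = multicol_fit_transform(toy, ['color', 'size'])
print(encoded)                # 'color' and 'size' replaced by integer codes
print(info['all_classes_'])   # (column, classes) pairs for each encoded column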
Example #31
    def fit(self, X, y):
        """Fit Gaussian process classification model

        Parameters
        ----------
        X : array-like, shape = (n_samples, n_features)
            Training data

        y : array-like, shape = (n_samples,)
            Target values, must be binary

        Returns
        -------
        self : returns an instance of self.
        """
        if self.kernel is None:  # Use an RBF kernel as default
            self.kernel_ = C(1.0, constant_value_bounds="fixed") \
                * RBF(1.0, length_scale_bounds="fixed")
        else:
            self.kernel_ = clone(self.kernel)

        self.rng = check_random_state(self.random_state)

        self.X_train_ = np.copy(X) if self.copy_X_train else X

        # Encode class labels and check that it is a binary classification
        # problem
        label_encoder = LabelEncoder()
        self.y_train_ = label_encoder.fit_transform(y)
        self.classes_ = label_encoder.classes_
        if self.classes_.size > 2:
            raise ValueError("%s supports only binary classification. "
                             "y contains classes %s" %
                             (self.__class__.__name__, self.classes_))
        elif self.classes_.size == 1:
            raise ValueError(
                "{0:s} requires 2 classes; got {1:d} class".format(
                    self.__class__.__name__, self.classes_.size))

        if self.optimizer is not None and self.kernel_.n_dims > 0:
            # Choose hyperparameters based on maximizing the log-marginal
            # likelihood (potentially starting from several initial values)
            def obj_func(theta, eval_gradient=True):
                if eval_gradient:
                    lml, grad = self.log_marginal_likelihood(
                        theta, eval_gradient=True)
                    return -lml, -grad
                else:
                    return -self.log_marginal_likelihood(theta)

            # First optimize starting from theta specified in kernel
            optima = [
                self._constrained_optimization(obj_func, self.kernel_.theta,
                                               self.kernel_.bounds)
            ]

            # Additional runs are performed from log-uniform chosen initial
            # theta
            if self.n_restarts_optimizer > 0:
                if not np.isfinite(self.kernel_.bounds).all():
                    raise ValueError(
                        "Multiple optimizer restarts (n_restarts_optimizer>0) "
                        "requires that all bounds are finite.")
                bounds = self.kernel_.bounds
                for iteration in range(self.n_restarts_optimizer):
                    theta_initial = np.exp(
                        self.rng.uniform(bounds[:, 0], bounds[:, 1]))
                    optima.append(
                        self._constrained_optimization(obj_func, theta_initial,
                                                       bounds))
            # Select result from run with minimal (negative) log-marginal
            # likelihood
            lml_values = list(map(itemgetter(1), optima))
            self.kernel_.theta = optima[np.argmin(lml_values)][0]
            self.log_marginal_likelihood_value_ = -np.min(lml_values)
        else:
            self.log_marginal_likelihood_value_ = \
                self.log_marginal_likelihood(self.kernel_.theta)

        # Precompute quantities required for predictions which are independent
        # of actual query points
        K = self.kernel_(self.X_train_)

        _, (self.pi_, self.W_sr_, self.L_, _, _) = \
            self._posterior_mode(K, return_temporaries=True)

        return self
X = dataset.iloc[:,:-1].values
Y = dataset.iloc[:,3].values

#take care to missing data
from sklearn.preprocessing import Imputer
# create imputer
imputer = Imputer(missing_values = 'NaN',strategy = 'median', axis = 0)
#fit imputer to matrix
imputer.fit(X[:,1:3])
X[:,1:3] = imputer.transform(X[:,1:3])

#dummy variable for country name.
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
labelencoder_X = LabelEncoder()
X[:,0] = labelencoder_X.fit_transform(X[:,0])
onehotencoder = OneHotEncoder(categorical_features=[0])
X = onehotencoder.fit_transform(X).toarray()

#encoding catoagorical variables for yes or no
labelencoder_Y = LabelEncoder()
Y = labelencoder_Y.fit_transform(Y)

# split train and test set, 20% for test
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X,Y,test_size=0.2,random_state = 42)

from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)
Example #33
# -*- coding: utf-8 -*-
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

data = pd.read_excel(r'../initial datasets/LabeledDataset.xlsx',
                     sheet_name='Scenario 2')
data = data.sample(frac=1, random_state=42)
data['commentText'] = data['commentText'].astype(str)
data['replies.commentText'] = data['replies.commentText'].astype(str)

# Replace empty cells in commentText with np.nan
data['commentText'].replace(['nan'], np.nan, inplace=True)
# replace NaN values in commentText with values in replies.commentText
data['commentText'] = data['commentText'].fillna(data['replies.commentText'])

# Encode labels (strings -> numbers)
encoder = LabelEncoder()
data['Label'] = encoder.fit_transform(data['Label'])

# Rename columns and select text and class info
data.rename(columns={'commentText': 'text', 'Label': 'class'}, inplace=True)
df = data[['text', 'class']]

df.to_excel('../feature datasets/ar/labels.xlsx', index_label="index")
# mapping ordinal features
size_mapping = {'XL': 3, 'L': 2,  'M': 1}
df['size'] = df['size'].map(size_mapping)
print(df)

class_mapping = {label: idx for idx, label in enumerate(np.unique(df['classLabel']))}
print(class_mapping)

df['classLabel'] = df['classLabel'].map(class_mapping)
print(df)

inv_class_mapping = {v: k for k, v in class_mapping.items()}
df['classLabel'] = df['classLabel'].map(inv_class_mapping)
print(df)

class_encoder = LabelEncoder()
y = class_encoder.fit_transform(df['classLabel'].values)
print(y)
print(class_encoder.inverse_transform(y))

x = df[['color', 'size', 'price']].values
class_encoder = LabelEncoder()
x[:, 0] = class_encoder.fit_transform(x[:, 0])
print(x)

# one-hot encoding
one_encoder = OneHotEncoder(categorical_features=[0])
print(one_encoder.fit_transform(x).toarray())
# this one-hot is more readable
print(pd.get_dummies(df[['price', 'color', 'size']]))
Example #35
        label_list.append(item[1])

print('Features in Training Set: {}'.format(len(training_set)))
print('Invalid Features in Training set: {}'.format(
    len(training_set) - len(feature_list)))

X = np.array(feature_list)
# Fit a per-column scaler
X_scaler = StandardScaler().fit(X)
# Apply the scaler to X
X_train = X_scaler.transform(X)
y_train = np.array(label_list)

# Convert label strings to numerical encoding
encoder = LabelEncoder()
y_train = encoder.fit_transform(y_train)

# Create classifier
clf = svm.SVC(kernel='linear')

# Set up 5-fold cross-validation
kf = model_selection.KFold(  #len(X_train),
    n_splits=5, shuffle=True, random_state=1)

# Perform cross-validation
scores = model_selection.cross_val_score(cv=kf,
                                         estimator=clf,
                                         X=X_train,
                                         y=y_train,
                                         scoring='accuracy')
print('Scores: ' + str(scores))
Example #36
corpus = []
y = []
for i in content:
    analysis = TextBlob(i)
    if analysis.polarity == 0:
        y.append(0)
    elif analysis.polarity > 0:
        y.append(1)
    elif analysis.polarity < 0:
        y.append(-1)
y = pd.Series(y)
u = pd.Series(y)

from sklearn.preprocessing import LabelEncoder, OneHotEncoder
labelencoder_y = LabelEncoder()
y = labelencoder_y.fit_transform(y)
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=1500)
X = cv.fit_transform(content).toarray()
#y=dataset.iloc[:,1]
# Binarize the output
# Add noisy features to make the problem harder
random_state = np.random.RandomState(0)
#n_samples, n_features = X.shape
#X = np.c_[X, random_state.randn(n_samples, 200 * n_features)]

# Feature Scaling
# Fitting Naive Bayes to the Training set
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
Example #37
dataset = pd.read_csv('datasets/Data.csv')
X = dataset.iloc[:,:-1].values
Y = dataset.iloc[:,3].values

# Handling the missing data
from sklearn.preprocessing import Imputer
imputer = Imputer(missing_values = "NaN", strategy = "mean", axis = 0)
# compute mean
imputer = imputer.fit(X[:, 1:3])
# use mean to replace
X[:,1:3] = imputer.transform(X[:, 1:3])

# Encoding categorical data
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
labelencoder_X = LabelEncoder()
X[ : , 0] = labelencoder_X.fit_transform(X[ : , 0])

# which column is the feature needed one-hot
onehotencoder = OneHotEncoder(categorical_features = [0])
X = onehotencoder.fit_transform(X).toarray()
labelencoder_Y = LabelEncoder()
Y = labelencoder_Y.fit_transform(Y)

from sklearn.cross_validation import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split( X , Y , test_size = 0.2, random_state = 0)

# the scaling is needed on the training set and test set
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)
Example #38
               'creative_is_voicead','creative_is_js']
for i in bool_feature:
    data[i] = data[i].astype(int)
# label encoding
advert_feature = ['advert_id','adid','orderid','advert_industry_inner','advert_name',
                  'campaign_id','creative_id','creative_type','creative_tp_dnf','advert_industry_inner_0']
media_feature = ['app_cate_id','f_channel','app_id','inner_slot_id','app_paid','inner_slot_id_1']
content_feature = ['city', 'province', 'nnt', 'devtype','osv','os']
# still to be handled: user_tags, make, model
label_feature = advert_feature + media_feature + content_feature
num_feature = ['creative_width','creative_height','hour','day']


label_enc = LabelEncoder()
for label in label_feature:
    data[label] = label_enc.fit_transform(data[label].astype(str))
print("labelencoding is finish")

onehot_feature = label_feature
predict = data[data['click'] == -1].drop('click',axis=1)
predice_click = predict[['instance_id']]
predice_click['predicted_score'] = 0

train_all = data[data['click'] != -1]
train_y = train_all.click.values
train_x = train_all.drop('click',axis = 1)
if os.path.exists(path + '/feature/base_train_csr1.npz') and True:
    base_train_csr = sparse.load_npz(path + '/feature/base_train_csr1.npz').tocsr().astype(bool)
    base_predict_csr = sparse.load_npz(path+ '/feature/base_predict_csr1.npz').tocsr().astype(bool)
else: 
    base_train_csr = sparse.csr_matrix((len(train_x), 0))
Example #39
import matplotlib.pyplot as plt
import pandas as pd

# Importing dataset
dataset = pd.read_csv('Churn_Modelling.csv')
dataset

# In[2]:

X = dataset.iloc[:, 3:13].values
y = dataset.iloc[:, 13].values

# Encoding categorical data
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
labelencoder_X_1 = LabelEncoder()
X[:, 1] = labelencoder_X_1.fit_transform(X[:, 1])
labelencoder_X_2 = LabelEncoder()
X[:, 2] = labelencoder_X_2.fit_transform(X[:, 2])
onehotencoder = OneHotEncoder(categorical_features=[1])
X = onehotencoder.fit_transform(X).toarray()
X = X[:, 1:]  # Avoiding dummy variable trap!

# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=0)

# Feature Scaling
from sklearn.preprocessing import StandardScaler
Example #40
File: svm.py Project: zhufengli/TC1
# import data
train = pd.read_csv('./train.csv')
test = pd.read_csv('./test.csv')
sample = pd.read_csv('./sampleSubmission.csv')

features = train.loc[:, 'feat_1':'feat_93'].values
labels = train['target'].values

categories = [
    'Class_1', 'Class_2', 'Class_3', 'Class_4', 'Class_5', 'Class_6',
    'Class_7', 'Class_8', 'Class_9'
]

# encode the string labels as integers
label_encoder = LabelEncoder()
integer_encoded = label_encoder.fit_transform(labels)

# Train an SVM; different kernels ('rbf', 'poly', 'linear', 'sigmoid') can be used
clf = SVC(C=1.0, kernel='sigmoid', probability=True)
clf.fit(features, integer_encoded)

#prediction
test = pd.read_csv('./test.csv')
test_features = test.loc[:, 'feat_1':'feat_93'].values
predictions = clf.predict_proba(test_features)

#print (predictions)
#enc = OneHotEncoder()
#onehot=enc.fit_transform(predictions.reshape(-1, 1)).toarray()

# create submission file
# Importing the dataset
dataset = pd.read_csv('Salary_Classification.csv')

temp = dataset.values
print(temp)

features = dataset.iloc[:, :-1].values
print(features)

labels = dataset.iloc[:, -1].values
print(labels)

# Encoding categorical data
from sklearn.preprocessing import LabelEncoder
labelencoder = LabelEncoder()
features[:, 0] = labelencoder.fit_transform(features[:, 0])

# One Hot Encoding
from sklearn.preprocessing import OneHotEncoder
onehotencoder = OneHotEncoder(categorical_features=[0])
features = onehotencoder.fit_transform(features).toarray()
print(features)

# Avoiding the Dummy Variable Trap
# dropping first column
features = features[:, 1:]
print(features)

# We are not performing prediction today,
# but we do have to identify which are the most important columns.
# For that we will use a new library, statsmodels, instead of sklearn.
# Fare
d_train["Fare"] = d_train["Fare"].fillna(d_train.Fare.mean())
d_test["Fare"] = d_test["Fare"].fillna(d_test.Fare.mean())

# Age
d_train["Age"] = d_train["Age"].fillna(d_train.Age.mean())
d_test["Age"] = d_test["Age"].fillna(d_test.Age.mean())

d_train = d_train.drop(['PassengerId','Name','Ticket','Cabin', 'Parch','SibSp'], axis=1)
d_test = d_test.drop(['Name','Ticket','Cabin', 'Parch','SibSp'], axis=1)

from sklearn.preprocessing import LabelEncoder
LE=LabelEncoder()
labels = ['title']
for label in labels:
    d_train[label]=LE.fit_transform(d_train[label])
    d_test[label]=LE.fit_transform(d_test[label])
    

y_train = d_train["Survived"]
X_train = d_train.drop("Survived",axis=1)
X_test = d_test.drop(d_test.columns[[0,1]],axis=1).copy()

from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import KFold
from sklearn import metrics
from sklearn.metrics import accuracy_score

K = 5
kf = KFold(n_splits=K, shuffle=True, random_state=1 )
Example #43
def _cross_val_predict(estimator,
                       X,
                       y=None,
                       *,
                       groups=None,
                       cv=None,
                       n_jobs=None,
                       verbose=0,
                       fit_params=None,
                       pre_dispatch='2*n_jobs',
                       method='predict',
                       safe=True):
    """This is a fork from :meth:`~sklearn.model_selection.cross_val_predict` to allow for
    non-safe cloning of the models for each fold.

    Parameters
    ----------
    estimator : estimator object implementing 'fit' and 'predict'
        The object to use to fit the data.

    X : array-like of shape (n_samples, n_features)
        The data to fit. Can be, for example a list, or an array at least 2d.

    y : array-like of shape (n_samples,) or (n_samples, n_outputs), \
            default=None
        The target variable to try to predict in the case of
        supervised learning.

    groups : array-like of shape (n_samples,), default=None
        Group labels for the samples used while splitting the dataset into
        train/test set. Only used in conjunction with a "Group" :term:`cv`
        instance (e.g., :class:`GroupKFold`).

    cv : int, cross-validation generator or an iterable, default=None
        Determines the cross-validation splitting strategy.
        Possible inputs for cv are:

        - None, to use the default 5-fold cross validation,
        - int, to specify the number of folds in a `(Stratified)KFold`,
        - CV splitter,
        - An iterable yielding (train, test) splits as arrays of indices.

        For int/None inputs, if the estimator is a classifier and ``y`` is
        either binary or multiclass, :class:`StratifiedKFold` is used. In all
        other cases, :class:`KFold` is used.

        Refer :ref:`User Guide <cross_validation>` for the various
        cross-validation strategies that can be used here.

        .. versionchanged:: 0.22
            ``cv`` default value if None changed from 3-fold to 5-fold.

    n_jobs : int, default=None
        The number of CPUs to use to do the computation.
        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
        for more details.

    verbose : int, default=0
        The verbosity level.

    fit_params : dict, default=None
        Parameters to pass to the fit method of the estimator.

    pre_dispatch : int or str, default='2*n_jobs'
        Controls the number of jobs that get dispatched during parallel
        execution. Reducing this number can be useful to avoid an
        explosion of memory consumption when more jobs get dispatched
        than CPUs can process. This parameter can be:

            - None, in which case all the jobs are immediately
              created and spawned. Use this for lightweight and
              fast-running jobs, to avoid delays due to on-demand
              spawning of the jobs

            - An int, giving the exact number of total jobs that are
              spawned

            - A str, giving an expression as a function of n_jobs,
              as in '2*n_jobs'

    method : str, default='predict'
        Invokes the passed method name of the passed estimator. For
        method='predict_proba', the columns correspond to the classes
        in sorted order.

    safe : bool, default=True
        Whether to clone with safe option.

    Returns
    -------
    predictions : ndarray
        This is the result of calling ``method``
    """
    X, y, groups = indexable(X, y, groups)

    cv = check_cv(cv, y, classifier=is_classifier(estimator))
    splits = list(cv.split(X, y, groups))

    test_indices = np.concatenate([test for _, test in splits])
    if not _check_is_permutation(test_indices, _num_samples(X)):
        raise ValueError('cross_val_predict only works for partitions')

    # If classification methods produce multiple columns of output,
    # we need to manually encode classes to ensure consistent column ordering.
    encode = method in [
        'decision_function', 'predict_proba', 'predict_log_proba'
    ] and y is not None
    if encode:
        y = np.asarray(y)
        if y.ndim == 1:
            le = LabelEncoder()
            y = le.fit_transform(y)
        elif y.ndim == 2:
            y_enc = np.zeros_like(y, dtype=int)
            for i_label in range(y.shape[1]):
                y_enc[:, i_label] = LabelEncoder().fit_transform(y[:, i_label])
            y = y_enc

    # We clone the estimator to make sure that all the folds are
    # independent, and that it is pickle-able.
    parallel = Parallel(n_jobs=n_jobs,
                        verbose=verbose,
                        pre_dispatch=pre_dispatch)
    # TODO. The API of the private scikit-learn `_fit_and_predict` has changed
    # between 0.23.2 and 0.24. For this to work with <0.24, we need to add a
    # case analysis based on sklearn version.
    predictions = parallel(
        delayed(_fit_and_predict)(clone(estimator, safe=safe), X, y, train,
                                  test, verbose, fit_params, method)
        for train, test in splits)

    inv_test_indices = np.empty(len(test_indices), dtype=int)
    inv_test_indices[test_indices] = np.arange(len(test_indices))

    if sp.issparse(predictions[0]):
        predictions = sp.vstack(predictions, format=predictions[0].format)
    elif encode and isinstance(predictions[0], list):
        # `predictions` is a list of method outputs from each fold.
        # If each of those is also a list, then treat this as a
        # multioutput-multiclass task. We need to separately concatenate
        # the method outputs for each label into an `n_labels` long list.
        n_labels = y.shape[1]
        concat_pred = []
        for i_label in range(n_labels):
            label_preds = np.concatenate([p[i_label] for p in predictions])
            concat_pred.append(label_preds)
        predictions = concat_pred
    else:
        predictions = np.concatenate(predictions)

    if isinstance(predictions, list):
        return [p[inv_test_indices] for p in predictions]
    else:
        return predictions[inv_test_indices]
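A minimal usage sketch of the fork above (the function name `cross_val_predict` and the estimator are assumed here for illustration; the extra `safe=False` flag is what the fork adds, letting `clone` fall back to a deep copy for models that do not implement `get_params`):

from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression

X, y = load_iris(return_X_y=True)
clf = LogisticRegression(max_iter=1000)

# Same call signature as sklearn.model_selection.cross_val_predict, plus `safe`.
probas = cross_val_predict(clf, X, y, cv=5, method='predict_proba', safe=False)
print(probas.shape)  # (150, 3): one row per sample, columns in sorted class order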
예제 #44
0
#print(A.x)
print(B.x)

class C(A, B):
    pass

print(C.x)


Data preprocessing
    Missing values: dropna / fillna
    Convert categorical attributes to numbers
        map them by hand
        LabelEncoder
            from sklearn.preprocessing import LabelEncoder
            df['column'] = LabelEncoder().fit_transform(df['column'])
        dummy variables
            pd.get_dummies  concat the returned df with the original df
    Filter the data and drop outliers
    Scale / standardize the data
        from sklearn.preprocessing import MinMaxScaler, StandardScaler
        df2 = StandardScaler().fit_transform(df)
Choose the training data
    X = features, y = labels
    Train/test split for validation
        from sklearn.model_selection import train_test_split
        X_train, X_test, y_train, y_test = train_test_split(X, y)
Choose a model
    from sklearn.naive_bayes import GaussianNB
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.neighbors import KNeighborsClassifier
    (an end-to-end sketch of this workflow follows below)
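A minimal end-to-end sketch of the workflow outlined above (the tiny DataFrame and its column names are made up for illustration):

import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

df = pd.DataFrame({
    'city':  ['NY', 'SF', 'NY', None, 'LA', 'SF'],
    'age':   [23, 31, None, 45, 52, 36],
    'label': ['yes', 'no', 'yes', 'no', 'no', 'yes'],
})

df = df.dropna(subset=['city'])                          # drop rows missing the category
df['age'] = df['age'].fillna(df['age'].mean())           # fill numeric gaps
df['city'] = LabelEncoder().fit_transform(df['city'])    # category -> integer codes
X = StandardScaler().fit_transform(df[['city', 'age']])  # scale the features
y = df['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
model = DecisionTreeClassifier().fit(X_train, y_train)
print(model.score(X_test, y_test))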
print('total empty values per column are',
      data.isnull().sum())  # count of null values in each column

# drop purchased column from data sets

# features=features.drop(labels=features.columns[[7]],axis=1)#method one
# features.head()
features = data.iloc[:, 0:2].values  # method 2
labels = data.iloc[:, -1].values

dt = pd.DataFrame(
    features)  # just for a quick overview, wrap the features in a DataFrame

label_encode = LabelEncoder()

first_encode = label_encode.fit_transform(features[:, 1])
features[:, 1] = first_encode
print(len(pd.unique(first_encode)))  # number of unique values in the encoded column

# data[:,1].unique()

# print(pd.value_counts(pd.unique(data[:,1])))

one_hot_encode = OneHotEncoder()
first_encode = np.reshape(first_encode, newshape=(len(first_encode), 1))
hot_e = one_hot_encode.fit_transform(first_encode)
print(hot_e)

# all string columns have to be label-encoded to integers before applying one-hot encoding here
# (note: `categorical_features` was removed in scikit-learn 0.22 -- see the ColumnTransformer sketch below)
sec_hot_code = OneHotEncoder(categorical_features=[1])
features = sec_hot_code.fit_transform(features).toarray()
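Since `categorical_features` is gone from newer scikit-learn releases, a rough equivalent of the two lines above uses ColumnTransformer (a sketch; column index 1 is assumed to be the categorical column, as above):

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# One-hot encode column 1 and pass the remaining columns through unchanged,
# which is what OneHotEncoder(categorical_features=[1]) used to do.
ct = ColumnTransformer([('onehot', OneHotEncoder(), [1])], remainder='passthrough')
features = ct.fit_transform(features)  # may be a sparse matrix; call .toarray() if a dense array is needed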
예제 #46
0
def labels(tags):
    le = LabelEncoder()
    tags = le.fit_transform(tags)
    categories = len(np.unique(tags))
    y = np_utils.to_categorical(tags, categories)
    return y
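A quick illustration of what the helper returns (the tags are made up; `to_categorical` yields one row per sample with a 1 in the column of its encoded class):

y = labels(['cat', 'dog', 'cat', 'bird'])
print(y)
# [[0. 1. 0.]
#  [0. 0. 1.]
#  [0. 1. 0.]
#  [1. 0. 0.]]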
예제 #47
0
                type=int,
                default=-1,
                help="# of jobs for k-NN")
args = vars(ap.parse_args())

print("[INFO] loading images...")
imagePaths = list(paths.list_images(args["dataset"]))

sp = SimplePreprocessor(32, 32)
sdl = SimpleDatasetLoader(preprocessors=[sp])
(data, labels) = sdl.load(imagePaths, verbose=500)
data = data.reshape((data.shape[0], 3072))

print("[INFO] features matrix: {:.1f}MB".format(data.nbytes / (1024 * 1000.0)))

le = LabelEncoder()
labels = le.fit_transform(labels)

(trainX, testX, trainY, testY) = train_test_split(data,
                                                  labels,
                                                  test_size=0.25,
                                                  random_state=42)

print("[INFO] evaluating k-NN classifier...")
model = KNeighborsClassifier(n_neighbors=args["neighbors"],
                             n_jobs=args["jobs"])
model.fit(trainX, trainY)
print(
    classification_report(testY,
                          model.predict(testX),
                          target_names=le.classes_))
#Importing Libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

####### DATA PREPROCESSING ########

# Importing Dataset
dataset = pd.read_csv("Churn_Modelling.csv")
x = dataset.iloc[:, 3:13].values
y = dataset.iloc[:, 13].values

# Encoding categorical data
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
labelencoder_x_1 = LabelEncoder()
x[:, 1] = labelencoder_x_1.fit_transform(x[:, 1])
labelencoder_x_2 = LabelEncoder()
x[:, 2] = labelencoder_x_2.fit_transform(x[:, 2])
onehotencoder = OneHotEncoder(categorical_features=[1])
x = onehotencoder.fit_transform(x).toarray()
x = x[:, 1:]

# Splitting the dataset into training set and test set
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=0)

#Feature Scaling
from sklearn.preprocessing import StandardScaler
@author: BrysDom
"""
#Importing libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

#Importing dataset
dataset = pd.read_csv('50_Startups.csv')
X = dataset.iloc[:,:-1].values
y = dataset.iloc[:,4].values

#Encoding categorical data
from sklearn.preprocessing  import LabelEncoder, OneHotEncoder
labelencoder_X = LabelEncoder()
X[:,3]=labelencoder_X.fit_transform(X[:,3])
onehotencoder=OneHotEncoder(categorical_features=[3])
X = onehotencoder.fit_transform(X).toarray()

#Avoiding the Dummy Variable Trap
X=X[:,1:]

#Splitting the dataset into Training and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.2,random_state = 0)

##Feature scaling
"""from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)"""
예제 #50
0
dataset = pd.read_csv('Data.csv')
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values

# Taking care of missing data
# (sklearn.preprocessing.Imputer was removed in scikit-learn 0.22; newer code uses sklearn.impute.SimpleImputer)
from sklearn.preprocessing import Imputer

imputer = Imputer(missing_values="NaN", strategy="mean", axis=0)
imputer.fit(X[:, 1:3])
X[:, 1:3] = imputer.transform(X[:, 1:3])

# Encoding categorical data
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

labelencoder_X = LabelEncoder()
X[:, 0] = labelencoder_X.fit_transform(X[:, 0])
onehotencoder = OneHotEncoder(categorical_features=[0])
X = onehotencoder.fit_transform(X).toarray()

labelencoder_y = LabelEncoder()
y = labelencoder_y.fit_transform(y)

# Create the training set and the test set
# (sklearn.cross_validation was removed in scikit-learn 0.20; use sklearn.model_selection instead)
from sklearn.cross_validation import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=0)

# Feature scaling
예제 #51
0
    image = cv2.resize(image, (IMAGE_DIMS[1], IMAGE_DIMS[0]))   
    image = img_to_array(image)
    data.append(image)

    #extract the class label from image path and update the labels list
    label = imagePath.split(os.path.sep)[-2]
    labels.append(label)

#scale the raw pixel intensities to the range [0,1]
data = np.array(data, dtype="float")/255.0
labels = np.array(labels)
print("[INFO] data matrix: {:.2f}MB".format(data.nbytes / (1024 * 1000.0))) 

# binarize the labels
lb = LabelEncoder()
labels = lb.fit_transform(labels)
labels = to_categorical(labels,2)

#partition the data into training and testing splits using 80% for training and the remaining 20% for testing
(trainX, testX, trainY, testY) = train_test_split(data, labels, test_size=0.2, random_state=42)

#construct the image generator for data augmentation
aug = ImageDataGenerator(rotation_range=25, width_shift_range=0.2, height_shift_range=0.2, shear_range=0.2, zoom_range=0.2, horizontal_flip=True, fill_mode="nearest")

#initialize the model
print("[info] compiling the model...")
model = SmallerVGGNet.build(width=IMAGE_DIMS[1], height=IMAGE_DIMS[0], depth=IMAGE_DIMS[2], classes=len(lb.classes_))
opt = Adam(lr=INIT_LR, decay=INIT_LR/EPOCHS)
model.compile(loss="binary_crossentropy", optimizer=opt, metrics=["accuracy"])

#train model
        'Geography': Geography,
        'Gender': Gender,
        'Age': Age,
        'Tenure': Tenure,
        'Balance': Balance,
        'NumberofProducts': NumberofProducts,
        'HasCrcard': HasCrcard,
        'IsActiveMember': IsActiveMember,
        'EstimatedSalary': EstimatedSalary
    }
    features = pd.DataFrame(data, index=[0])
    return features


input_df = user_input_features()

objList = input_df.select_dtypes(include="object").columns

for feat in objList:
    # NOTE: fitting the encoder on a single user-input row may not reproduce the
    # training-time codes; ideally reuse an encoder fitted on the training data
    input_df[feat] = le.fit_transform(input_df[feat].astype(str))

df = input_df[:1]

if st.button('Predict'):
    load_clf = pickle.load(open('predictionrfc.pkl', 'rb'))
    st.subheader('User Input features')
    st.write(df)
    prediction = load_clf.predict(df)
    skip = np.array(['Defaulter', 'Defaulter'])  # NOTE: both entries are the same string; one is presumably meant to be the non-defaulter label
    st.write(skip[prediction])
예제 #53
0
dataDF = pd.DataFrame()
dataDF['text'] = texts
dataDF['label'] = labels

print(dataDF.head())
print("total examples %s" % len(labels))

# split the dataset into training and test datasets
X_train, X_test, y_train, y_test = train_test_split(dataDF['text'],
                                                    dataDF['label'],
                                                    random_state=24,
                                                    test_size=0.2)

# encode the target variable
encoder = LabelEncoder()
y_train = encoder.fit_transform(y_train)
y_test = encoder.transform(y_test)  # reuse the encoder fitted on y_train so the codes stay consistent
onehotencoder = OneHotEncoder(sparse=False)
y_train = onehotencoder.fit_transform(y_train.reshape(-1, 1))
y_test = onehotencoder.transform(y_test.reshape(-1, 1))

tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(dataDF['text'])

X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)

vocab_size = len(tokenizer.word_index) + 1

maxlen = 100
X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)
test[9].isna().sum()

test[10].isna().sum()

del test[0]

del test[7]

X = test.values

from sklearn.preprocessing import LabelEncoder

lab = LabelEncoder()

X[:, 0] = lab.fit_transform(X[:, 0])

X[:, 1] = lab.fit_transform(X[:, 1])

X[:, 4] = lab.fit_transform(X[:, 4])

X[:, -1] = lab.fit_transform(X[:, -1])

test2 = pd.DataFrame(X)

test2.isnull().sum()

#Handling 5th column which has multiple categorical values in one cell

df = test2
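The snippet cuts off here; one common way to split a cell that packs several categories into one string is pandas' str.get_dummies (a sketch only -- the column index 5 and the ',' separator are assumptions about this dataset):

# Expand a multi-valued cell such as "a,b" into one indicator column per value,
# then drop the original column.
multi = df[5].astype(str).str.get_dummies(sep=',')
df = pd.concat([df.drop(columns=[5]), multi], axis=1)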
예제 #55
0
File: train.py  Project: 1105104230/WBR
    (trainLabels, testLabels) = (labels[:split], labels[split:])

    # create the training and testing bunches
    training = Bunch(name="training", data=trainData, target=trainLabels)
    testing = Bunch(name="testing", data=testData, target=testLabels)

    # return a tuple of the training, testing bunches, and original labels
    return (training, testing, labels)


(training, testing, names) = load_sunplusit_faces(facePath,
                                                  min_faces=faces_min,
                                                  test_size=test_size)

le = LabelEncoder()
le.fit(training.target)  # only the fitted label mapping is needed; transform() is applied below

#recognizer = cv2.face.createLBPHFaceRecognizer(radius=2, neighbors=16, grid_x=8, grid_y=8)
recognizer = cv2.face.LBPHFaceRecognizer_create(radius=2,
                                                neighbors=16,
                                                grid_x=8,
                                                grid_y=8)

print("[INFO] training face recognizer...")
recognizer.train(training.data, le.transform(training.target))

predictions = []
confidence = []
# loop over the test data
for i in range(0, len(testing.data)):
    print("{} of {}".format(str(i), str(len(testing.data))))
예제 #56
0
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.cluster import KMeans

# Load the data
#data = pd.read_csv('Mall_Customers.csv', encoding='gbk')
data = pd.read_csv('CarPrice_Assignment.csv')

train_x = data[[
    'car_ID', 'symboling', 'carlength', 'carwidth', 'carheight', 'curbweight',
    'enginesize', 'boreratio', 'stroke', 'compressionratio', 'horsepower',
    'peakrpm', 'citympg', 'highwaympg', 'price'
]]

# LabelEncoder
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
train_x['car_ID'] = le.fit_transform(train_x['car_ID'])

# Scale features to the [0, 1] range
min_max_scaler = preprocessing.MinMaxScaler()
train_x = min_max_scaler.fit_transform(train_x)
pd.DataFrame(train_x).to_csv('temp.csv', index=False)
#print(train_x)

### Cluster with KMeans
kmeans = KMeans(n_clusters=2)
kmeans.fit(train_x)
predict_y = kmeans.predict(train_x)
# Merge the clustering result back into the original data
result = pd.concat((data, pd.DataFrame(predict_y)), axis=1)
result.rename({0: u'聚类结果'}, axis=1, inplace=True)  # '聚类结果' = 'clustering result'
print(result)
예제 #57
0
# reviews_per_month: replace null with 0
df["reviews_per_month"] = df["reviews_per_month"].fillna(0)

# last_review data: convert it to numeric value
df["last_review"] = pd.to_datetime(df["last_review"],
                                   infer_datetime_format=True)
earliest_last_review = min(df["last_review"])
df["last_review"] = df["last_review"].fillna(earliest_last_review)
df["last_review"] = df["last_review"].apply(
    lambda review_date: review_date.toordinal(
    ) - earliest_last_review.toordinal())

# neighbourhood: label encoding
neighbourhood_encoder = LabelEncoder()
neighbourhood_labels = neighbourhood_encoder.fit_transform(df["neighbourhood"])
df["neighbourhood"] = neighbourhood_labels
# retain the mapping of neighbourhood and encoded values
# neighbourhood_dict = dict(zip(neighbourhood_encoder.classes_, range(len(neighbourhood_encoder.classes_))))

# room_type: label encoding
room_encoder = LabelEncoder()
room_labels = room_encoder.fit_transform(df["room_type"])
df["room_type"] = room_labels
# retain the mapping of room_type and encoded values
# room_dict = dict(zip(room_encoder.classes_, range(len(room_encoder.classes_))))

# convert feature to log(1 + feature)
df["price"] = np.log1p(df["price"])

#######################
a = pd.get_dummies(test_set[['flag','protocol_type']])
test_set = pd.concat([test_set,a],axis=1)
test_set=test_set.drop(['Unnamed: 0','flag','protocol_type'],axis=1)
#train_set['flag']=le.fit_transform(train_set['flag'])

#~/~

#exclude the 'label' attribute from set
test_set.drop(['label'],inplace=True,axis=1)
    
#feature scaling
test_set=test_set.astype(float)
from sklearn.preprocessing import MinMaxScaler
for each_column in test_set:
    test_set[each_column] = MinMaxScaler().fit_transform(test_set[each_column].values.reshape(len(test_set),-1))

#save to file
full_test_set = pd.concat([test_set,test_labels],axis=1)
full_test_set.to_csv('~/dataset/preprocessed_test_dos_kdd99.csv')    

#transform label to numeric representation
#(ideally fit the encoder on the training labels and call transform() here, so test codes match the training codes)
test_labels = le.fit_transform(test_labels)

#train model
#from sklearn.neighbors import KNeighborsClassifier
#clf=KNeighborsClassifier(n_neighbors=5,algorithm='ball_tree',leaf_size=500)
#from time import time
#t0=time()
#clf.fit(features,labels)
#tt=time()-t0
#print('classifier trained in {} seconds'.format(round(tt,3)))
#Exploring the Target Variable. Our target variable is the Member Type column
data['Member Type'].value_counts()

#Checking for missing values in any column/features
data.isnull().sum()

#Convert the Categorical Values to Numerical to allow us to perform plotting
#import the library LabelEncoder
from sklearn.preprocessing import LabelEncoder
#Create a list with categorical predictors
cat_var = ['Start station', 'End station', 'Bike Number', 'Member Type']
#Initiate LabelEncoder
le = LabelEncoder()
#A for loop to transform the categorical values to numerical values
for n in cat_var:
    data[n] = le.fit_transform(data[n])

#Checking for the type of the predictors afterwards
data.dtypes

#Explore the relationship between duration and member type
data.plot(x='Duration', y='Member Type', style='*')
plt.title('Duration of Bike Use')
plt.xlabel('Duration')
plt.ylabel('Member Type')
plt.show()

#Explore the relationship between Start Station and member type
data.plot(x='Start station', y='Member Type', style='*')
plt.title('Start station by Member Type')
plt.xlabel('Start station')
    
    
#Class distribution
sns.countplot(data['class'])

##Feature distribution
for i in data.columns[:-1]:
    plt.figure(figsize=(12,6))
    plt.title("For feature '%s'"%i)
    sns.countplot(data[i],hue=data['class'])
    
#Modelling starts here with label encoding
le=LabelEncoder()

for i in data.columns:
    data[i]=le.fit_transform(data[i])
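The loop above re-fits the same encoder on every column, so `le` only keeps the mapping of the last column; if the original category names need to be recovered later, one encoder per column can be kept instead (an alternative sketch -- do not run both versions on the same data):

encoders = {}
for col in data.columns:
    encoders[col] = LabelEncoder()
    data[col] = encoders[col].fit_transform(data[col])

# e.g. recover the original class names afterwards:
# original_class = encoders['class'].inverse_transform(data['class'])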
    
    
##X and Y variables
X=data[data.columns[:-1]]
y=data['class']

##Train test split and building the model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=22)

logreg=LogisticRegression(solver='newton-cg',multi_class='multinomial')


logreg.fit(X_train,y_train)

pred=logreg.predict(X_test)