Example #1
def process_one_cell(df_train, df_test, grid_id, th):
    """
    Classification inside one grid cell.
    """
    # Working on df_train
    df_cell_train = df_train.loc[df_train.grid_cell == grid_id]
    place_counts = df_cell_train.place_id.value_counts()
    mask = (place_counts[df_cell_train.place_id.values] >= th).values
    df_cell_train = df_cell_train.loc[mask]

    # Working on df_test
    df_cell_test = df_test.loc[df_test.grid_cell == grid_id]
    row_ids = df_cell_test.index

    # Preparing data
    le = LabelEncoder()
    y = le.fit_transform(df_cell_train.place_id.values)
    X = df_cell_train.drop(['place_id', 'grid_cell'], axis=1).values.astype(int)
    X_test = df_cell_test.drop(['grid_cell'], axis=1).values.astype(int)

    # Applying the classifier
    clf = KNeighborsClassifier(n_neighbors=conf['neighbours'], weights='distance',
                               metric='manhattan')
    clf.fit(X, y)
    y_pred = clf.predict_proba(X_test)
    pred_labels = le.inverse_transform(np.argsort(y_pred, axis=1)[:, ::-1][:, :3])
    return pred_labels, row_ids
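Most of the snippets on this page extract the top-3 classes with the same np.argsort(y_pred, axis=1)[:, ::-1][:, :3] idiom and then map the column indices back through the encoder. A minimal standalone sketch of that idiom on made-up probabilities; note that passing the 2-D index array straight to inverse_transform, as the examples here do, relies on older scikit-learn behaviour, while indexing le.classes_ directly is equivalent and works everywhere:

import numpy as np
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder().fit([111, 222, 333, 444])           # hypothetical place_id values
y_pred = np.array([[0.1, 0.6, 0.2, 0.1],
                   [0.3, 0.1, 0.2, 0.4]])               # 2 samples x 4 classes
top3_idx = np.argsort(y_pred, axis=1)[:, ::-1][:, :3]   # column indices, best first
top3_labels = le.classes_[top3_idx]                     # what the 2-D inverse_transform used to return
print(top3_labels)                                      # [[222 333 444] [444 111 333]]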
Example #2
File: module4_knn.py Project: mircean/ML
def process_one_cell(df_cell_train, df_cell_test):
    
    #Working on df_train
    place_counts = df_cell_train.place_id.value_counts()
    mask = (place_counts[df_cell_train.place_id.values] >= 5).values
    df_cell_train = df_cell_train.loc[mask]
    
    #Working on df_test
    row_ids = df_cell_test.index
    
    #Feature engineering on x and y
    df_cell_train.loc[:,'x'] *= 462.0
    df_cell_train.loc[:,'y'] *= 975.0
    df_cell_test.loc[:,'x'] *= 462.0
    df_cell_test.loc[:,'y'] *= 975.0

    #Preparing data
    le = LabelEncoder()
    y = le.fit_transform(df_cell_train.place_id.values)
    X = df_cell_train.drop(['place_id'], axis=1).values
    
    #Applying the classifier, ct = 5.3 #5.1282
    clf = KNeighborsClassifier(n_neighbors=np.floor(np.sqrt(y.size)/5.2).astype(int), 
                            weights=calculate_distance,metric='manhattan',n_jobs=2)
    clf.fit(X, y)
    y_pred = clf.predict_proba(df_cell_test.values)
    ##1
    #pred_labels = le.inverse_transform(np.argsort(y_pred, axis=1)[:,::-1][:,:3]) 
    pred_labels = le.inverse_transform(np.argsort(y_pred, axis=1)[:,::-1][:,:n_topx]) 
    
    return pred_labels, row_ids
class labelOnehotEnc():
    def __init__(self):
        self.le = LabelEncoder()
        self.oe = OneHotEncoder(sparse=False)
    def label_fit(self, x):
        feature = self.le.fit_transform(x)
        self.oe = OneHotEncoder(sparse=False)
        return self.oe.fit_transform(feature.reshape(-1, 1))
    def onehot_inverse(self, x):
        self.indices = []
        for t in range(len(x)):
            ind = np.argmax(x[t])
            self.indices.append(ind)
        return self.le.inverse_transform(self.indices)
    def inverse_label(self, x):
        return self.le.inverse_transform(x)
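A hedged usage sketch for the labelOnehotEnc helper above (it assumes a scikit-learn version in which OneHotEncoder still accepts the sparse= keyword, exactly as the class itself does):

enc = labelOnehotEnc()
onehot = enc.label_fit(['cat', 'dog', 'cat', 'bird'])  # one row per sample, one column per class
print(onehot)
print(enc.onehot_inverse(onehot))                      # ['cat' 'dog' 'cat' 'bird']
print(enc.inverse_label([0, 1, 2]))                    # ['bird' 'cat' 'dog']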
def test_hard_vote():
    X,y,test_X,test_Y =get_test_data()

    print("bag of words")
    bow = BagOfWordsClassifier()
    bow_probs = bow.get_proba(X,y,test_X,prefix="t")

    print("direct attribute")
    da = DirectAttributeClassifier()
    da_probs = da.get_proba(X,y,test_X,prefix="t")

    # zip() returns a lazy iterator in Python 3, so materialize it before indexing below
    probs = list(zip(*[item for p in [bow_probs, da_probs] for item in p]))
    #train_probs = probs[0]
    test_probs = probs[1]
    print(len(test_probs))
    preds = [x.idxmax(1) for x in test_probs]
    pred = np.zeros(len(preds[0]),dtype=np.int8)
    print(len(pred))
    for i in range(len(preds[0])):
        votes = [p[i] for p in preds]
        print(votes)
        pred[i]= max(set(votes),key=votes.count)
        print(pred[i])
    from sklearn.preprocessing import LabelEncoder
    le = LabelEncoder()
    le.fit(y)
    pred = le.inverse_transform(pred)

    print(metrics.accuracy_score(test_Y,pred))

    """
Example #5
    def test_same_inverse_transform(self):
        Y, Y_rdd = self.make_dense_randint_rdd(low=0, high=10, shape=(1000,))

        local = LabelEncoder().fit(Y)
        dist = SparkLabelEncoder().fit(Y_rdd)

        assert_array_equal(local.inverse_transform(Y), dist.inverse_transform(Y_rdd).toarray())
Example #6
class Classifier(BaseEstimator):
    def __init__(self):
        self.label_encoder = LabelEncoder()
        self.scaler = StandardScaler()
        self.clf = None        
 
    def fit(self, X, y):        
        X = self.scaler.fit_transform(X.astype(np.float32))              
        y = self.label_encoder.fit_transform(y).astype(np.int32)
        dtrain = xgb.DMatrix( X, label=y.astype(np.float32))
        
        param = {'objective':'multi:softprob', 'eval_metric':'mlogloss'}
        param['nthread'] = 4
        param['num_class'] = 9
        param['colsample_bytree'] = 0.55
        param['subsample'] = 0.85
        param['gamma'] = 0.95
        param['min_child_weight'] = 3.0
        param['eta'] = 0.05
        param['max_depth'] = 12
        num_round = 400  # reduced from 820 to keep training fast
        #num_round = 820
        
        self.clf = xgb.train(param, dtrain, num_round)  
 
    def predict(self, X):
        X = self.scaler.transform(X.astype(np.float32))
        dtest = xgb.DMatrix(X)       
        label_index_array = np.argmax(self.clf.predict(dtest), axis=1)
        return self.label_encoder.inverse_transform(label_index_array)
 
    def predict_proba(self, X):
        X = self.scaler.transform(X.astype(np.float32))
        dtest = xgb.DMatrix(X)
        return self.clf.predict(dtest)
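A hedged usage sketch for the wrapper above, on synthetic data with the nine classes its parameters hard-code (num_class = 9); the feature matrix and labels are made up, and the snippet's own imports (xgboost as xgb, numpy as np, the sklearn preprocessing classes) are assumed to be in scope:

import numpy as np

rng = np.random.RandomState(0)
X_demo = rng.rand(300, 10).astype(np.float32)
y_demo = np.array(list("ABCDEFGHI") * 34)[:300]   # 9 string classes, matching num_class = 9

clf = Classifier()
clf.fit(X_demo, y_demo)
print(clf.predict(X_demo[:5]))                # original string labels, via inverse_transform
print(clf.predict_proba(X_demo[:2]).shape)    # (2, 9) class probabilities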
def process_one_cell(df_cell_train, df_cell_test):
    
    #Working on df_train
    place_counts = df_cell_train.place_id.value_counts()
    mask = (place_counts[df_cell_train.place_id.values] >= 8).values
    df_cell_train = df_cell_train.loc[mask]
    
    #Working on df_test
    row_ids = df_cell_test.index
    
    #Feature engineering on x and y
    df_cell_train.loc[:,'x'] *= 500.0
    df_cell_train.loc[:,'y'] *= 1000.0
    df_cell_test.loc[:,'x'] *= 500.0
    df_cell_test.loc[:,'y'] *= 1000.0
    
    #Preparing data
    le = LabelEncoder()
    y = le.fit_transform(df_cell_train.place_id.values)
    X = df_cell_train.drop(['place_id'], axis=1).values
    X_test = df_cell_test.values

    #Applying the classifier
    clf = KNeighborsClassifier(n_neighbors=36, weights=calculate_distance, 
                               metric='manhattan')
    clf.fit(X, y)
    y_pred = clf.predict_proba(X_test)
    pred_labels = le.inverse_transform(np.argsort(y_pred, axis=1)[:,::-1][:,:3]) 
    
    return pred_labels, row_ids
def test_vote_soft():
    X,y,test_X,test_Y =get_test_data()

    print("bag of words")
    bow = BagOfWordsClassifier()
    bow_probs = bow.get_proba(X,y,test_X,prefix="t")

    print("direct attribute")
    da = DirectAttributeClassifier()
    da_probs = da.get_proba(X,y,test_X,prefix="t")

    # zip() returns a lazy iterator in Python 3, so materialize it before indexing below
    probs = list(zip(*[item for p in [bow_probs, da_probs] for item in p]))
    train_probs = probs[0]
    test_probs = probs[1]
    print(len(train_probs))
    for prob in train_probs:
        print(prob.shape)
        print(type(prob))
    #train_attr = reduce(lambda a,b:a+b,train_probs)
    test_attr = reduce(lambda a,b:a+b,test_probs)

    pred = test_attr.idxmax(1)
    from sklearn.preprocessing import LabelEncoder
    le = LabelEncoder()
    le.fit(y)
    pred = le.inverse_transform(pred)

    print(metrics.accuracy_score(test_Y,pred))
Example #9
def process_1_grid(df_train, df_test, grid, threshold):

	# Creating data with the particular grid id.
	df_train_1_grid = df_train.loc[df_train.grid_num == grid]
	df_test_1_grid = df_test.loc[df_test.grid_num == grid]
	place_counts = df_train_1_grid.place_id.value_counts()
	mask = (place_counts[df_train_1_grid.place_id.values] >= threshold).values
	df_train_1_grid = df_train_1_grid.loc[mask]
	# Label Encoding
	le = LabelEncoder()
	labels = le.fit_transform(df_train_1_grid.place_id.values)
	
	# Computing train and test feature data for grid grid.
	X = df_train_1_grid.drop(['place_id','grid_num'], axis=1).values.astype(int)
	X_test = df_test_1_grid.drop(['grid_num'], axis=1).values.astype(int)
	row_id = df_test_1_grid.index
	
	# KNN Classifier 
	clf = KNeighborsClassifier(n_neighbors=20, weights= 'distance', metric='manhattan')
	#clf = GaussianNB()
	# Training of the classifier
	#clf = XGBClassifier(max_depth=10, learning_rate=0.5, n_estimators=25,objective='multi:softprob', subsample=0.5, colsample_bytree=0.5, seed=0)                  
	clf.fit(X,labels)

	
	# Predicting probabilities for each of the label for test data.
	prob_y = clf.predict_proba(X_test)
	
	# Transforming back to labels from One hot Encoding
	pred_labels = le.inverse_transform(np.argsort(prob_y, axis=1)[:,::-1][:,:3])
	return pred_labels, row_id
Example #10
def one_partition_NDCG(x ,labels ,model ,i ,factor):
    le = LabelEncoder()
    y = le.fit_transform(labels)   
    piv_train = x.shape[0]
    trans_x = []
    trans_y = []
    test_x = []
    test_y = []
    if i == 0:
        trans_x = x[(i+1)*factor:] 
        trans_y = y[(i+1)*factor:] 
        test_x = x[:(i+1)*factor]
        test_y = y[:(i+1)*factor]
    elif i+1 == piv_train/factor:
        trans_x = x[:i*factor] 
        trans_y = y[:i*factor] 
        test_x = x[i*factor:]
        test_y = y[i*factor:]
    else:
        trans_x = np.concatenate((x[:i*factor],x[(i+1)*factor:]))
        trans_y = np.concatenate((y[:i*factor],y[(i+1)*factor:]))
        test_x = x[i*factor:(i+1)*factor]
        test_y = y[i*factor:(i+1)*factor]
    model.fit(trans_x,trans_y)
    y_pred = model.predict_proba(test_x)
    ids = []  
    cts = []  
    for j in range(factor):
        cts += [le.inverse_transform(np.argsort(y_pred[j])[::-1])[:5].tolist()]
    preds = pd.DataFrame(cts)
    truth = pd.Series(labels[i*factor:(i+1)*factor])
    #truth = pd.Series(le.inverse_transform(test_y).tolist())
    return mean_NDCG(preds, truth)
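The slicing above builds contiguous leave-one-block-out folds by hand. Assuming factor divides the number of rows, the same test blocks come out of scikit-learn's KFold without shuffling, which this small standalone check illustrates:

import numpy as np
from sklearn.model_selection import KFold

x_demo = np.arange(20).reshape(10, 2)   # 10 rows, factor = 2 -> 5 contiguous blocks
factor = 2
for i, (train_idx, test_idx) in enumerate(KFold(n_splits=len(x_demo) // factor).split(x_demo)):
    assert np.array_equal(test_idx, np.arange(i * factor, (i + 1) * factor))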
Example #11
def process_one_cell(df_train, df_test, grid_id, th):
    """   
    Classification inside one grid cell.
    """   
    #Working on df_train
    df_cell_train = df_train.loc[df_train.grid_cell == grid_id]
    place_counts = df_cell_train.place_id.value_counts()
    mask = (place_counts[df_cell_train.place_id.values] >= th).values
    df_cell_train = df_cell_train.loc[mask]

    #Working on df_test
    df_cell_test = df_test.loc[df_test.grid_cell == grid_id]
    row_ids = df_cell_test.index
    
    #Preparing data
    le = LabelEncoder()
    y = le.fit_transform(df_cell_train.place_id.values)
    X = df_cell_train.drop(['place_id', 'grid_cell'], axis=1).values.astype(int)
    X_test = df_cell_test.drop(['grid_cell'], axis = 1).values.astype(int)
    
    #Applying the classifier
    X_ = xgb.DMatrix(X, label=y)
    X_t = xgb.DMatrix(X_test)
    boost = xgb.train({'eta': 0.1, 'objective': 'multi:softprob', 'num_class': len(le.classes_), 'alpha': 0.1, 'lambda': 0.1, 'booster': 'gbtree'},
                      X_, num_boost_round = 75)
    y_pred = boost.predict(X_t)
    pred_labels = le.inverse_transform(np.argsort(y_pred, axis=1)[:,::-1][:,:3])    
    return pred_labels, row_ids
def process_one_cell_df(train_cell, test_cell, g):
    """
    Return:
    ------    
    pred_labels: numpy ndarray
                 Array with the prediction of the top 3 labels for each sample
    row_ids: IDs of the samples in the submission dataframe 
    """   

    train = np.frombuffer(shared_train).reshape(train_x, train_y)
    test = np.frombuffer(shared_test).reshape(test_x, test_y)

    if (train_cell[0] >= train_cell[1]) | (test_cell[0] >= test_cell[1]):
        return None, None
    row_ids = test[test_cell[0]:test_cell[1], 0].astype(int)

    le = LabelEncoder()
    y = le.fit_transform(train[train_cell[0]:train_cell[1], 0])
    X = train[train_cell[0]:train_cell[1], 1:]

    clf = create_classifier(g.clf, y.size)
    clf.fit(X, y)
    
    X_test = test[test_cell[0]:test_cell[1], 1:]
    y_prob = clf.predict_proba(X_test)

    pred_y = np.argsort(y_prob, axis=1)[:,::-1][:,:g.top]
    pred_labels = le.inverse_transform(pred_y).astype(np.int64)
    
    labs = pd.DataFrame(pred_labels, index=row_ids)
    labs.index.name = "row_id"
    probs = pd.DataFrame(y_prob[np.arange(len(y_prob)).reshape(-1,1), pred_y], index=row_ids)
    probs.index.name = "row_id"
    
    return labs, probs
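This variant reads the training and test matrices out of shared memory via np.frombuffer(shared_train), so a multiprocessing pool can process many cells without copying the data into every worker. A minimal sketch of how such a buffer could be prepared; the global names (shared_train, train_x, train_y) and the layout (label in column 0, features afterwards) are assumptions read off the slicing above:

import numpy as np
from multiprocessing import RawArray

train_np = np.random.rand(100, 5)             # hypothetical [label, feature...] matrix
train_x, train_y = train_np.shape
shared_train = RawArray('d', train_np.size)   # raw double buffer that worker processes can inherit
np.frombuffer(shared_train).reshape(train_x, train_y)[:] = train_np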
Example #13
class Classifier(BaseEstimator):
    def __init__(self):
        self.label_encoder = LabelEncoder()
        self.scaler = StandardScaler()
        self.clf = None
        self.param = {'eval_metric':'mlogloss'}
        self.param['num_class'] = 9
        self.param['subsample'] = 0.795
        self.param['gamma'] = 0.9        
        self.num_round = 170
        self.obj = 'multi:softprob'
 
    def fit(self, X, y):        
        X = self.scaler.fit_transform(X.astype(np.float32))              
        y = self.label_encoder.fit_transform(y).astype(np.int32)
        dtrain = xgb.DMatrix( X, label=y.astype(np.float32))
        
        self.param['objective'] = self.obj  
        self.clf = xgb.train(self.param, dtrain, self.num_round)  
 
    def predict(self, X):
        X = self.scaler.transform(X.astype(np.float32))
        dtest = xgb.DMatrix(X)       
        label_index_array = np.argmax(self.clf.predict(dtest), axis=1)
        return self.label_encoder.inverse_transform(label_index_array)
 
    def predict_proba(self, X):
        X = self.scaler.transform(X.astype(np.float32))
        dtest = xgb.DMatrix(X)
        return self.clf.predict(dtest)
    def process_cell(self, df_cell_train, df_cell_test, window):

        place_counts = df_cell_train.place_id.value_counts()
        mask = (place_counts[df_cell_train.place_id.values] >= th).values
        df_cell_train = df_cell_train.loc[mask]

        # Working on df_test
        row_ids = df_cell_test.index

        # Preparing data
        le = LabelEncoder()
        y = le.fit_transform(df_cell_train.place_id.values)
        X = df_cell_train.drop(['place_id', ], axis=1).values.astype(int)
        X_test = df_cell_test.values.astype(int)

        # Applying the classifier
        clf1 = KNeighborsClassifier(n_neighbors=50, weights='distance',
                                    metric='manhattan')
        clf2 = RandomForestClassifier(n_estimators=50, n_jobs=-1)
        eclf = VotingClassifier(estimators=[('knn', clf1), ('rf', clf2)], voting='soft')

        eclf.fit(X, y)
        y_pred = eclf.predict_proba(X_test)
        pred_labels = le.inverse_transform(np.argsort(y_pred, axis=1)[:, ::-1][:, :3])
        return pred_labels, row_ids
Example #15
class LogisticRegression:
    """
    Logistic regression.
    Minimize regularized log-loss:
        L(x, y|w) = - sum_i log p(y_i|x_i, w) + l2 ||w||^2
        p(y|x, w) = exp(w[y].x) / (sum_y' exp(w[y'].x))

    Parameters
    ----------
    l2: float, default=0
        L2 regularization strength
    """
    def __init__(self, l2=0):
        self.l2 = l2
        self.loss = LogisticLoss()

    def fit(self, X, y):
        self.label_encoder_ = LabelEncoder()
        y = self.label_encoder_.fit_transform(y).astype(numpy.int32)
        self.n_classes = len(numpy.unique(y))
        self.coef_ = numpy.zeros((X.shape[1] + 1) * (self.n_classes - 1), dtype=numpy.float64)
        dataset = IntegerDataset(X, y)
        self.loss.fit(dataset, self.coef_, self.l2)
        return self

    def predict(self, X):
        n_features = self.coef_.size // (self.n_classes - 1) - 1
        assert X.shape[1] == n_features
        return self.label_encoder_.inverse_transform(self.loss.predict(n_features, self.n_classes, self.coef_, X))

    def predict_proba(self, X):
        n_features = self.coef_.size // (self.n_classes - 1) - 1
        assert X.shape[1] == n_features
        return self.loss.predict_proba(n_features, self.n_classes, self.coef_, X)
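As a quick numerical illustration of the probability model in the docstring, p(y|x, w) is a softmax over the per-class scores w[y].x; a tiny standalone check with arbitrary weights and features (this mirrors the formula, not the class's internal (n_classes - 1) parameterization):

import numpy

rng = numpy.random.default_rng(0)
W = rng.normal(size=(3, 5))                # 3 classes, 4 features + bias weight
x = numpy.append(rng.normal(size=4), 1.0)  # feature vector with bias term appended
scores = W @ x
p = numpy.exp(scores - scores.max())       # subtract the max for numerical stability
p /= p.sum()
print(p, p.sum())                          # a valid distribution that sums to 1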
Example #16
class PipelineNet(NeuralNet): # By default Lasagne is super finicky with inputs and outputs. So I just handle most of the pre and postprocessing for you.
    def fit(self,X, y,**params):
        self.label_encoder = LabelEncoder()
        self.one_hot = OneHotEncoder()

        y = list(map(lambda x:[x],self.label_encoder.fit_transform(y)))
        y = np.array(self.one_hot.fit_transform(y).toarray(),dtype=np.float32)
        X = np.array(X,dtype=np.float32)

        self.output_num_units=len(y[0])
        self.input_shape=(None,X.shape[1])

        self.output_nonlinearity=lasagne.nonlinearities.softmax

        return NeuralNet.fit(self,X,y,**params)

    def predict(self, X):
        X = np.array(X,dtype=np.float32)
        preds = NeuralNet.predict(self,X)

        preds = np.argmax(preds,axis=1)
        preds = self.label_encoder.inverse_transform(preds)

        return preds

    def score(self, X, y):
        return sklearn.metrics.accuracy_score(self.predict(X),y)
def process_grid_cell(train, test, grid_id, threshold, model, grid_variable):
    """ Creates model and generates predictions for row_ids in a particular grid cell.
    """
    start = time.time()
    # Filter data onto single grid cell
    train_cell = train[train[grid_variable] == grid_id]
    test_cell = test[test[grid_variable] == grid_id]
    test_ids = test_cell.index
    
    # Remove place ids from train data with frequency below threshold
    place_counts = train_cell.place_id.value_counts()
    mask = place_counts[train_cell.place_id.values] >= threshold
    train_cell = train_cell.loc[mask.values]

    # Encode place id as labels
    le = LabelEncoder()
    y_train = le.fit_transform(train_cell.place_id.values)
    X_train = train_cell.drop(['place_id', grid_variable], axis = 1).values
    X_test = test_cell.drop([grid_variable], axis = 1).values
    
    # Build training classifier and predict
    model.fit(X_train, y_train)
    y_pred = model.predict_proba(X_test)

    pred_labels = le.inverse_transform(np.argsort(y_pred, axis=1)[:,::-1][:,:3])   
    end = time.time()
    time_elapsed = (end - start)
    
    # Return data
    return pred_labels, test_ids, time_elapsed
Example #18
    def fit(self, df_X, df_y):
        if not df_y.shape[0] == df_X.shape[0]:
            raise ValueError("number of regions is not equal")
        if df_y.shape[1] != 1:
            raise ValueError("y needs to have 1 label column")

        le = LabelEncoder()
        y = le.fit_transform(df_y.iloc[:,0].values)

        clf = RandomForestClassifier(n_estimators=100)
        
        # Multiclass
        if len(le.classes_) > 2:
            orc = OneVsRestClassifier(clf)
            orc.fit(df_X.values, y)

            importances = np.array([c.feature_importances_ for c in orc.estimators_]).T
        else: # Only two classes
            clf.fit(df_X.values, y)
            importances = np.array([
                clf.feature_importances_,
                clf.feature_importances_
                ]).T
        
        for i,c in enumerate(le.classes_):
            
            # compare against the encoded label index i (y holds the encoded labels)
            diff = df_X.loc[y == i].quantile(q=0.75) - df_X.loc[y != i].quantile(q=0.75)
            sign = (diff >= 0) * 2 - 1
            importances[:,i] *= sign
        
        
        # create output DataFrame
        self.act_ = pd.DataFrame(importances,
                columns=le.inverse_transform(range(len(le.classes_))),
                index=df_X.columns)
Example #19
def test_label_encoder():
    """Test LabelEncoder's transform and inverse_transform methods"""
    le = LabelEncoder()
    le.fit([1, 1, 4, 5, -1, 0])
    assert_array_equal(le.classes_, [-1, 0, 1, 4, 5])
    assert_array_equal(le.transform([0, 1, 4, 4, 5, -1, -1]), [1, 2, 3, 3, 4, 0, 0])
    assert_array_equal(le.inverse_transform([1, 2, 3, 3, 4, 0, 0]), [0, 1, 4, 4, 5, -1, -1])
    assert_raises(ValueError, le.transform, [0, 6])
Example #20
class EnsembleClassifier(BaseEstimator, ClassifierMixin, TransformerMixin):
    
    def __init__(self, clfs, voting = 'hard', weights = None):
        self.clfs = clfs
        self.named_clfs = {key:value for key, value in _name_estimators(clfs)}
        self.voting = voting
        self.weights = weights
        
    def fit(self, X, y):
        self.le_ = LabelEncoder()
        self.le_.fit(y)
        self.classes_ = self.le_.classes_
        self.clfs_ = []
        for clf in self.clfs:
            fitted_clf = clone(clf).fit(X, self.le_.transform(y))
            self.clfs_.append(fitted_clf)
        return self
    
    def predict(self, X):
        if self.voting == 'soft':
            maj = np.argmax(self.predict_proba(X), axis=1)
        else:  # 'hard' voting
            predictions = self._predict(X)
            maj = np.apply_along_axis(
                                      lambda x: np.argmax(np.bincount(x, weights = self.weights)), axis = 1, arr = predictions
                                      )

        maj = self.le_.inverse_transform(maj)
        return maj
    
    def predict_proba(self, X):
        avg = np.average(self._predict_probas(X), axis=0, weights=self.weights)
        return avg
    
    def transform(self, X):
        if self.voting == 'soft':
            return self._predict_probas(X)
        else:
            return self._predict(X)
    
    def get_params(self, deep=True):
        """ Return estimator parameter names for GridSearch support"""
        if not deep:
            return super(EnsembleClassifier, self).get_params(deep=False)
        else:
            out = self.named_clfs.copy()
            for name, step in six.iteritems(self.named_clfs):
                for key, value in six.iteritems(step.get_params(deep=True)):
                    out['%s__%s' % (name, key)] = value
            return out

    def _predict(self, X):
        """ Collect results from clf.predict calls. """
        return np.asarray([clf.predict(X) for clf in self.clfs_]).T

    def _predict_probas(self, X):
        """ Collect results from clf.predict calls. """
        return np.asarray([clf.predict_proba(X) for clf in self.clfs_])
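A hedged usage sketch for the ensemble above (it assumes the imports the class relies on, such as LabelEncoder, clone and _name_estimators, are already in scope); the base estimators and data are arbitrary choices for illustration:

from sklearn.datasets import load_iris
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

X_iris, y_iris = load_iris(return_X_y=True)
ens = EnsembleClassifier([KNeighborsClassifier(n_neighbors=5),
                          DecisionTreeClassifier(random_state=0)],
                         voting='soft')
ens.fit(X_iris, y_iris)
print(ens.predict(X_iris[:5]))        # votes mapped back through le_.inverse_transform
print(ens.predict_proba(X_iris[:2]))  # average of the members' class probabilities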
Example #21
def test_label_encoder_string_labels():
    """Test LabelEncoder's transform and inverse_transform methods with
    non-numeric labels"""
    le = LabelEncoder()
    le.fit(["paris", "paris", "tokyo", "amsterdam"])
    assert_array_equal(le.classes_, ["amsterdam", "paris", "tokyo"])
    assert_array_equal(le.transform(["tokyo", "tokyo", "paris"]), [2, 2, 1])
    assert_array_equal(le.inverse_transform([2, 2, 1]), ["tokyo", "tokyo", "paris"])
    assert_raises(ValueError, le.transform, ["london"])
Example #22
def process_one_cell(df_train, df_test, grid_id, th):
    """
    Does all the processing inside a single grid cell: Computes the training
    and test sets inside the cell. Fits a classifier to the training data
    and predicts on the test data. Selects the top 3 predictions.

    Parameters:
    ----------
    df_train: pandas DataFrame
              Training set
    df_test: pandas DataFrame
             Test set
    grid_id: int
             The id of the grid to be analyzed
    th: int
       Threshold for place_id. Only samples with place_id with at least th
       occurrences are kept in the training set.

    Return:
    ------
    pred_labels: numpy ndarray
                 Array with the prediction of the top 3 labels for each sample
    row_ids: IDs of the samples in the submission dataframe
    """
    # Working on df_train
    df_cell_train = df_train.loc[df_train.grid_cell == grid_id]
    place_counts = df_cell_train.place_id.value_counts()
    mask = place_counts[df_cell_train.place_id.values] >= th
    df_cell_train = df_cell_train.loc[mask.values]

    # Working on df_test
    df_cell_test = df_test.loc[df_test.grid_cell == grid_id]
    row_ids = df_cell_test.index

    le = LabelEncoder()
    y = le.fit_transform(df_cell_train.place_id.values)
    X = df_cell_train.drop(['place_id', 'grid_cell'], axis=1).values
    X_test = df_cell_test.drop(['grid_cell'], axis=1).values
    if (X_test.shape[0] > 0):
        # Training Classifier
        if (X.shape[0] == 0):
            print("empty training set - grid_id:"+str(grid_id))

        ##clf = SGDClassifier(loss='modified_huber', n_iter=1, random_state=0, n_jobs=-1)
        #clf = RandomForestClassifier(n_estimators=200)
        #clf = KNeighborsClassifier(n_neighbors=25, weights='distance',
        #                           metric='manhattan')
        clf = tree.DecisionTreeClassifier()

        clf.fit(X, y)
        y_pred = clf.predict_proba(X_test)

        pred_labels = le.inverse_transform(np.argsort(y_pred, axis=1)[:, ::-1][:, :3])
        return pred_labels, row_ids
    else:
        print("X_test.shape == 0 ... ")
        return [], row_ids
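A hedged driver sketch for a cell function like the one above: loop over the grid cells, collect each cell's top-3 labels, and write them out in the space-separated place_id submission format. The tiny frames here are synthetic stand-ins, and the loop only runs end-to-end on a scikit-learn version whose inverse_transform accepts the 2-D index array used inside the function:

import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
df_train = pd.DataFrame({'x': rng.rand(200), 'y': rng.rand(200),
                         'place_id': rng.choice([111, 222, 333], 200),
                         'grid_cell': rng.randint(0, 4, 200)})
df_test = pd.DataFrame({'x': rng.rand(50), 'y': rng.rand(50),
                        'grid_cell': rng.randint(0, 4, 50)})

preds = {}
for grid_id in df_train.grid_cell.unique():
    pred_labels, row_ids = process_one_cell(df_train, df_test, grid_id, th=2)
    for row_id, labels in zip(row_ids, pred_labels):
        preds[row_id] = ' '.join(str(label) for label in labels)

sub = pd.Series(preds, name='place_id')
sub.index.name = 'row_id'
sub.to_csv('sub_tree.csv', header=True)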
Example #23
File: knn_plus.py Project: itenyh/kaggle
def process_one_cell(df_train, df_test, x_min, x_max, y_min, y_max):

    x_border_augment = 0.025
    y_border_augment = 0.0125

    #Working on df_train
    df_cell_train = df_train[(df_train['x'] >= x_min-x_border_augment) & (df_train['x'] < x_max+x_border_augment) &
                               (df_train['y'] >= y_min-y_border_augment) & (df_train['y'] < y_max+y_border_augment)]
    place_counts = df_cell_train.place_id.value_counts()
    mask = (place_counts[df_cell_train.place_id.values] >= th).values
    df_cell_train = df_cell_train.loc[mask]

    #Working on df_test
    # to be delete: df_cell_test = df_test.loc[df_test.grid_cell == grid_id]
    df_cell_test = df_test[(df_test['x'] >= x_min) & (df_test['x'] < x_max) &
                               (df_test['y'] >= y_min) & (df_test['y'] < y_max)]
    row_ids = df_cell_test.index

    if(len(df_cell_train) == 0 or len(df_cell_test) == 0):
        return None, None

    #Feature engineering on x and y
    df_cell_train.loc[:,'x'] *= fw[0]
    df_cell_train.loc[:,'y'] *= fw[1]
    df_cell_test.loc[:,'x'] *= fw[0]
    df_cell_test.loc[:,'y'] *= fw[1]

    #Preparing data
    le = LabelEncoder()
    y = le.fit_transform(df_cell_train.place_id.values)
    X = df_cell_train.drop(['place_id'], axis=1).values.astype(float)

    if 'place_id' in df_cell_test.columns:

        cols = df_cell_test.columns
        cols = cols.drop('place_id')

        X_test = df_cell_test[cols].values.astype(float)

    else:

        X_test = df_cell_test.values.astype(float)

    #Applying the classifier
    # clf = KNeighborsClassifier(n_neighbors=26, weights='distance',
    #                            metric='manhattan')
    clf1 = BaggingClassifier(KNeighborsClassifier(n_neighbors=26, weights='distance',
                                metric='manhattan'), n_jobs=-1, n_estimators=50)
    clf2 = RandomForestClassifier(n_estimators=100, n_jobs=-1)

    # soft voting is required here: predict_proba is not available with voting='hard'
    eclf = VotingClassifier(estimators=[('knn', clf1), ('rf', clf2)], voting='soft')

    eclf.fit(X, y)
    y_pred = eclf.predict_proba(X_test)
    pred_labels = le.inverse_transform(np.argsort(y_pred, axis=1)[:,::-1][:,:3])

    return pred_labels, row_ids
Example #24
File: xgboost.py Project: DAVINDAI/xgboost
class XGBClassifier(XGBModel, XGBClassifierBase):
    __doc__ = """
    Implementation of the scikit-learn API for XGBoost classification
    """ + "\n".join(XGBModel.__doc__.split('\n')[2:])

    def __init__(self, max_depth=3, learning_rate=0.1, n_estimators=100, silent=True, objective="binary:logistic",
                 nthread=-1, gamma=0, min_child_weight=1, max_delta_step=0, subsample=1, colsample_bytree=1,
                 base_score=0.5, seed=0, missing=None):
        super(XGBClassifier, self).__init__(max_depth, learning_rate, n_estimators, silent, objective,
                                            nthread, gamma, min_child_weight, max_delta_step, subsample,
                                            colsample_bytree,
                                            base_score, seed, missing)

    def fit(self, X, y, sample_weight=None):
        self.classes_ = list(np.unique(y))
        self.n_classes_ = len(self.classes_)
        if self.n_classes_ > 2:
            # Switch to using a multiclass objective in the underlying XGB instance
            self.objective = "multi:softprob"
            xgb_options = self.get_xgb_params()
            xgb_options['num_class'] = self.n_classes_
        else:
            xgb_options = self.get_xgb_params()

        self._le = LabelEncoder().fit(y)
        training_labels = self._le.transform(y)

        if sample_weight is not None:
            trainDmatrix = DMatrix(X, label=training_labels, weight=sample_weight,
                                   missing=self.missing)
        else:
            trainDmatrix = DMatrix(X, label=training_labels,
                                   missing=self.missing)

        self._Booster = train(xgb_options, trainDmatrix, self.n_estimators)

        return self

    def predict(self, X):
        testDmatrix = DMatrix(X, missing=self.missing)
        class_probs = self.booster().predict(testDmatrix)
        if len(class_probs.shape) > 1:
            column_indexes = np.argmax(class_probs, axis=1)
        else:
            column_indexes = np.repeat(0, X.shape[0])
            column_indexes[class_probs > 0.5] = 1
        return self._le.inverse_transform(column_indexes)

    def predict_proba(self, X):
        testDmatrix = DMatrix(X, missing=self.missing)
        class_probs = self.booster().predict(testDmatrix)
        if self.objective == "multi:softprob":
            return class_probs
        else:
            classone_probs = class_probs
            classzero_probs = 1.0 - classone_probs
            return np.vstack((classzero_probs, classone_probs)).transpose()
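The binary branch of predict_proba above turns a single vector of P(class 1) into an (n, 2) matrix with one column per class; the idiom in isolation:

import numpy as np

classone_probs = np.array([0.2, 0.9, 0.5])
probs = np.vstack((1.0 - classone_probs, classone_probs)).transpose()
print(probs)   # column 0 = P(class 0), column 1 = P(class 1)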
def fit_predict_proba_2clf(X, y, test):
    
    #return test;

    le = LabelEncoder()
    y = le.fit_transform(y)
    
    clf1 = KNeighborsClassifier(n_neighbors=20, 
                                weights=lambda x: x ** -2, metric='manhattan',n_jobs=-1)
    
    #clf1 = RandomForestClassifier(n_estimators=150, max_depth=None, n_jobs=-1,
    #                              min_samples_split=4, random_state=0, criterion='entropy')
    
    clf2 = RandomForestClassifier(n_estimators=150, max_depth=None, n_jobs=-1,
                                  min_samples_split=4, random_state=0, criterion='gini')
 
    preds_level1 = pd.DataFrame()
    
    row_ids = test.index.values
    
    clf1.fit(X, y)
    y_pred_1 = clf1.predict_proba(test)
    y_pred_1 = dict(zip(le.inverse_transform(clf1.classes_), zip(*y_pred_1)))
    y_pred_1 = pd.DataFrame.from_dict(y_pred_1)
    
    y_pred_1['row_id'] = row_ids
    y_pred_1 = y_pred_1.set_index('row_id')
    y_pred_1.index.name = 'row_id';
    
    clf2.fit(X, y)
    y_pred_2 = clf2.predict_proba(test)
    y_pred_2 = dict(zip(le.inverse_transform(clf2.classes_), zip(*y_pred_2)))
    y_pred_2 = pd.DataFrame.from_dict(y_pred_2)
    
    y_pred_2['row_id'] = row_ids
    y_pred_2 = y_pred_2.set_index('row_id')
    y_pred_2.index.name = 'row_id';
    all_columns = y_pred_1.columns
    y_pred_1.rename(columns = lambda x: str(x)+'_1', inplace=True)
    y_pred_2.rename(columns = lambda x: str(x)+'_2', inplace=True)
    
    preds_level1 = pd.concat([y_pred_1, y_pred_2], axis=1)
    #print preds_level1.shape
    return preds_level1
Example #26
class BaseClassifier(BaseEstimator):

    def predict_proba(self, X):
        if len(self.classes_) != 2:
            raise NotImplementedError("predict_(log_)proba only supported"
                                      " for binary classification")

        if self.loss == "log":
            df = self.decision_function(X).ravel()
            prob = 1.0 / (1.0 + np.exp(-df))
        elif self.loss == "modified_huber":
            df = self.decision_function(X).ravel()
            prob = np.minimum(1, np.maximum(-1, df))
            prob += 1
            prob /= 2
        else:
            raise NotImplementedError("predict_(log_)proba only supported when"
                                      " loss='log' or loss='modified_huber' "
                                      "(%s given)" % self.loss)

        out = np.zeros((X.shape[0], 2), dtype=np.float64)
        out[:, 1] = prob
        out[:, 0] = 1 - prob

        return out

    def _set_label_transformers(self, y, reencode=False, neg_label=-1):
        if reencode:
            self.label_encoder_ = LabelEncoder()
            y = self.label_encoder_.fit_transform(y).astype(np.int32)
        else:
            y = y.astype(np.int32)

        self.label_binarizer_ = LabelBinarizer(neg_label=neg_label,
                                               pos_label=1)
        self.label_binarizer_.fit(y)
        self.classes_ = self.label_binarizer_.classes_.astype(np.int32)
        n_classes = len(self.label_binarizer_.classes_)
        n_vectors = 1 if n_classes <= 2 else n_classes

        return y, n_classes, n_vectors

    def decision_function(self, X):
        pred = safe_sparse_dot(X, self.coef_.T)
        if hasattr(self, "intercept_"):
            pred += self.intercept_
        return pred

    def predict(self, X):
        pred = self.decision_function(X)
        out = self.label_binarizer_.inverse_transform(pred)

        if hasattr(self, "label_encoder_"):
            out = self.label_encoder_.inverse_transform(out)

        return out
class Dataset:
    def __init__(self, frame_size=40, hop_size=3):
        self.frame_size = frame_size
        self.hop_size = hop_size

        path = get_file('nietzsche.txt',
            origin="https://s3.amazonaws.com/text-datasets/nietzsche.txt")
        self.text = open(path).read().lower()
        print('corpus length:', len(self.text))

        chars = sorted(list(set(self.text)))
        self.class_count = len(chars)
        print('total chars:', self.class_count)

        self.le = LabelEncoder().fit(chars)

        self.text_ohe = self.text_to_ohe(self.text)

        def split_to_frames(values, frame_size, hop_size):
            """
            Split to overlapping frames.
            """
            # build the frames as a list: newer NumPy rejects a bare generator here
            return np.stack([values[i:i + frame_size] for i in
                range(0, len(values) - frame_size + 1, hop_size)])

        def split_features_targets(frames):
            """
            Split each frame to features (all but last element)
            and targets (last element).
            """
            frame_size = frames.shape[1]
            X = frames[:, :frame_size - 1]
            y = frames[:, -1]
            return X, y

        # cut the text in semi-redundant sequences of frame_size characters
        self.X, self.y = split_features_targets(split_to_frames(
            self.text_ohe, frame_size + 1, hop_size))

        print('X.shape:', self.X.shape, 'y.shape:', self.y.shape)

    def ohe_to_text(self, text_ohe):
        return self.le_to_text(text_ohe.argmax(axis=1))

    def text_to_ohe(self, text):
        return self.le_to_ohe(self.text_to_le(list(text)))

    def le_to_text(self, text_le):
        return ''.join(self.le.inverse_transform(text_le))

    def text_to_le(self, text):
        return self.le.transform(text)

    def le_to_ohe(self, text_le):
        return to_categorical(text_le, nb_classes=self.class_count)
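A standalone illustration of the overlapping framing that split_to_frames performs (toy sequence, frame_size=4, hop_size=2); split_features_targets then keeps the last element of each frame as the prediction target:

import numpy as np

values = np.arange(10)
frames = np.stack([values[i:i + 4] for i in range(0, len(values) - 4 + 1, 2)])
print(frames)
# [[0 1 2 3]
#  [2 3 4 5]
#  [4 5 6 7]
#  [6 7 8 9]]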
class EnsembleClassifier(BaseEstimator, ClassifierMixin, TransformerMixin):

    def __init__(self,clfs,voting='hard',weights=None):

        self.clfs       = clfs
        if voting in ('hard','soft'):
            self.voting = voting
        if weights is not None and len(clfs) == len(weights):
            self.weights    = weights
        else:
            self.weights    = None

        self.le         = LabelEncoder()

    def fit(self,X,y):

        for clf in self.clfs:
            clf.fit(X,y)

        self.le.fit(y)

        return self

    def predict(self,X):

        if 'soft' == self.voting:
            average     = self.predict_proba(X)
            majority    = self.le.inverse_transform(np.argmax(average,axis=1))

        else:
            self.classes    = self.predict_classes(X)
            self.classes    = np.asarray([self.classes[:,c] for c in range(self.classes.shape[1])])

            if self.weights:
                self.classes    = np.concatenate([np.tile(self.classes[:,c,None],w) for w,c in zip(self.weights,range(self.classes.shape[1]))],axis=1)

            majority        = np.apply_along_axis(lambda x:np.argmax(np.bincount(x)),axis=1,arr=self.classes)

        return majority

    def transform(self, X):
        if self.weights:
            return self.predict_proba(X)
        else:
            return self.predict_classes(X)

    def predict_proba(self,X):

        self.probability    = np.asarray([clf.predict_proba(X) for clf in self.clfs])
        return np.average(self.probability,axis=0,weights=self.weights)

    def predict_classes(self,X):
        return np.asarray([clf.predict(X) for clf in self.clfs])
Example #29
def process_one_cell(df_cell_train, df_cell_test):


    #Working on df_train
    place_counts = df_cell_train.place_id.value_counts()
    mask = (place_counts[df_cell_train.place_id.values] >= 8).values
    df_cell_train = df_cell_train.loc[mask]

    #Working on df_test
    row_ids = df_cell_test.index

    #Feature engineering on x and y
    df_cell_train.loc[:,'x'] *= 500.0
    df_cell_train.loc[:,'y'] *= 1000.0
    df_cell_test.loc[:,'x'] *= 500.0
    df_cell_test.loc[:,'y'] *= 1000.0

    #print "Preparing data"
    le = LabelEncoder()
    y = le.fit_transform(df_cell_train.place_id.values)
    X = df_cell_train.drop(['place_id'], axis=1).values
    X_test = df_cell_test.values

    #Applying the classifier
    clf_knn = KNeighborsClassifier(n_neighbors=36, weights=calculate_distance,
                               metric='manhattan')
    clf_knn.fit(X, y)
    y_pred_knn = clf_knn.predict_proba(X_test)

    params = {'n_estimators': 100, 'subsample': 0.95, 'learning_rate': 0.15, 'colsample_bytree': 0.7, 'min_child_weight': 6.0}

    clf_xgb = xgb.XGBClassifier(**params)
    clf_xgb.fit(X, y)
    y_pred_xgb = clf_xgb.predict_proba(X_test)

    # scikit-learn expects integers here (a float min_samples_split is read as a fraction)
    paras_rf = {'min_samples_split': 7, 'max_depth': 12, 'random_state': 1234}


    clf_rf = RandomForestClassifier(**paras_rf)

    clf_rf.fit(X, y)
    y_pred_rf = clf_rf.predict_proba(X_test)

    #6 4 1
    #3 1 0
    ytotal = 1 * y_pred_xgb + 0.2 * y_pred_knn + 0.07 * y_pred_rf

    #ytotal = y_pred_knn

    pred_labels = le.inverse_transform(np.argsort(ytotal, axis=1)[:,::-1][:,:5])


    return pred_labels, row_ids
class MajorityVoteClassifier(BaseEstimator,
                             ClassifierMixin):
    def __init__(self, classifiers,
                 vote='classlabel', weights=None):
        self.classifiers = classifiers
        self.named_classifiers = {key: value for
                                  key, value in
                                  _name_estimators(classifiers)}
        self.vote = vote
        self.weights = weights
        
    def fit(self, x, y):
        self.lablenc_ = LabelEncoder()
        self.lablenc_.fit(y)
        self.classes_ = self.lablenc_.classes_
        self.classifiers_ = []
        for clf in self.classifiers:
            fitted_clf = clone(clf).fit(x,
                                       self.lablenc_.transform(y))
            self.classifiers_.append(fitted_clf)
        return self
    
    def predict(self, x):
        if self.vote == 'probability':
            maj_vote = np.argmax(self.predict_proba(x),
                                 axis=1)
        else:
            predictions = np.asarray([clf.predict(x)
                                      for clf in
                                      self.classifiers_]).T
            
            maj_vote = np.apply_along_axis(lambda x: np.argmax(np.bincount(x, weights=self.weights)),
                                           axis=1,
                                           arr=predictions)
        # map the encoded majority vote back to the original class labels for both vote modes
        maj_vote = self.lablenc_.inverse_transform(maj_vote)
        return maj_vote
        
    def predict_proba(self, x):
        probas = np.asarray([clf.predict_proba(x)
                             for clf in self.classifiers_])
        avg_proba = np.average(probas, axis=0, weights=self.weights)
        return avg_proba
    
    def get_params(self, deep=True):
        if not deep:
            return super(MajorityVoteClassifier, self).get_params(deep=False)
        else:
            out = self.named_classifiers.copy()
            for name, step in six.iteritems(self.named_classifiers):
                for key, value in six.iteritems(step.get_params(deep=True)):
                    out['%s__%s' % (name, key)] = value
            return out
Example #31
class AutoConverter():
    def __init__(self,
                 target,
                 strategy='auto',
                 coltype_converters={},
                 column_converters={},
                 use_column_converter_only=True,
                 n_jobs=1):
        """Big wrapping class for convertors

        Args:
            target (str): target column name
            strategy (str): {'auto'}
            coltype_converters (dict): dict of customized Transformers
            column_converters (dict): dict of customized column transformers
            use_column_converter_only (bool): Use only column converter or not
            n_jobs (int): n_jobs parameter for FeatureUnion

        column_converters will be applied to columns on a priority basis.
        If use_column_converter == True (default value),
        pre-defined transformers in TransformerCatalog will NOT be applied.

        Therefore, giving an empty list to a column can be used to "ignore"
        the column for feature extraction.

        In the following example, only TfIdfVectorizer with default parameters
        will be applied to "Name" column and no transformer will be applied to
        "Age" column.

        column_converters={"Name": [(TfIdfVectorizer, {})],
                           "Age": []}

        """

        self.target = target
        self.strategy = strategy
        self.feature_names = []
        self.X = None
        self.y = None
        self.hasdata = False
        self.target_le = LabelEncoder()
        self.subtables_ = None
        self.converter_catalog = None
        self.set_converter(coltype_converters)
        self.column_converters = column_converters
        self.use_column_converter_only = use_column_converter_only
        self.n_jobs = n_jobs

    def set_converter(self, coltype_converters):
        """Insert customized transformers into self.converter_catalog."""
        # TODO(Yoshi): Technically, dict.update overwrites existing entries
        # We might want to "append" instead. To be discussed.
        self.converter_catalog = (
            DefaultTransformerCatalog.transformer_dict.copy())
        self.converter_catalog.update(coltype_converters)

    def fit(self, df, subtables=None, y=None, custom_types={}):
        """Fits the data to the custom_converters

        Args:
            df (pd.DataFrame): main dataframe table

            subtables (dictionary): dictionary of subtables with keys for
                linking them to main table. Default: None.
                subtables =
                    {tabname(str) : { "table": (pd.Dataframe),
                        "link_key": (str) main table column name,
                        "group_key": (str) this table column name,
                        "custom_aggregator": (dict) col_type:aggregator_class}}
                Example:
                    {"school_table": {"table": school_df,
                                      "link_key": "school_id",
                                      "group_key": "id",
                                      "custom_aggregator": {"text":
                                          CustomTextAggregator()}
                                     }
                    }
            custom_types (dictionary): dictionary of col_types that forcibly
                overrides the col_type_dict built by the auto converter

        Returns:
            self

        """
        assert self.target in df

        # filtering None
        df.dropna(subset=[self.target], inplace=True)
        # filtering NaN
        df = df[df[self.target].notnull()]

        self.target_le.fit(df[self.target].as_matrix())

        X_df = df.drop(self.target, axis=1)

        # 1. typing columns
        self.colname_type_dict = type_columns(X_df)
        if isinstance(custom_types, dict):
            self.colname_type_dict.update(custom_types)

        # 2. Pre-imputing missing values for textual column
        for colname in X_df.columns:
            if (self.colname_type_dict[colname] == 'text'
                    or self.colname_type_dict[colname] == 'categorical'
                    or self.colname_type_dict[colname] == 'text_ja'):
                X_df.loc[:, colname] = X_df[colname].fillna("NaN").astype(str)

        # 3. create feature union
        transformer_list = []
        for colname in X_df.columns:

            if colname in self.column_converters:
                for transformer_cls, kwargs in self.column_converters[colname]:
                    transformer_list.append(
                        (u"{}.{}".format(colname, transformer_cls.__name__),
                         transformer_cls(colname, **kwargs)))
                if self.use_column_converter_only:
                    # Since transformer(s) are defined by users,
                    # skip automatic assignment of transformers for this column
                    continue

            assert colname in self.colname_type_dict
            coltype = self.colname_type_dict[colname]

            if coltype == 'ignore':
                continue

            if coltype == 'date':
                # we don't want to pass np array to date transformer,
                # instead we pass pandas df
                # TODO(Yoshi): This is hard-coded??
                d = DateTransformer(colname=colname)
                transformer_list.append((u"{}.{}".format(colname, 'date'), d))
                continue

            t_dict = self.converter_catalog[coltype]
            for transformer in t_dict:
                transformer_cls = transformer[0]
                kwargs = transformer[1]
                transformer_list.append(
                    (u"{}.{}".format(colname, transformer_cls.__name__),
                     transformer_cls(colname, **kwargs)))

        # 4. fit feature union
        if transformer_list:  # if there's something to transform
            self.fu = HeterogeneousFeatureUnion(transformer_list,
                                                n_jobs=self.n_jobs)
            self.fu.fit(X_df)

            feature_names = list(
                map(lambda x: 'main..' + text_type(x),
                    self.fu.get_feature_names()))
        else:  # empty main table (only target and ignore types)
            # we assume there exist information in subtables then
            if not subtables:
                raise ValueError("There's nothing to transform")
            self.fu = None
            feature_names = []

        # defining Aggregator structure and fitting the tables in
        if subtables:
            self.subtables_ = subtables

            for key in sorted(list(subtables.keys())):
                subtable_dict = subtables[key]

                if subtable_dict['link_key'] not in X_df.columns:
                    raise KeyError("Link key " + subtable_dict['link_key'] +
                                   " does not exist in the main table")

                aggr = AutoAggregator(group_key=subtable_dict['group_key'],
                                      custom_aggregators=subtable_dict.get(
                                          "custom_aggregator", {}))
                self.subtables_[key]['aggr'] = aggr
                aggr.fit(subtable_dict['table'])
                self.colname_type_dict[key] = aggr.colname_type_dict.copy()

                # gathering feature names from subtables
                append_list = list(
                    map(lambda x: text_type(key) + '..' + text_type(x),
                        aggr.feature_names))
                feature_names.extend(append_list)

        self.feature_names = feature_names

        return self

    def transform(self, df, subtables=None, prediction=False):
        """Transforms data to feature matrix

        Args:
            df (pandas.DataFrame): data to transform
            subtables (dictionary): dictionary of subtables with keys for
                linking them to main table. Default: None.
                subtables =
                    {tabname(str) : { "table": (pd.Dataframe),
                        "link_key": (str) main table column name,
                        "group_key": (str) this table column name }}
                Example:
                    {"school_table": {"table": shool_pd},
                                      "link_key": "school_id",
                                      "group_key": "id" }
                    }

            prediction (bool): Returns only X if True

        Returns:
            X (numpy.ndarray): feature matrix
            y (array-like of shape [n_samples]): target vector

        """

        if not prediction:
            # filtering None
            df.dropna(subset=[self.target], inplace=True)
            # filtering NaN
            df = df[df[self.target].notnull()]

            # TODO(Yoshi): Should display Warning message when transform
            # is called with prediction=False if self.hasdata is True
            if self.hasdata:
                print("[WARNING] This instance already has been fitted.")
            assert self.target in df
            y_unique = df[self.target].unique()

            if len(y_unique) == 1 and np.isnan(y_unique[0]):
                # this just leaves y equal to a np.nan vector of the same size
                # TODO(Yoshi): This should raise exception.
                #              Will revise here after specifying exceptions
                y = df[self.target]
            else:
                y = self.target_le.transform(df[self.target].as_matrix())

            X_df = df.drop(self.target, axis=1)

        else:
            # Prediction
            if self.subtables_ is not None:
                assert subtables is not None

            if self.target in df:
                X_df = df.drop(self.target, axis=1)
            else:
                X_df = df

        # TODO(later): Pre-imputing. This part could be redundant
        for colname in X_df.columns:
            if self.colname_type_dict[colname] in [
                    'categorical', 'text', 'text_ja'
            ]:
                X_df.loc[:, colname] = X_df[colname].fillna("NaN").astype(str)

        if self.fu:
            X = self.fu.transform(X_df)
        else:
            # Creating the empty matrix of the same size to use it later during
            # data aggregation, since we can't use feature union in absence of
            # features
            X = np.empty([X_df.shape[0], 0])

        # Ad-hoc way to convert sparse matrix into numpy.array and replace NaN
        # values with 0.0
        if type(X) == sp.sparse.csr.csr_matrix:
            X = X.toarray()
        X[np.isnan(X)] = 0.0

        # transforming subtables and concating them with main table feature
        # matrix
        if subtables:
            # TODO(Kate): make sure that subtables passed and subtables stored
            # have the same structure. Any ideas?
            X_gather = pd.DataFrame(X)

            for key in sorted(list(subtables.keys())):
                subtable = subtables[key]
                aggr = subtable['aggr']
                link_key = subtable['link_key']
                X_sub = aggr.transform(subtable['table'])
                # combine X_gather with subtable['link_key']

                if link_key in X_gather.columns.tolist():
                    raise KeyError('column already exists in a dataframe' +
                                   link_key)

                X_gather[link_key] = df[link_key]
                # X_sub is already a pd.DataFrame with group_key included
                # as index
                X_gather = X_gather.merge(X_sub,
                                          how='left',
                                          left_on=link_key,
                                          right_index=True)
                # make sure we don't leave anything(index) behind ;)
                del X_gather[link_key]

                # do something with get_feature_names

            X = X_gather.as_matrix()

        # TODO(Yoshi): Post pre-processing such as missing value imputation
        # TODO(Yoshi): Tentative naive replacement of NaN values
        X = np.nan_to_num(X)

        if not prediction:
            self.X = X
            self.y = y
            self.hasdata = True
            return [X, y]
        else:
            return X

    def fit_transform(self, df, subtables=None, y=None):
        """Fit + Transform

        Args:
            df (pandas.DataFrame): main df
            subtables (dict): dictionary of subtables

        Returns:
            X (numpy.ndarray): feature matrix
            y (array-like of shape [n_samples]): target vector

        """

        return self.fit(df, subtables).transform(df, subtables)

    def index2label(self, predictions):
        """Transforms predictions from numerical format back to labels

        Args:
            predictions (np.array): array of label numbers

        Returns:
            labels (np.array): array of label values

        """

        return self.target_le.inverse_transform(predictions)

    def get_feature_names(self, colname=None):
        """Returns feature names

        Args:
            colname (str or tuple): column name
            if colname is a tuple (subtable name, colname)
            if None returns all feature names
            (default: None)

        Returns:
            feature_names (list)

        """

        if colname is None:
            if len(self.feature_names) == 0:
                # TODO(Yoshi): Do we want to use a "trained" flag instead?
                print("[WARNING]:",
                      "AutoConverter instance has extracted no feature.",
                      "Probably, it has not been fit to data yet.")
            return self.feature_names

        # Use tuple (or list) to handle subtable feature names
        if type(colname) in [tuple, list]:
            # TODO(Yoshi): replace with Exception
            assert len(colname) == 2
            colname_ = "..".join(colname)
        else:
            # colname is in main table
            colname_ = "main..{}".format(colname)

        colname_idx_list = list(
            filter(lambda x: colname_ in x[1], enumerate(self.feature_names)))
        colname_list = list(map(lambda x: x[1], colname_idx_list))

        return colname_list

    def save(self, filepath, overwrite=False):
        """Save AutoConverter object as pickle file

        Args:
            filepath (str): Output pickle filepath
            overwrite (bool): Overwrites a file with the same name if true

        Returns:
            success_flag (bool)

        """
        if not overwrite and os.path.exists(filepath):
            # TODO(Yoshi): Warning handling
            print("File already exists. Skip.")
            return False

        with open(filepath, "wb") as fout:
            pickle.dump(self, fout)

        return True

    @classmethod
    def load(cls, filepath):
        """Load AutoConverter object from pickle file

        Args:
            filepath (str): Input pickle filepath

        Returns:
            AutoLearn object

        """
        with open(filepath, "rb") as fin:
            obj = pickle.load(fin)
        assert obj.__class__.__name__ == 'AutoConverter'
        return obj
Example #32
from sklearn.model_selection import train_test_split

my_features = my_combined['Cluster'].values.reshape(my_combined.shape[0], 1)
print(np.unique(my_features))
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
rgrps = [0, 1, 2]
le = LabelEncoder()
le.fit(rgrps)

ohe = OneHotEncoder(sparse=False)
le_data = le.transform(my_features).reshape(my_combined.shape[0], 1)
ohecluster = ohe.fit_transform(le_data)
print(ohecluster)
enc = [0, 0, 1]
print(le.inverse_transform(np.argmax(enc).reshape(1, 1)))

labels = my_combined['Purchased'].values.reshape(my_combined.shape[0], 1)
print(labels)

# Evaluate the model by splitting into train and test sets
# Notice the stratify keyword argument.
# Roughly 40% of our data are lost contracts and 60% are won contracts.
# We want our random testing and training data sets to have close to this same ratio.
# Otherwise, we might be training or testing based on a biased sample.
x_train, x_test, y_train, y_test = train_test_split(ohecluster,
                                                    labels,
                                                    test_size=0.4,
                                                    stratify=labels,
                                                    random_state=23)
lr_model = model.fit(x_train, y_train)
Example #33
0
def main():
    file = "../../Resources/data/AudioFile/livefile.wav"

    sns.set()  # Use seaborn's default style to make attractive graphs

    # Plot nice figures using Python's "standard" matplotlib library
    snd = parselmouth.Sound(file)
    plt.figure(figsize=(15, 5))
    plt.plot(snd.xs(), snd.values.T)
    plt.xlim([snd.xmin, snd.xmax])
    plt.xlabel("time [s]")
    plt.ylabel("amplitude")
    #plt.show() or plt.savefig("Resources/images/sound.png")
    plt.savefig("../../Resources/images/sound.png")

    def draw_spectrogram(spectrogram, dynamic_range=70):
        X, Y = spectrogram.x_grid(), spectrogram.y_grid()
        sg_db = 10 * np.log10(spectrogram.values)
        plt.pcolormesh(X,
                       Y,
                       sg_db,
                       vmin=sg_db.max() - dynamic_range,
                       cmap='afmhot')
        plt.ylim([spectrogram.ymin, spectrogram.ymax])
        plt.xlabel("time [s]")
        plt.ylabel("frequency [Hz]")

    def draw_intensity(intensity):
        plt.plot(intensity.xs(), intensity.values.T, linewidth=3, color='w')
        plt.plot(intensity.xs(), intensity.values.T, linewidth=1)
        plt.grid(False)
        plt.ylim(0)
        plt.ylabel("intensity [dB]")

    intensity = snd.to_intensity()
    spectrogram = snd.to_spectrogram()
    plt.figure()
    draw_spectrogram(spectrogram)
    plt.twinx()
    draw_intensity(intensity)
    plt.xlim([snd.xmin, snd.xmax])
    plt.savefig("../../Resources/images/spectrogram.png")

    def draw_pitch(pitch):
        # Extract selected pitch contour, and
        # replace unvoiced samples by NaN to not plot
        pitch_values = pitch.selected_array['frequency']
        pitch_values[pitch_values == 0] = np.nan
        plt.plot(pitch.xs(), pitch_values, 'o', markersize=5, color='w')
        plt.plot(pitch.xs(), pitch_values, 'o', markersize=2)
        plt.grid(False)
        plt.ylim(0, pitch.ceiling)
        plt.ylabel("fundamental frequency [Hz]")

    pitch = snd.to_pitch()
    # If desired, pre-emphasize the sound fragment before calculating the spectrogram
    pre_emphasized_snd = snd.copy()
    pre_emphasized_snd.pre_emphasize()
    spectrogram = pre_emphasized_snd.to_spectrogram(window_length=0.03,
                                                    maximum_frequency=8000)
    plt.figure()
    draw_spectrogram(spectrogram)
    plt.twinx()
    draw_pitch(pitch)
    plt.xlim([snd.xmin, snd.xmax])
    plt.savefig("../../Resources/images/spectrogram_0.03.png")

    #livedf= pd.DataFrame(columns=['feature'])
    X, sample_rate = librosa.load(file,
                                  res_type='kaiser_fast',
                                  duration=2.5,
                                  sr=22050 * 2,
                                  offset=0.5)
    sample_rate = np.array(sample_rate)
    mfccs = np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=13),
                    axis=0)
    featurelive = mfccs
    livedf2 = featurelive

    livedf2 = pd.DataFrame(data=livedf2)
    livedf2 = livedf2.stack().to_frame().T
    livedf2

    json_file = open('model.json', 'r')
    loaded_model_json = json_file.read()
    json_file.close()
    loaded_model = model_from_json(loaded_model_json)
    # load weights into new model
    loaded_model.load_weights("saved_models/Emotion_Voice_Detection_Model.h5")

    twodim = np.expand_dims(livedf2, axis=2)

    livepreds = loaded_model.predict(twodim, batch_size=32, verbose=1)

    livepreds1 = livepreds.argmax(axis=1)

    liveabc = livepreds1.astype(int).flatten()
    print(liveabc)
    lb = LabelEncoder()
    y_train = load('y_train.npy', allow_pickle=True)
    y_test = load('y_test.npy', allow_pickle=True)
    y_train = np_utils.to_categorical(lb.fit_transform(y_train))
    y_test = np_utils.to_categorical(lb.fit_transform(y_test))
    livepredictions = str(lb.inverse_transform((liveabc))[0])
    gender_emotion = livepredictions.split('_')
    gender = gender_emotion[0].capitalize()
    emotion = gender_emotion[1].capitalize()
    return gender, emotion
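One caveat in the snippet above: fit_transform is called on both y_train and y_test, so the encoder is re-fit twice and the inverse mapping used for the live prediction only matches the trained model if both label sets contain exactly the same classes. A safer sketch of that step, reusing the variables above:

lb = LabelEncoder()
y_train = np_utils.to_categorical(lb.fit_transform(y_train))   # fit once, on the training labels
y_test = np_utils.to_categorical(lb.transform(y_test))         # reuse the same mapping for the test labels
livepredictions = str(lb.inverse_transform(liveabc)[0])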
Example #34
0
class BoWS(BaseEstimator, TransformerMixin):
    def __init__(self, min_df=2, stop_words='english', alpha=0.1):
        self.min_df = min_df
        self.stop_words = stop_words
        self._cv = StemmedTfidfVectorizer(min_df=self.min_df,
                                          stop_words=self.stop_words)
        self._le = LabelEncoder()
        self.alpha = alpha
        self._fitted_ = False
        self.models_ = []

    def __del__(self):
        del self.models_[:]

    def fit(self, X_texts, y=None):
        if y is None:
            raise TypeError("y can't be None")

        a = list(zip(X_texts, y))
        shuffle(a)
        X_texts, scores = list(zip(*a))
        X_texts = list(X_texts)
        y = list(scores)

        X_TF = self._cv.fit_transform(X_texts).tocsr()
        X = self._build_binary_cooccur_matrix_(X_TF)
        y = self._normalize_y_(y)

        self._build_auxiliar_features(X, y)
        self._build_class_models_()

        del self.Ntc_
        del self.Nt_
        del self.Pt_

        self._fitted_ = True

        return self

    def transform(self, X_texts):
        if not self._fitted_:
            raise TypeError("The model did'nt fit yet!")

        X_TF = self._cv.transform(X_texts).tocsr()
        X = self._build_binary_cooccur_matrix_(X_TF)
        X_classes = {}
        for c in range(self.C_):
            X_classes[self._le.inverse_transform(
                [c])[0]] = transform_class_repr(
                    X, self.models_[c].copy()).multiply(X_TF)
        return X_classes

    def _build_binary_cooccur_matrix_(self, X_TF):
        X = sp.csr_matrix((np.ones(len(X_TF.data)), X_TF.nonzero()),
                          shape=X_TF.shape)
        del X_TF
        return X

    def _normalize_y_(self, y):
        return self._le.fit_transform(y)

    def _build_auxiliar_features(self, X, y):
        # number of documents
        self.N_ = X.shape[0]

        # vocabulary size
        self.V_ = X.shape[1]

        # Number of classes
        self.C_ = max(y) + 1

        # Count of each co-occurrence per class
        self.Ntc_ = [
            sp.lil_matrix((self.V_, self.V_)) for _ in range(max(y) + 1)
        ]
        for i, doc_matrix in tqdm(enumerate(generate_lines(X)),
                                  total=self.N_,
                                  desc='Building class representations'):
            self.Ntc_[y[i]] = (self.Ntc_[y[i]] + doc_matrix)
        ### Remove the main diagonal
        for i in range(len(self.Ntc_)):
            self.Ntc_[i].setdiag(0)
            self.Ntc_[i].eliminate_zeros()

        # frequency of each co-occurrence per class
        self.Nt_ = np.sum(self.Ntc_)

        # prior of each term P(t)
        self.Pt_ = self.Nt_ / self.N_
        self.Pt_.eliminate_zeros()

    def _build_class_models_(self):
        self.models_ = []
        for i in tqdm(range(self.C_), total=self.C_, desc='Building Models'):
            # Probability P(t,c)
            data = np.array(self.Ntc_[i][self.Nt_.nonzero()] /
                            self.Nt_[self.Nt_.nonzero()])[0]
            Ptc = sp.csr_matrix((data, self.Nt_.nonzero()),
                                shape=self.Nt_.shape)

            # Jelinek-Mercer smoothing
            norm_Ptc = (1. - self.alpha) * Ptc + self.alpha * self.Pt_

            # P*sqrt(n)
            data1 = np.multiply(norm_Ptc[norm_Ptc.nonzero()],
                                np.sqrt(self.Ntc_[i][norm_Ptc.nonzero()]))
            # 2*sqrt( p(1-p) )
            data2 = 2. * np.sqrt(
                np.multiply(norm_Ptc[norm_Ptc.nonzero()],
                            1. - norm_Ptc[norm_Ptc.nonzero()]))

            CI_dominance_smooth = sp.csr_matrix(
                (np.array(data1 / data2)[0], norm_Ptc.nonzero()),
                shape=norm_Ptc.shape)

            del data
            del data1
            del data2

            max_prob = (1. - self.alpha
                        ) * Ptc.data.max() + self.alpha * self.Pt_.data.max()
            max_size_ic = (max_prob * np.sqrt(self.Ntc_[i].data.max())) / (
                2. * np.sqrt(max_prob * (1. - max_prob)))

            CI_dominance_smoooth_norm = CI_dominance_smooth / max_size_ic
            CI_dominance_smoooth_norm.eliminate_zeros()

            self.models_.append(CI_dominance_smoooth_norm)
            del CI_dominance_smooth
            del norm_Ptc
            del Ptc
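A hypothetical usage sketch for the BoWS class above; StemmedTfidfVectorizer, transform_class_repr and generate_lines are assumed to be provided elsewhere in the project, and the texts and labels are placeholders:

bows = BoWS(min_df=2, stop_words='english', alpha=0.1)
bows.fit(train_texts, train_labels)        # train_texts: list of str, train_labels: list of class labels
per_class = bows.transform(test_texts)     # dict mapping each original class label to a sparse matrix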
Example #35
0
File: xgb.py Project: brian50208lee/ML2017
	for cv_train_index, cv_test_index in kf:
		xg_train = xgboost.DMatrix(train.values[cv_train_index, :], label=train_labels.iloc[cv_train_index].values.flatten())
		xg_test = xgboost.DMatrix(train.values[cv_test_index, :], label=train_labels.iloc[cv_test_index].values.flatten())

		xgclassifier = 	xgboost.train(
							params, xg_train, 
							num_boost_round=params['num_round'], 
							evals=[(xg_train, 'train'), (xg_test, 'test')],
							early_stopping_rounds=50
						)
		all_best_rounds.append(xgclassifier.best_iteration)
	best_boost_round = int(np.mean(all_best_rounds))
	print('The best n_rounds is %d' % best_boost_round)

	# build final model
	xg_train = xgboost.DMatrix(train, label=train_labels.values.flatten())
	xg_test = xgboost.DMatrix(test)

	final_round = int(best_boost_round * 1.2)
	xgclassifier = xgboost.train(params, xg_train, final_round, evals=[(xg_train, 'train')])
	xgclassifier.save_model(best_model_path)

# prediction
print('writing to file')
preds = xgclassifier.predict(xg_test).astype(int)
preds = label_encoder.inverse_transform(preds)
submission_file = pd.read_csv(submission_format_path, index_col=0)  # DataFrame.from_csv was removed in newer pandas
submission_file['status_group'] = preds
submission_file.to_csv(prediction_path)

Example #36
0
    model = tf.keras.Sequential([
        tf.keras.layers.Dense(16,
                              input_shape=(len(features), ),
                              activation='relu',
                              name='fc1'),  #layer 1
        tf.keras.layers.Dense(8, activation='relu', name='fc2'),  #layer 2
        tf.keras.layers.Dense(num_classes, activation='softmax', name='output')
    ])

    model.compile(optimizer='adam',
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    return model


model = get_compiled_model()
model.fit(X_train, y_train, batch_size=100, epochs=200)

results = model.evaluate(X_test, y_test)

print('Final test set loss: {:4f}'.format(results[0]))
print('Final test set accuracy: {:4f}'.format(results[1]))

pitches = le.inverse_transform(data["pitch_type"].unique())

data["pitch_type"].unique()

some_data = data.sample(n=1)
ynew = model.predict_classes(some_data[features])
print(some_data)
print(le.inverse_transform(ynew))
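Note that Sequential.predict_classes was removed in recent Keras/TensorFlow releases; an equivalent sketch for newer versions, using the same objects as above:

probs = model.predict(some_data[features])
ynew = np.argmax(probs, axis=-1)           # class index with the highest softmax probability
print(le.inverse_transform(ynew))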
Example #37
0
class AirQuality:

    dataset = ""
    x = ""
    y = ""
    x_train = ""
    x_test = ""
    y_train = ""
    y_test = ""
    RandomForestModel = ""
    XgbModel = ""
    SvmModel = ""
    DecisionTreeModel = ""
    le_X_city = ""
    le_X_date = ""
    le_Y = ""

    def readCsv(self, file_name):

        #Importing the datset
        self.dataset = pd.read_csv(file_name)

        self.dataset.dropna(axis=0,
                            subset=[
                                "Air_quality", "Xylene", "AQI", "Toluene",
                                "Benzene", "O3", "SO2", "CO", "NH3", "NOx",
                                "NO2", "PM10", "PM2.5", "NO"
                            ],
                            how='all',
                            inplace=True)
        self.dataset.dropna(subset=["Air_quality"], inplace=True)
        #         print("asasd")

        self.x = self.dataset.iloc[:, :-1].values
        self.y = self.dataset.iloc[:, 15].values

        #Filling the missing values
        imputer = SimpleImputer(missing_values=np.nan, strategy='median')
        imputer = imputer.fit(self.x[:, 2:15])
        self.x[:, 2:15] = imputer.transform(self.x[:, 2:15])

        #Encoding the attributes
        self.le_X_city = LabelEncoder()
        self.le_X_date = LabelEncoder()
        self.le_Y = LabelEncoder()
        self.y = self.le_Y.fit_transform(self.y)

        self.x[:, 0] = self.le_X_city.fit_transform(self.x[:, 0])
        self.x[:, 1] = self.le_X_date.fit_transform(self.x[:, 1])

        #Splitting the dataset
        self.x_train, self.x_test, self.y_train, self.y_test = train_test_split(
            self.x, self.y, test_size=0.3, random_state=0)

        ax = sns.countplot(self.y_train)
        #plt.bar(['Good','Moderate','Poor','Satisfactory','Severe','Very Poor'], height=, kwargs)
        #plt.hist(self.y_train, color='green')
        #plt.show()
        print('Classes and number of values in trainset',
              Counter(self.y_train))
        from imblearn.over_sampling import SMOTE
        oversample = SMOTE()
        self.x_train, self.y_train = oversample.fit_resample(
            self.x_train, self.y_train)
        print('Classes and number of values in trainset after SMOTE:',
              Counter(self.y_train))
        self.med = np.median(self.x_train, axis=0)

        sns.countplot(self.y)

        #plt.hist(self.y_train, color='green')
        #plt.show()

    def trainRF(self):
        self.RandomForestModel = RandomForestClassifier(n_estimators=100,
                                                        random_state=0)
        self.RandomForestModel.fit(self.x_train, self.y_train)

    def trainXGB(self):
        self.XgbModel = XGBClassifier(random_state=0)
        self.XgbModel.fit(self.x_train, self.y_train)

    def trainSVM(self):
        self.SvmModel = SVC(kernel="rbf", random_state=0)
        self.SvmModel.fit(self.x_train, self.y_train)

    def trainDT(self):
        self.DecisionTreeModel = DecisionTreeClassifier(random_state=0)
        self.DecisionTreeModel.fit(self.x_train, self.y_train)

    def RandomForest(self):
        #RandomForestClassifier Model

        self.y_pred = self.RandomForestModel.predict(self.x_test)

        cm = confusion_matrix(self.y_test, self.y_pred)
        print(cm)

        a = accuracy_score(self.y_test, self.y_pred)
        precision = precision_score(self.y_test, self.y_pred, average='micro')
        recall = recall_score(self.y_test, self.y_pred, average='micro')
        f1 = f1_score(self.y_test, self.y_pred, average='micro')

        return cm, a * 100, precision * 100, recall * 100, f1 * 100

    def XGB(self):
        #XGBClassifier Model

        self.y_pred = self.XgbModel.predict(self.x_test)

        cm = confusion_matrix(self.y_test, self.y_pred)
        print(cm)

        a = accuracy_score(self.y_test, self.y_pred)
        precision = precision_score(self.y_test, self.y_pred, average='micro')
        recall = recall_score(self.y_test, self.y_pred, average='micro')
        f1 = f1_score(self.y_test, self.y_pred, average='micro')
        return cm, a * 100, precision * 100, recall * 100, f1 * 100

    def SVC(self):
        #SVC Model

        self.y_pred = self.SvmModel.predict(self.x_test)

        cm = confusion_matrix(self.y_test, self.y_pred)
        print(cm)

        a = accuracy_score(self.y_test, self.y_pred)
        precision = precision_score(self.y_test,
                                    self.y_pred,
                                    average='weighted')
        recall = recall_score(self.y_test, self.y_pred, average='weighted')
        f1 = f1_score(self.y_test, self.y_pred, average='weighted')
        return cm, a * 100, precision * 100, recall * 100, f1 * 100

    def DecisionTree(self):
        #DecisionTreeClassifier Model

        self.y_pred = self.DecisionTreeModel.predict(self.x_test)

        cm = confusion_matrix(self.y_test, self.y_pred)
        print(cm)

        a = accuracy_score(self.y_test, self.y_pred)
        precision = precision_score(self.y_test, self.y_pred, average='micro')
        recall = recall_score(self.y_test, self.y_pred, average='micro')
        f1 = f1_score(self.y_test, self.y_pred, average='micro')
        return cm, a * 100, precision * 100, recall * 100, f1 * 100

    def predict(self, City, Date, PM25, PM10, NO, NO2, NOx, NH3, CO, SO2, O3,
                Benzene, Toluene, Xylene, AQI):

        res = []
        city = self.le_X_city.transform([City])  # transform, not fit_transform: re-fitting would discard the learned mapping
        date = self.le_X_date.transform([Date])

        if (not PM25):
            PM25val = self.med[2]
        else:
            PM25val = float(PM25)
        if (not PM10):
            PM10val = self.med[3]
        else:
            PM10val = float(PM10)
        if (not NO):
            NOval = self.med[4]
        else:
            NOval = float(NO)
        if (not NO2):
            NO2val = self.med[5]
        else:
            NO2val = float(NO2)
        if (not NOx):
            NOxval = self.med[6]
        else:
            NOxval = float(NOx)
        if (not NH3):
            NH3val = self.med[7]
        else:
            NH3val = float(NH3)
        if (not CO):
            COval = self.med[8]
        else:
            COval = float(CO)
        if (not SO2):
            SO2val = self.med[9]
        else:
            SO2val = float(SO2)
        if (not O3):
            O3val = self.med[10]
        else:
            O3val = float(O3)
        if (not Benzene):
            Benzeneval = self.med[11]
        else:
            Benzeneval = float(Benzene)
        if (not Toluene):
            Tolueneval = self.med[12]
        else:
            Tolueneval = float(Toluene)
        if (not Xylene):
            Xyleneval = self.med[13]
        else:
            Xyleneval = float(Xylene)
        if (not AQI):
            AQIval = self.med[14]
        else:
            AQIval = float(AQI)

        ls = [
            city[0], date[0], PM25val, PM10val, NOval, NO2val, NOxval, NH3val,
            COval, SO2val, O3val, Benzeneval, Tolueneval, Xyleneval, AQIval
        ]
        lst = []
        lst.append(ls)

        temp = self.le_Y.inverse_transform(self.RandomForestModel.predict(lst))
        temp = temp.tolist()
        res.append(temp[0])

        temp = self.le_Y.inverse_transform(self.SvmModel.predict(lst))
        temp = temp.tolist()
        res.append(temp[0])

        temp = self.le_Y.inverse_transform(self.DecisionTreeModel.predict(lst))
        temp = temp.tolist()
        res.append(temp[0])

        #         print(lst)
        ll = np.array(lst).reshape(1, -1)
        #         print(ll)
        temp = self.le_Y.inverse_transform(self.XgbModel.predict(ll))
        temp = temp.tolist()
        res.append(temp[0])

        return res
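A hypothetical driver for the AirQuality class above; the CSV path is a placeholder and the file is assumed to contain the columns referenced in readCsv:

aq = AirQuality()
aq.readCsv("air_quality.csv")              # placeholder path
aq.trainRF(); aq.trainXGB(); aq.trainSVM(); aq.trainDT()
cm, acc, prec, rec, f1 = aq.RandomForest()
print("RandomForest accuracy: %.2f%%" % acc)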
Example #38
0
            xg_train = xgboost.DMatrix(train,
                                       label=train_labels.values.flatten())
            xg_test = xgboost.DMatrix(test)

            watchlist = [(xg_train, 'train')]

            xgclassifier = xgboost.train(params, xg_train, num_round,
                                         watchlist)
            predicted_results = xgclassifier.predict(xg_test)
            mc_pred.append(predicted_results)

        meta_solvers_test.append(
            (np.mean(np.array(mc_pred), axis=0) + 0.5).astype(int))
        """ Write opt solution """
        print('writing to file')
        mc_train_pred = label_encoder.inverse_transform(
            mc_train_pred.astype(int))
        print(meta_solvers_test[-1])
        meta_solvers_test[-1] = label_encoder.inverse_transform(
            meta_solvers_test[-1])
        pd.DataFrame(mc_train_pred).to_csv('results/train_xgboost_d6.csv')
        submission_file['status_group'] = meta_solvers_test[-1]
        submission_file.to_csv("results/test_xgboost_d6.csv")

    # saving best score for printing
    if mc_acc_mean[-1] < best_score:
        print('new best log loss')
        best_score = mc_acc_mean[-1]
        best_params = params
        best_train_prediction = mc_train_pred
        if params['mc_test']:
            best_prediction = meta_solvers_test[-1]
Example #39
0
pred=model.predict(X_test)


# In[12]:

'''evaluate predictions'''
accuracy = accuracy_score(y_test, pred)
print("Accuracy: %.2f%%" % (accuracy * 100.0))


# In[13]:

'''select other data attributes after prediction'''

recommendation=pd.DataFrame({'user':X_test['user'],'bookName':le2.inverse_transform(X_test['bookName']),'impression': pred })


# In[14]:

'''convert prediction column real responses'''

recommendation['impression'].replace( 1 ,'dislike',inplace=True)
recommendation['impression'].replace(2,'like',inplace=True)
recommendation['impression'].replace(3,'view',inplace=True)
recommendation['impression'].replace(4,'interact',inplace=True)
recommendation['impression'].replace(5,'add to cart',inplace=True)
recommendation['impression'].replace(6,'checkout',inplace=True)
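The six replace calls above can be collapsed into a single mapping dict; a sketch using the same codes:

impression_map = {1: 'dislike', 2: 'like', 3: 'view', 4: 'interact',
                  5: 'add to cart', 6: 'checkout'}
recommendation['impression'] = recommendation['impression'].map(impression_map)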


# In[15]:
Example #40
0
in_encoder = Normalizer(norm='l2')
encodings = in_encoder.transform(encodings)
# label encode targets
out_encoder = LabelEncoder()
out_encoder.fit(names)
names = out_encoder.transform(names)

# Create and train the SVC classifier
clf = svm.SVC(gamma='scale', probability=True)
#clf = svm.SVC(kernel='linear', probability=True)
clf.fit(encodings, names)

# Load the test image with unknown faces into a numpy array
test_image = face_recognition.load_image_file('test/test.jpg')

# Find all the faces in the test image using the default HOG-based model
face_locations = face_recognition.face_locations(test_image)
no = len(face_locations)
print("Number of faces detected: ", no)

# Predict all the faces in the test image using the trained classifier
print("Found:")
for i in range(no):
    test_image_enc = face_recognition.face_encodings(test_image)[i]
    test_image_enc = in_encoder.transform([test_image_enc])
    name = clf.predict(test_image_enc)
    prob = clf.predict_proba(test_image_enc)
    print(prob)
    acc = prob[0, name[0]]
    name = out_encoder.inverse_transform(name)
    print(*name, acc)
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import numpy as np
import pandas as pd

items = ['TV', '냉장고', '전자레인지', '컴퓨터', '선풍기', '선풍기', '믹서', '믹서']

# Create a LabelEncoder object, then perform label encoding with fit() and transform()
encoder = LabelEncoder()
encoder.fit(items)
labels = encoder.transform(items)

# vector values
labels = labels.reshape(-1, 1)
print('Encoded values: ', labels)
# Encoding
print('Encoder classes: ', encoder.classes_)
# Decoding
print('Decoded original values', encoder.inverse_transform([4, 5, 2, 0, 1, 1, 3, 3]))

oh_encoder = OneHotEncoder()
oh_encoder.fit(labels)
oh_labels = oh_encoder.transform(labels)
print('One-Hot encoding Data')
print(oh_labels.toarray())
print('One-Hot encoding Shape')
print(oh_labels.shape)

df = pd.DataFrame({'item':items})
# One-Hot Encoder API -> get_dummies()
pd.get_dummies(df)
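Since scikit-learn 0.20, OneHotEncoder also accepts string categories directly, so the intermediate LabelEncoder step can be skipped; a sketch (on versions older than 1.2, use sparse=False instead of sparse_output=False):

oh = OneHotEncoder(sparse_output=False)
print(oh.fit_transform(np.array(items).reshape(-1, 1)))
print(oh.categories_)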
Example #42
0
CNN_Model.add(SeqSelfAttention(attention_width=8, attention_activation='sigmoid', name='Attention',))

CNN_Model.add(SpatialDropout1D(0.3))
CNN_Model.add(layers.Conv1D(512, 3, activation='relu'))


CNN_Model.add(layers.GlobalMaxPooling1D())


CNN_Model.add(layers.Dense(3, activation='softmax'))

CNN_Model.compile(loss='categorical_crossentropy',optimizer='adam', metrics=['accuracy'])

CNN_Model.summary()


#==============================================================================
# Evaluate model and print results
#==============================================================================

CNN_History=CNN_Model.fit(x_train, y_train, epochs = 5, batch_size = 256,verbose=1, validation_data=(x_val,y_val), shuffle=True)

plot_history(CNN_History)

full_multiclass_report(CNN_Model, x_val, y_val, encoder.inverse_transform(np.arange(3)))




# mapping ordinal features
size_mapping = {'XL': 3, 'L': 2,  'M': 1}
df['size'] = df['size'].map(size_mapping)
print(df)

class_mapping = {label: idx for idx, label in enumerate(np.unique(df['classLabel']))}
print(class_mapping)

df['classLabel'] = df['classLabel'].map(class_mapping)
print(df)

inv_class_mapping = {v: k for k, v in class_mapping.items()}
df['classLabel'] = df['classLabel'].map(inv_class_mapping)
print(df)

class_encoder = LabelEncoder()
y = class_encoder.fit_transform(df['classLabel'].values)
print(y)
print(class_encoder.inverse_transform(y))

x = df [['color', 'size', 'price']].values
class_encoder = LabelEncoder()
x[:, 0] = class_encoder.fit_transform(x[:, 0])
print(x)

# one-hot encoding
one_encoder = OneHotEncoder(categorical_features=[0])
print(one_encoder.fit_transform(x).toarray())
# this one-hot is more readable
print(pd.get_dummies(df[['price', 'color', 'size']]))
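The categorical_features argument was removed from OneHotEncoder in newer scikit-learn releases; a ColumnTransformer sketch that one-hot encodes only the color column (index 0) and passes the remaining columns through:

from sklearn.compose import ColumnTransformer
ct = ColumnTransformer([('color_ohe', OneHotEncoder(), [0])], remainder='passthrough')
print(ct.fit_transform(x))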
Example #44
0
def main():

    columns = [
        'AU01_r', 'AU02_r', 'AU04_r', 'AU05_r', 'AU06_r', 'AU07_r', 'AU09_r',
        'AU10_r', 'AU12_r', 'AU14_r', 'AU15_r', 'AU17_r', 'AU23_r', 'AU25_r',
        'AU26_r', 'AU45_r', 'culture', 'emotion'
    ]
    df = pd.read_csv("../old_data/videos_relabelled.csv")

    #extracting total set for training and testing sets
    # training and testing on only the North America and Philippines cultures
    df = df[(df['culture'] == 'North America') |
            (df['culture'] == 'Philippines')]
    # training and testing on only NA and Philippines culture
    #df = df[(df['culture']  == 'North America') | (df['culture']  == 'Philippines')]

    #df = df[df['culture'] == 'Persian'] #training and testing on only persian culture
    #df = df[df['culture'] == 'Philippines'] #training and testing on only philipines culture
    #df = df[df['culture'] == 'North America'] #training and testing on only NA culture

    #df['culture_code'] = df['culture'].astype('category').cat.codes

    ############# testing model by selecting specific videos to test so components of video are not in training set ###############
    validation_array = []
    test_array = []
    vf_score = []
    tf_score = []
    kfold = KFold(n_splits=5, shuffle=True, random_state=1)
    videos = df['filename'].unique()
    print(videos)
    ##  Roya: Add label encoder to convert labels to integer
    le = LabelEncoder()
    #this part is for extracting what to test on
    #this is testing set for NA culture
    #test_df = df[(df['filename'] == 'contempt_38') | (df['filename'] == 'contempt_39') | (df['filename'] == 'anger_26') | (df['filename'] == 'anger_27') | (df['filename'] == 'disgust_20') | (df['filename'] == 'disgust_21')  ]

    #this is testing set for Persian culture
    # test_df = df[(df['filename'] == '40') | (df['filename'] == '42') | (df['filename'] == '77') | (df['filename'] == '36') | (df['filename'] == '38') | (df['filename'] == '41')  ]

    #this is testing set for Filipino culture
    #test_df = df[(df['filename'] == 'contempt_25_p') | (df['filename'] == 'contempt_18_p') | (df['filename'] == 'anger_17_p') | (df['filename'] == 'anger_6_p') | (df['filename'] == 'disgust_7_p') | (df['filename'] == 'disgust_8_p')  ]

    #this is for testing on all of Filipino culture
    #test_df = df[df['culture'] == 'Philippines']

    #this is for testing on all of Persian culture
    ## Roya: add a test dataframe for displaying results
    test_df = df[df['culture'] == 'Philippines']

    test_videos = test_df['filename'].unique()
    df = df[~df['filename'].isin(list(test_videos))]
    videos = np.array(list(set(videos) - set(test_videos)))
    splits = kfold.split(videos)
    test_df_copy = test_df.drop([
        'frame', 'face_id', 'culture', 'filename', 'emotion', 'confidence',
        'success'
    ],
                                axis=1)
    for (i, (train, test)) in enumerate(splits):
        print('%d-th split: train: %d, test: %d' %
              (i + 1, len(videos[train]), len(videos[test])))
        train_df = df[df['filename'].isin(videos[train])]
        # test_df = df[df['filename'].isin(videos[test])]
        y = train_df['emotion'].values
        X = train_df.drop(columns=[
            'success', 'confidence', 'face_id', 'frame', 'emotion', 'culture',
            'filename', 'talking', 'gender'
        ]).values
        ## Change labels to int using a label encoder
        Y = le.fit_transform(y)

        X_train, X_valid, y_train, y_valid = train_test_split(X, Y)

        #print(X_train)
        print('LABEL ENCODER CLASSES: ', le.classes_)
        clf, score, fscore = create_svm(X_train, X_valid, y_train, y_valid)
        validation_array.append(score)
        vf_score.append(fscore)
        #cv_scores = cross_validate(clf, X, y, cv = 10)
        #print(cv_scores)
        # print(test_df[['frame','filename','culture','emotion']].head())

        int_test = test_df.drop(columns=[
            'success', 'confidence', 'face_id', 'frame', 'emotion', 'culture',
            'filename', 'talking', 'gender'
        ]).values
        # print(len(int_test))
        ## Roya: change string labels to integer values
        int_predict = le.fit_transform(test_df['emotion'].values)
        # print(len(int_predict))
        # predictions = clf.predict(int_test)
        predictions = clf.predict(int_test)  #integers predicted
        ## Roya: change integer labels to string values
        test_df['predicted'] = le.inverse_transform(predictions)
        ## Roya: calculate confusion matrix
        cf_matrix = confusion_matrix(test_df['emotion'].values,
                                     test_df['predicted'].values)
        print('CONFUSION MATRIX:\n', cf_matrix)

        df_cm = pd.DataFrame(cf_matrix,
                             index=le.inverse_transform([0, 1, 2]),
                             columns=le.inverse_transform([0, 1, 2]))
        df_cm = df_cm.div(df_cm.sum(axis=1), axis=0)
        ## Plot Confusion matrix
        plt.figure(figsize=(9, 6))
        sn.heatmap(df_cm, annot=True, fmt='.0%')
        plt.ylabel('True label')
        plt.xlabel('Predicted label')
        plt.show()
        # print("predictions: ", predictions[0:10])
        # print("int_predict: ", int_predict[0:10])
        print(accuracy_score(int_predict, predictions))
        fscore = f1_score(le.fit_transform(int_predict),
                          predictions,
                          average='macro')
        test_df.drop(columns=['predicted'], inplace=True)
        print('\n')

        test_array.append(accuracy_score(int_predict, predictions))
        tf_score.append(fscore)

    print("Average accuracy for all Folds on valid dataset: " +
          str(np.mean(validation_array)))

    print("Average accuracy for all Folds on test dataset: " +
          str(np.mean(test_array)))

    print("Average f-score for all Folds on valid dataset: " +
          str(np.mean(vf_score)))

    print("Average f-score for all Folds on test dataset: " +
          str(np.mean(tf_score)))
Example #45
0
class BuildingAdapterInterface(Inferencer):

    def __init__(self,
                 target_building,
                 target_srcids,
                 source_buildings,
                 pgid=pgid,
                 config={},
                 load_from_file=1
                 ):
        super(BuildingAdapterInterface, self).__init__(
            target_building=target_building,
            source_buildings=source_buildings,
            target_srcids=target_srcids,
            pgid=pgid,
        )

        #gather the source/target data and name features, labels
        #TODO: handle multiple source buildings
        self.stop_predict_flag = False

        if 'source_time_ranges' in config:
            self.source_time_ranges = config['source_time_ranges']
            assert len(self.source_time_ranges) == len(source_buildings)
        else:
            self.source_time_ranges = [(None, None)]\
                * len(source_buildings)
        if 'target_time_range' in config:
            self.target_time_range = config['target_time_range']
        else:
            self.target_time_range = (None, None)

        if 'threshold' in config:
            self.threshold = config['threshold']
        else:
            self.threshold = 0.5

        source_building = source_buildings[0]

        if not load_from_file:
            #data features
            source_ids, train_fd = get_data_features(source_building,
                                                     self.source_time_ranges[0][0],
                                                     self.source_time_ranges[0][1],
                                                     pgid=self.pgid,
                                                     )
            target_ids, test_fd = get_data_features(target_building,
                                                    self.target_time_range[0],
                                                    self.target_time_range[1],
                                                    pgid=self.pgid,
                                                    )

            #name features, labels
            source_res = get_namefeatures_labels(source_building, pgid=self.pgid)
            train_label = [source_res[srcid][1] for srcid in source_ids]

            self.target_res = get_namefeatures_labels(target_building, pgid=self.pgid)
            test_fn = np.asarray( [self.target_res[tgtid][0] for tgtid in target_ids] )
            test_label = [self.target_res[tgtid][1] for tgtid in target_ids]

            #find the label intersection
            intersect = list( set(test_label) & set(train_label) )
            print ('intersected tagsets:', intersect)

            #preserve the intersection, get ids for indexing data feature matrices
            if intersect:
                train_filtered = [[i,j] for i,j in enumerate(train_label) if j in intersect]
                train_id, train_label = [list(x) for x in zip(*train_filtered)]
                test_filtered = [[i,j,k] for i,(j,k) in enumerate(zip(test_label,target_ids)) if j in intersect]
                self.test_id, test_label, self.test_srcids = [list(x) for x in zip(*test_filtered)]
            else:
                raise ValueError('no common labels!')

            self.train_fd = train_fd[train_id, :]
            self.test_fd = test_fd[self.test_id, :]
            self.test_fn = test_fn[self.test_id, :]

            print ('%d training examples left'%len(self.train_fd))
            print ('%d testing examples left'%len(self.test_fd))

            self.le = LE()
            self.le.fit(intersect)
            self.train_label = self.le.transform(train_label)
            self.test_label = self.le.transform(test_label)

            res = [self.train_fd, self.test_fd, self.train_label, self.test_label, self.test_fn, self.test_srcids, self.target_res, self.le]
            with open('./%s-%s.pkl'%(source_building,target_building), 'wb') as wf:
                pk.dump(res, wf)

        else:
            print ('loading from prestored file')
            with open('./%s-%s.pkl'%(source_building,target_building), 'rb') as rf:
                res = pk.load(rf)
            self.train_fd, self.test_fd, self.train_label, self.test_label, self.test_fn, self.test_srcids, self.target_res, self.le = \
            res[0], res[1], res[2], res[3], res[4], res[5], res[6], res[7]


        print ( '# of classes:', len(set(self.train_label)) )
        print ( 'data features for %s with dim:'%source_building, self.train_fd.shape)
        print ( 'data features for %s with dim:'%target_building, self.test_fd.shape)


        self.learner = transfer_learning(
            self.train_fd,
            self.test_fd,
            self.train_label,
            self.test_label,
            self.test_fn,
            threshold = self.threshold
        )

        self.run_auto()


    def predict(self, target_srcids, verbose=False):
        '''
        return: tagset, srcid, and confidence of each labeled example
        '''
        if self.stop_predict_flag:
            self.pred_g = self.new_graph(empty=True)
            self.prior_confidences = {}
            return self.pred_g

        preds, labeled_set, confidence = self.learner.predict()
        srcids = [self.test_srcids[i] for i in labeled_set]
        tagsets = list(self.le.inverse_transform(preds))
        names = [self.target_res[i][-1] for i in srcids]

        if verbose:
            for i,j,k,l in zip(srcids, names, tagsets, confidence):
                print ('srcid %s with name %s got label %s with s %.4f'%(i,j,k,l))

        self.stop_predict_flag = True
        self.pred_g = self.new_graph(empty=True)

        acc_with_high_conf = 0
        cnt_with_high_conf = 0
        for srcid, tagset, prob in zip(srcids, tagsets, confidence):
            self._add_pred_point_result(self.pred_g, srcid, tagset, prob)

        #return srcids, tagsets, confidence
        return self.pred_g

    def run_auto(self):
        self.learner.run_auto()

    def select_informative_samples(self, sample_num):
        super(BuildingAdapterInterface, self)\
            .select_informative_samples(sample_num)
        return []
Example #46
0
test_file = "./data/test.csv"

train_df = pd.read_csv(train_file)
test_df = pd.read_csv(test_file)

# This tells us which columns have null values
train_df.isnull().any(axis=0)

# Only 2 null Embarked values... let's drop it
train_df['Embarked'].isnull().sum()
train_df = train_df[train_df['Embarked'].notnull()]

le = LabelEncoder()
le.fit(['A', 'B', 'C', 'D'])

train_df['Pclass'] = le.inverse_transform(train_df['Pclass'])  # maps Pclass 1/2/3 to 'B'/'C'/'D' given the fit on ['A', 'B', 'C', 'D']

# Fill in missing values for age using linear interpolation
train_df['Age'] = train_df['Age'].interpolate()

predictor_columns = [
    'Pclass', 'Sex', 'SibSp', 'Parch', 'Fare', 'Embarked', 'Age'
]
#predictor_columns = ['Pclass', 'Sex', 'SibSp', 'Parch', 'Embarked', 'Age']
label_column = 'Survived'

X = pd.get_dummies(train_df[predictor_columns])
y = train_df[label_column]

X_train, X_test, y_train, y_test = train_test_split(X, y)
Example #47
0
#<<<
#building sparse matrix on test data
X_test = CV_test.fit_transform(test_attr_list).toarray()
#
#>>>Predicting the Y values in the test data, model building
RFC = RandomForestClassifier(n_estimators=300,
                             criterion='entropy',
                             random_state=0)
RFC.fit(X_train, Y_train)
Y_test = RFC.predict(X_test)
#<<<
#Finding Accuracies
kfold = KFold(n_splits=10)  # random_state removed: it has no effect (and errors in newer scikit-learn) unless shuffle=True
test_accuracies = cross_val_score(estimator=RFC, X=X_test, y=Y_test, cv=kfold)
print("Test Accuracies = ", test_accuracies)
print("Test Data Accuracies SD = ", test_accuracies.std())
print("Test Data Accuracies mean = ", test_accuracies.mean())
#
#>>> Converting numeric predictions back to labels
Y_test = labelencoder.inverse_transform(Y_test)
#
#>>> adding the predicted values to the test data as label column
Y_test = np.reshape(Y_test, (len(Y_test), 1))
test_data1 = np.append(arr=test_data, values=Y_test, axis=1)
test_data1 = pd.DataFrame(test_data1,
                          columns=["id", "additionalAttributes", "labels"])
#<<<
#>>> Writing the labeled test set to the submissions file
with open('submissions.csv', 'w') as outfile:
    test_data1.to_csv(outfile, index=False)
#<<<
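One caveat in the block above: fitting a separate vectorizer on the test set (CV_test.fit_transform) generally produces a feature space that does not match the one RFC was trained on. The usual pattern is to reuse the vectorizer fitted on the training data; a sketch, where CV_train is a hypothetical name for that object:

X_test = CV_train.transform(test_attr_list).toarray()   # same vocabulary/columns as X_train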
class AdaBoostClassifier(Component):
    """Text classifier using the sklearn framework"""

    name = "AdaBoost_Classifier"

    provides = ["classifylabel", "classifylabel_ranking"]

    requires = ["sentence_embedding"]

    def __init__(self, config=None, clf=None, le=None):
        # type: (sklearn.model_selection.GridSearchCV, sklearn.preprocessing.LabelEncoder) -> None
        """Construct a new classifier using the sklearn framework."""
        from sklearn.preprocessing import LabelEncoder

        if le is not None:
            self.le = le
        else:
            self.le = LabelEncoder()
        self.clf = clf

    @classmethod
    def required_packages(cls):
        # type: () -> List[Text]
        return ["numpy", "sklearn"]

    def transform_labels_str2num(self, labels):
        # type: (List[Text]) -> np.ndarray
        """Transforms a list of strings into numeric label representation.
        :param labels: List of labels to convert to numeric representation"""

        return self.le.fit_transform(labels)

    def transform_labels_num2str(self, y):
        # type: (np.ndarray) -> np.ndarray
        """Transforms a list of strings into numeric label representation.
        :param y: List of labels to convert to numeric representation"""

        return self.le.inverse_transform(y)

    def train(self, training_data, config, **kwargs):
        # type: (TrainingData, RasaNLUConfig, **Any) -> None
        """Train the classifier on a data set.
        :param num_threads: number of threads used during training time"""
        from sklearn.model_selection import GridSearchCV
        from sklearn.ensemble import AdaBoostClassifier
        labels = [e.get("label") for e in training_data.classify_examples]
        if len(set(labels)) < 2:
            logger.warning(
                "Can not train an classifier. Need at least 2 different classes. "
                + "Skipping training of classifier.")
        else:
            y = self.transform_labels_str2num(labels)
            # TODO fix it, in future sentence will replaced by "features"
            X = np.stack([
                example.get("sentence_embedding")
                for example in training_data.classify_examples
            ])
            self.clf = AdaBoostClassifier()
            #            sklearn_config = config.get("classifier_sklearn")
            #            C = sklearn_config.get("C", [1, 2, 5, 10, 20, 100])
            #            kernel = sklearn_config.get("kernel", "linear")
            #            # dirty str fix because sklearn is expecting str not instance of basestr...
            #            tuned_parameters = [{"C": C, "kernel": [str(kernel)]}]
            #            cv_splits = max(2, min(MAX_CV_FOLDS, np.min(np.bincount(y)) // 5))  # aim for 5 examples in each fold
            #
            #            self.clf = GridSearchCV(SVC(C=1, probability=True, class_weight='balanced'),
            #                                    param_grid=tuned_parameters, n_jobs=config["num_threads"],
            #                                    cv=cv_splits, scoring='f1_weighted', verbose=1)
            self.clf.fit(X, y)

    def process(self, message, **kwargs):
        # type: (Message, **Any) -> None
        """Returns the most likely label and its probability for the input text."""

        if not self.clf:
            # component is either not trained or didn't receive enough training data
            label = None
            label_ranking = []
        else:
            X = message.get("sentence_embedding").reshape(1, -1)
            label_ids, probabilities = self.predict(X)
            labels = self.transform_labels_num2str(label_ids)
            # `predict` returns a matrix as it is supposed
            # to work for multiple examples as well, hence we need to flatten
            labels, probabilities = labels.flatten(), probabilities.flatten()

            if labels.size > 0 and probabilities.size > 0:
                ranking = list(
                    zip(list(labels),
                        list(probabilities)))[:CLASSIFY_RANKING_LENGTH]
                label = {"name": labels[0], "confidence": probabilities[0]}
                label_ranking = [{
                    "name": label_name,
                    "confidence": score
                } for label_name, score in ranking]
            else:
                label = {"name": None, "confidence": 0.0}
                label_ranking = []

        message.set("classifylabel", label, add_to_output=True)
        message.set("classifylabel_ranking", label_ranking, add_to_output=True)

    def predict_prob(self, X):
        # type: (np.ndarray) -> np.ndarray
        """Given a bow vector of an input text, predict the classify label. Returns probabilities for all labels.
        :param X: bow of input text
        :return: vector of probabilities containing one entry for each label"""

        return self.clf.predict_proba(X)

    def predict(self, X):
        # type: (np.ndarray) -> Tuple[np.ndarray, np.ndarray]
        """Given a bow vector of an input text, predict most probable label. Returns only the most likely label.
        :param X: bow of input text
        :return: tuple of first, the most probable label and second, its probability"""

        pred_result = self.predict_prob(X)
        # sort the probabilities retrieving the indices of the elements in sorted order
        sorted_indices = np.fliplr(np.argsort(pred_result, axis=1))
        return sorted_indices, pred_result[:, sorted_indices]

    @classmethod
    def load(cls,
             model_dir=None,
             model_metadata=None,
             cached_component=None,
             **kwargs):
        # type: (Text, Metadata, Optional[Component], **Any) -> AdaBoostClassifier
        import cloudpickle

        if model_dir and model_metadata.get("classifier_sklearn"):
            classifier_file = os.path.join(
                model_dir, model_metadata.get("classifier_sklearn"))
            with io.open(classifier_file, 'rb') as f:  # pragma: no test
                return cloudpickle.load(f, encoding="latin-1")
        else:
            return cls()

    def persist(self, model_dir):
        # type: (Text) -> Dict[Text, Any]
        """Persist this model into the passed directory. Returns the metadata necessary to load the model again."""

        import cloudpickle

        classifier_file = os.path.join(model_dir, "label_classifier.pkl")
        with io.open(classifier_file, 'wb') as f:
            cloudpickle.dump(self, f)

        return {"classifier_sklearn": "label_classifier.pkl"}
Example #49
0
    "pm/text-ml-classification/scripts/result_non_split.pkl")
trainDf = pd.read_pickle(
    "/home/zhen.di/pm/text-ml-classification/scripts/trainDf.pkl")

# split data into training data and testing data
X_train, X_test, y_train, y_test = train_test_split(trainDf,
                                                    result.Class,
                                                    test_size=0.2,
                                                    random_state=5,
                                                    stratify=result.Class)

# encode labels
le = LabelEncoder()
y_train = le.fit_transform(y_train)  # int64
y_test = le.transform(y_test)
encoded_test_y = np_utils.to_categorical((le.inverse_transform(y_test)))


def plot_confusion_matrix(clf, test_y, predict_y):
    """ Give confusion matrix based on testing data """

    C = confusion_matrix(test_y, predict_y)

    labels = le.classes_
    fig = plt.figure(figsize=(10, 8))
    sns.heatmap(C,
                annot=True,
                cmap="Blues",
                fmt=".0f",
                xticklabels=labels,
                yticklabels=labels)
X_train = embedded[train_idx]
# 50 test examples of 10 identities (5 examples each)
X_test = embedded[test_idx]

y_train = y[train_idx]
y_test = y[test_idx]

knn = KNeighborsClassifier(n_neighbors=1, metric='euclidean')
svc = LinearSVC()

knn.fit(X_train, y_train)
svc.fit(X_train, y_train)

acc_knn = accuracy_score(y_test, knn.predict(X_test))
acc_svc = accuracy_score(y_test, svc.predict(X_test))

print('KNN accuracy = {}, SVM accuracy = {}'.format(acc_knn,acc_svc))

import warnings
# Suppress LabelEncoder warning
warnings.filterwarnings('ignore')

example_idx = 15

example_image = load_image(metadata[test_idx][example_idx].image_path())
example_prediction = svc.predict([embedded[test_idx][example_idx]])
example_identity = encoder.inverse_transform(example_prediction)[0]

plt.imshow(example_image)
plt.title('Recognized as {}'.format(example_identity))
plt.show()
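A small extension of the example above that decodes every SVC prediction on the test split back to an identity name, reusing the fitted encoder:

y_pred = svc.predict(X_test)
print(encoder.inverse_transform(y_pred)[:10])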
Example #51
0
class MLPAgent():
    def __init__(self,
                 hidden_layer_sizes=(100,),
                 activation="relu",
                 solver="lbfgs",  # faster and better than adam for small data http://scikit-learn.org/stable/modules/neural_networks_supervised.html#tips-on-practical-use
                 max_iter=200,
                 verbose=True,
                 early_stopping=False,
                 ngram_range=(1, 1),
                 max_features=None):
        self.vect = TfidfVectorizer(
            tokenizer=word_tokenize,
            ngram_range=ngram_range,
            max_features=max_features)
        self.mlp = MLPClassifier(
            hidden_layer_sizes=hidden_layer_sizes,
            activation=activation,
            solver=solver,
            max_iter=max_iter,
            #early_stopping=early_stopping,
            verbose=verbose)
        self.enc = LabelEncoder()



    def __str__(self):
        return "{}-layer MLP-Agent {}-grams".format(self.mlp.hidden_layer_sizes,
                                                self.vect.ngram_range)
            #"{:d}-NN Agent {}".format(self.get_params())

    def get_params(self):
        """Get parameters< for this estimator.
        -------
        params : plain flushed dict of parameters
        """

        # dict = {}
        # for object_name, values in self.__dict__:
        #     dict[object_name]


        return self.__dict__

    def fit(self, X, y):
        """
        X : question
        y : utterances
        >>> x = 3
        >>> x == 3
        True
        """
        #X,y = X[:5000], y[:5000]
        print("classifier.fit: X.shape", X.shape,"y.shape", y.shape)

        # learns vectorizer on utterances and answers
        self.vect.fit(np.append(X, y))
        # transforms the inputs to vectorized form
        questions_vec = self.vect.transform(X)
        #answers_vec = self.vect.transform(y)
        #TODO: replace answer values with labels

        answers_vec = self.enc.fit_transform(y)

        self.mlp.fit(questions_vec,answers_vec)
        print("Fitting MLP")

        print('n_samples', X.shape[0])
        print('vocabulary size', len(self.vect.vocabulary_))
        print('targets', answers_vec.shape)

    def predict(self, question):
        vector_question = self.vect.transform([question])
        p = self.mlp.predict_proba(vector_question)[0]  # only one question
        # [p_0, p_1, p_2]
        labels = p.nonzero()[0]
        ind = np.argsort(p[labels])[::-1]
        #print("p ",p," labels ",labels, " ind ",ind)
        # use cluster_ind -> medoid mapping
        # inverse transform medoid
        return (self.enc.inverse_transform(labels[ind]), p[labels[ind]])
Example #52
0
paramCheck = '(' + str(MAX_DEPTH) + ',' + str(ETA) + ',' + str(
    num_round) + ',' + str(SUB_SAMPLE) + ',' + str(COL_SAMPLE) + ')'
timeStr = str(time.strftime('%Y-%m-%d %H%M%S', time.localtime()))
bst.save_model(MODEL_FOLDER + 'model' + timeStr + str(cvndcg) + paramCheck +
               str(param['seed']) + '.model')
bst.dump_model(MODEL_FOLDER + timeStr + "dump.raw.txt")
fscore = bst.get_fscore()
sorted_fscore = sorted(fscore.items(),
                       key=operator.itemgetter(1),
                       reverse=True)
print(fscore)
print(sorted_fscore)
ypred = bst.predict(dtest)
# print ypred[:5]
# Taking the 5 classes with highest probabilities
ids = []  # list of ids
cts = []  # list of countries
for i in range(len(ypred)):
    idx = idSave[i]
    ids += [idx] * 5
    cts += le.inverse_transform(np.argsort(ypred[i])[::-1])[:5].tolist()

# Generate submission
sub = pd.DataFrame(np.column_stack((ids, cts)), columns=['id', 'country'])
stTimeStr = '(' + str(startTime)[11:19] + 'start)'
timeSpend = (dt.now() - startTime)

print(timeSpend, stTimeStr)
sub.to_csv(OUT_FOLDER + 'sub' + timeStr + paramCheck + '.csv', index=False)
class flask_serving_classifier(Component):
    """Intent classifier using the sklearn framework"""

    name = "flask_serving_classifier"

    provides = ["intent", "intent_ranking"]

    requires = ["text_features"]



    def __init__(self,
                 component_config=None,  # type: Dict[Text, Any]
                 clf=None,  # type: sklearn.model_selection.GridSearchCV
                 le=None  # type: sklearn.preprocessing.LabelEncoder
                 ):
        # type: (...) -> None
        """Construct a new intent classifier using the sklearn framework."""
        from sklearn.preprocessing import LabelEncoder

        super(flask_serving_classifier, self).__init__(component_config)

        if le is not None:
            self.le = le
        else:
            self.le = LabelEncoder()
        self.clf = clf

        _sklearn_numpy_warning_fix()

    @classmethod
    def required_packages(cls):
        # type: () -> List[Text]
        return ["sklearn"]

    def transform_labels_str2num(self, labels):
        # type: (List[Text]) -> np.ndarray
        """Transforms a list of strings into numeric label representation.

        :param labels: List of labels to convert to numeric representation"""

        return self.le.fit_transform(labels)

    def transform_labels_num2str(self, y):
        # type: (np.ndarray) -> np.ndarray
        """Transforms a list of strings into numeric label representation.

        :param y: List of labels to convert to numeric representation"""

        return self.le.inverse_transform(y)

    def train(self, training_data, cfg, **kwargs):
        # type: (TrainingData, RasaNLUModelConfig, **Any) -> None
        """Train the intent classifier on a data set."""
        logger.warn("ED CLASSIFIER TRAIN")
        num_threads = kwargs.get("num_threads", 1)

        labels = [e.get("intent")
                  for e in training_data.intent_examples]

        if len(set(labels)) < 2:
            logger.warn("Can not train an intent classifier. "
                        "Need at least 2 different classes. "
                        "Skipping training of intent classifier.")
        else:
            y = self.transform_labels_str2num(labels).tolist()
#             X = np.stack([example.get("text_features")
#                           for example in training_data.intent_examples])

#             attrs = vars(training_data.intent_examples[0])
#             print(', '.join("%s: %s" % item for item in attrs.items()))
#             print('ED TRAIN DATA:', training_data.intent_examples[0])

            X = [i.text for i in training_data.intent_examples]

            categories = [i for i in set(y)]
            model_name = 'datetime'
            host = '172.17.0.5'
            port = 9000
            url = f'http://{host}:{port}/train'
            data = {'text': X, 'labels': y, 'unique_labels': categories}
            print('ED DATA', data)
            tr = requests.put(url, json=data)  ###train
            print(tr.json())
            self.clf = model_name

            # self.clf = self._create_classifier(num_threads, y)

            # self.clf.fit(X, y)


    def process(self, message, **kwargs):
        # type: (Message, **Any) -> None
        """Return the most likely intent and its probability for a message."""
        logger.warn("ED CLASSIFIER PROCESS MESSAGE:")
        if not self.clf:
            # component is either not trained or didn't
            # receive enough training data
            intent = None
            intent_ranking = []
        else:
            print('ED message', message)
            #
            # attrs = vars(message)
            # print(', '.join("%s: %s" % item for item in attrs.items()))

#             X = message.get("text_features").reshape(1, -1)

            X = message.text
#             X = message.get('text)
#             X = message.data.text
            intent_ids, probabilities = self.predict(X)


            intents = self.transform_labels_num2str(np.ravel(intent_ids))
            # `predict` returns a matrix as it is supposed
            # to work for multiple examples as well, hence we need to flatten
            probabilities = probabilities.flatten()

            if intents.size > 0 and probabilities.size > 0:
                ranking = list(zip(list(intents),
                                   list(probabilities)))[:INTENT_RANKING_LENGTH]

                intent = {"name": intents[0], "confidence": probabilities[0]}

                intent_ranking = [{"name": intent_name, "confidence": score}
                                  for intent_name, score in ranking]
            else:
                intent = {"name": None, "confidence": 0.0}
                intent_ranking = []

        message.set("intent", intent, add_to_output=True)
        message.set("intent_ranking", intent_ranking, add_to_output=True)

    def predict_prob(self, X):
        # type: (np.ndarray) -> np.ndarray
        """Given a bow vector of an input text, predict the intent label.

        Return probabilities for all labels.

        :param X: bow of input text
        :return: vector of probabilities containing one entry for each label"""
        data = {'text': X,'labels':[], 'unique_labels':[]}
        host = '172.17.0.5'
        port = 9000
        url = f'http://{host}:{port}/predict'
        pred = requests.post(url, json=data)
        out = np.array(pred.json()['prediction'])
        return out

    def predict(self, X):
        # type: (Text) -> Tuple[np.ndarray, np.ndarray]
        """Given the raw text of an input message, predict the most probable label.

        Return only the most likely label.

        :param X: raw text of the input message
        :return: tuple of first, the most probable label and second,
                 its probability."""

        pred_result = self.predict_prob(X)
        # sort the probabilities retrieving the indices of
        # the elements in sorted order
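        # e.g. pred_result = [[0.1, 0.7, 0.2]] -> np.argsort -> [[0, 2, 1]]
        #      -> np.fliplr -> [[1, 2, 0]], i.e. most probable label id first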
        sorted_indices = np.fliplr(np.argsort(pred_result, axis=1))
        return sorted_indices, pred_result[:, sorted_indices]

    @classmethod
    def load(cls,
             model_dir=None,  # type: Optional[Text]
             model_metadata=None,  # type: Optional[Metadata]
             cached_component=None,  # type: Optional[Component]
             **kwargs  # type: **Any
             ):
        # type: (...) -> SklearnIntentClassifier

        meta = model_metadata.for_component(cls.name)
        file_name = meta.get("classifier_file", SKLEARN_MODEL_FILE_NAME)
        classifier_file = os.path.join(model_dir, file_name)

        if os.path.exists(classifier_file):
            return utils.pycloud_unpickle(classifier_file)
        else:
            return cls(meta)

    def persist(self, model_dir):
        # type: (Text) -> Optional[Dict[Text, Any]]
        """Persist this model into the passed directory."""

        classifier_file = os.path.join(model_dir, SKLEARN_MODEL_FILE_NAME)
        utils.pycloud_pickle(classifier_file, self)
        return {"classifier_file": SKLEARN_MODEL_FILE_NAME}
예제 #54
0
import cv2
import pickle
import argparse
import numpy as np
from sklearn.preprocessing import LabelEncoder

ap = argparse.ArgumentParser()
ap.add_argument("-s", "--saved", required=True, help="Path of saved model")
ap.add_argument("-f", "--flower", required=True, help="Path of image")
ap.add_argument("-m", "--mask", required=True, help="Path of mask")
args = vars(ap.parse_args())

model = pickle.load(open(args["saved"], 'rb'))
flower = cv2.imread(args["flower"])
mask = cv2.imread(args["mask"])
gray_mask = cv2.cvtColor(mask, cv2.COLOR_BGR2GRAY)

label_encoder = LabelEncoder()
label_encoder.classes_ = np.load('classes.npy')
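# 3D BGR colour histogram over the masked pixels: 8 bins per channel -> 8*8*8 = 512 features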
hist = cv2.calcHist([flower], [0, 1, 2], gray_mask, [8, 8, 8], [0, 256, 0, 256, 0, 256])
cv2.normalize(hist, hist)
flower_class = label_encoder.inverse_transform(model.predict([hist.flatten()]))[0]

print(flower_class)
cv2.imshow("flower", flower)
cv2.waitKey(0)
예제 #55
0
for i in idxs:
	# load the testing image, clone it, and resize it
	image = cv2.imread(testingPaths[i])
	output = image.copy()
	output = cv2.resize(output, (128, 128))

	# pre-process the image in the same manner we did earlier
	image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
	image = cv2.resize(image, (200, 200))
	image = cv2.threshold(image, 0, 255,
		cv2.THRESH_BINARY_INV | cv2.THRESH_OTSU)[1]

	# quantify the image and make predictions based on the extracted
	# features using the last trained Random Forest
	features = quantify_image(image)
	preds = model.predict([features])
	label = le.inverse_transform(preds)[0]

	# draw the colored class label on the output image and add it to
	# the set of output images
	color = (0, 255, 0) if label == "healthy" else (0, 0, 255)
	cv2.putText(output, label, (3, 20), cv2.FONT_HERSHEY_SIMPLEX, 0.5,
		color, 2)
	images.append(output)

# create a montage using 128x128 "tiles" with 5 rows and 5 columns
# (build_montages comes from the imutils package)
montage = build_montages(images, (128, 128), (5, 5))[0]

# show the output montage
cv2.imshow("Output", montage)
cv2.waitKey(0)
예제 #56
0
class RecommenderDeepNN:
    '''
    Recommender for the Yelp dataset using the DeepFM model.

    Parameters
    ----------
    category: 'restaurants', Keep only businesses of a certain category
        - Options: 'restaurants', 'automotive', 'shopping'
    min_review: 5, Keep only businesses with a review_count greater than this value
    min_category: 50, Keep only categories that apply to more than this number of businesses
    weight: False, Whether or not to use weights for the attribute matrix in the DeepFM
    scaler: 'minmax', Scaler for dense features ('minmax' or 'standard')
    optimizer: 'adam', Optimizer for the DeepFM
    loss: 'mse', Loss function for the DeepFM
    batch_size: 256, Batch size used when fitting the DeepFM
    epochs: 10, Number of training epochs
    train_size: 0.8, Fraction of the data used for training; the remainder is used for validation
    deepfm__dnn_hidden_units: (128, 128),
    deepfm__l2_reg_linear: 1e-05,
    deepfm__l2_reg_embedding: 1e-05,
    deepfm__l2_reg_dnn: 0,
    deepfm__seed: 1024,
    deepfm__dnn_dropout: 0,
    deepfm__dnn_activation: 'relu'

    Example
    -------
    deepnn = RecommenderDeepNN(deepfm__seed=2048)
    deepnn.load_data(config.JSON_BUSINESS, config.CSV_RATINGS)
    deepnn.fit()
    deepnn.topN(260, n=5)

    deepnn = RecommenderDeepNN(scaler='standard', train_size=0.99)    
    deepnn.fit(config.JSON_BUSINESS, config.CSV_RATINGS)
    '''    
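    # NOTE: this snippet assumes module-level globals `features_sparse`,
    # `features_dense` and `params_deepnn` defined elsewhere in the project,
    # e.g. (illustrative values only, not from the original source):
    #   features_sparse = ['user_id', 'business_id']
    #   features_dense = ['stars', 'review_count']
    #   params_deepnn = {'category': 'restaurants', 'min_review': 5, 'min_category': 50,
    #                    'weight': False, 'scaler': 'minmax', 'optimizer': 'adam',
    #                    'loss': 'mse', 'batch_size': 256, 'epochs': 10, 'train_size': 0.8}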

    def __init__(self, **kwargs):
        '''
        Parameters
        ----------
        path_business: Path to the business.json file that contains 'attributes' and 'categories' as dictionaries for all businesses
        path_ratings: Path to the ratings.csv file that contains 'user_id', 'business_id' and 'stars'. The review text is not needed here.
        '''
        self.path_business = ""
        self.path_ratings = ""
        self.features_sparse = features_sparse
        self.features_dense = features_dense
        
        self.params = params_deepnn
        self.params_deepfm = {}
        self.business = None
        self.data = None

        self.attr2index = {}
        self.raw_to_iid = {}
        self.iid_to_raw = {}
        self.raw_to_uid = {}
        self.uid_to_raw = {}

        # Label encoders
        self.lbe_user = None
        self.lbe_item = None

        self.model = None
        self.features_linear = []
        self.features_dnn = []
        self.model_input = {}


        self.update_params(**kwargs)

    def load_data(self, path_business, path_ratings):
        '''
        Load data and transform it to usable format.
        '''
        print("Loading data ...")
        
        self.path_business = path_business
        self.path_ratings = path_ratings
        
        df = pd.read_json(self.path_business, lines=True, encoding='utf-8')
        df_ratings = pd.read_csv(self.path_ratings)
        df_ratings.rename({'stars':'rating'}, axis=1, inplace=True)

        to_keep = config.Keywords_Categories[self.params['category']]
        keeprows = utils.filter_business_with_categories(df, to_keep)
        df = df[keeprows]

        # Map user_id and business_id encodings to integers
        self.uid_to_raw = dict(df_ratings['user_id'].drop_duplicates().reset_index()['user_id'])
        self.raw_to_uid = {k:v for v, k in self.uid_to_raw.items()}
        self.iid_to_raw = dict(df['business_id'])
        self.raw_to_iid = {k:v for v, k in self.iid_to_raw.items()}

        self.business = df[['business_id', 'name', 'stars', 'review_count', 'categories']]

        df = df[df['review_count'] > self.params['min_review']]
        df = df_ratings.join(df[['business_id', 'stars', 'review_count', 'categories']].set_index('business_id'), on='business_id', how='right')
        # Has to be "right"... otherwise there will be NaNs
        # Also, use df.set_index() because df is smaller in size

        df['user_id'] = df['user_id'].map(self.raw_to_uid)
        df['business_id'] = df['business_id'].map(self.raw_to_iid)
        
        self.lbe_user = LabelEncoder()
        self.lbe_item = LabelEncoder()
        df['user_id'] = self.lbe_user.fit_transform(df['user_id'])
        df['business_id'] = self.lbe_item.fit_transform(df['business_id'])
        # x = lbe_user.inverse_transform(df_ratings['user_id'])
        # y = lbe_item.inverse_transform(df_ratings['business_id'])
        
        if(self.params['scaler'] == 'minmax'):
            scaler = MinMaxScaler(feature_range=(0,1))
        elif(self.params['scaler'] == 'standard'):
            scaler = StandardScaler()
        df[self.features_dense] = scaler.fit_transform(df[self.features_dense])

        lbe = LabelEncoder()
        for var in self.features_sparse:
            if(var not in ['business_id', 'user_id']):
                df[var] = lbe.fit_transform(df[var])

        self.data = df
        
        del df, df_ratings

    def _compile_business_categories(self, df_business):
        '''
        Find all the categories that apply to the businesses in the DataFrame df_business
        '''
        categories = Counter()
        for line in df_business['categories']:
            if(isinstance(line, str)):
                categories.update(re.split(', ', line))
        categories = pd.DataFrame.from_dict(categories, orient='index', columns=['count'])
        return categories

    def _build_category_dict(self, drop_categories=[]):
        attrs = self._compile_business_categories(self.data)
        attrs = attrs[attrs['count'] > self.params['min_category']].sort_values(by='count', ascending=False)
        for cat in drop_categories:
            attrs.drop(cat, inplace=True)
        self.attr2index = {k:v+1 for v, k in enumerate(attrs.index.to_list())}
        del attrs

    def _category_vectorizer(self, x):
        '''
        Label encode the categories of any business x into a list of indices. The mapping is given by the dictionary attr2index {category: index}.
        '''
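        # e.g. with attr2index = {'Pizza': 1, 'Bars': 2} (illustrative values),
        # 'Pizza, Bars, Nightlife' -> [1, 2, 0]; unknown categories map to 0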
        if(isinstance(x, str)):
            spt = re.split(', ', x)
            return list(map(lambda x: self.attr2index[x] if x in self.attr2index else 0, spt))
        else: return []

    def _get_category_matrix(self, df):
        attrs_matrix = [self._category_vectorizer(x) for x in df['categories'].values]
        attrs_max_len = max(np.array(list(map(len, attrs_matrix))))
        attrs_matrix = pad_sequences(attrs_matrix, maxlen=attrs_max_len, padding='post',)

        print("Matrix takes {:5.2f} MB".format(attrs_matrix.nbytes/1024./1024.))
        return attrs_matrix, attrs_max_len

    def _build_model(self):
        to_drop = config.Keywords_Categories[self.params['category']]
        self._build_category_dict(drop_categories=to_drop)
        attrs_matrix, attrs_max_len = self._get_category_matrix(self.data)
        
        vars_fixlen = [SparseFeat(var, self.data[var].nunique(),
                                  embedding_dim=4)
                       for var in self.features_sparse]
        vars_fixlen += [DenseFeat(var, 1,) for var in self.features_dense]
        vars_varlen = [VarLenSparseFeat(SparseFeat('categories',
                        vocabulary_size=len(self.attr2index) + 1,
                        embedding_dim=4),
                        maxlen=attrs_max_len, combiner='mean',
                        weight_name='attrs_weight' if self.params['weight'] else None)]

        self.features_linear = vars_fixlen + vars_varlen
        self.features_dnn = vars_fixlen + vars_varlen

        self.model = DeepFM(self.features_linear, self.features_dnn,
                            task='regression', **self.params_deepfm)
        return attrs_matrix, attrs_max_len

    def get_feature_names(self):
        return get_feature_names(self.features_linear + self.features_dnn)

    def _set_params_deepfm(self):
        for k, v in self.params.items():
            spt = k.split('__')
            if(len(spt) > 1): self.params_deepfm[spt[1]] = v

    def update_params(self, recompile=True, **kwargs):
        '''
        Update parameters for the recommender and re-compile the DeepFM model unless recompile is set to False.

        Example
        -------
        deepnn.update_params(epochs=20, deepfm__l2_reg_linear=2e-4)
        '''
        for (k, v) in kwargs.items():
            if(k in self.params):
                self.params[k] = v
            else:
                raise ValueError('{0} is not a valid parameter for RecommenderDeepNN.'.format(k))
        self._set_params_deepfm()
        if(recompile == True and self.model is not None):
            self.model = DeepFM(self.features_linear, self.features_dnn,
                                task='regression', **self.params_deepfm)

    def fit(self, path_business=None, path_ratings=None):
        if(self.data is None):
            self.load_data(path_business, path_ratings)

        model_input = self._get_model_input(self.data)

        self.model.compile(self.params['optimizer'],
                           self.params['loss'],
                           metrics=[self.params['loss']],)
        self.model.fit(model_input, self.data['rating'].values,
                       batch_size=self.params['batch_size'],
                       epochs=self.params['epochs'], 
                       validation_split=1-self.params['train_size'],
                       verbose=2)

    def _get_model_input(self, df):
        if(self.model is None):
            attrs_matrix, attrs_max_len = self._build_model()
        else:
            attrs_matrix, attrs_max_len = self._get_category_matrix(df)

        features = self.get_feature_names()

        model_input = {name: df[name] for name in features}
        model_input['categories'] = attrs_matrix
        if(self.params['weight']):
            model_input['attrs_weight'] = np.random.randn(df.shape[0], attrs_max_len, 1)
        return model_input

    def predictAllItemsForUser(self, uid):
        '''
        Returns predicted ratings of all businesses for any user (uid)
        '''
        df = self.data.drop_duplicates('business_id').drop('user_id', axis=1)
        df['user_id'] = uid

        model_input = self._get_model_input(df)
        pred = self.model.predict(model_input, 
                                  batch_size=self.params['batch_size'])
        return pd.DataFrame(pred, index=df['business_id'], columns=['pred'])

    def topN(self, uid, n=5):
        inner_uid = self.lbe_user.transform([uid])[0]
        pred = self.predictAllItemsForUser(inner_uid)
        topn = pred.nlargest(n, columns='pred')
        top_n_iid = self.lbe_item.inverse_transform(topn.index)
        predictions = topn['pred'].to_list()
        n_reviews = self.data['user_id'].value_counts()[inner_uid]
        print()
        print("UserID: {0},  Rated: {1}".format(uid, n_reviews))
        print("--------------------------------")
        topN_business = self.business.loc[top_n_iid]
        for i, (_, business) in enumerate(topN_business.iterrows()):
            print(business['name'])
            print(business['categories'])
            print("Pred: %4.2f  Avg: %3.1f out of %d reviews\n" % \
                  (predictions[i], business['stars'], business['review_count']))
예제 #57
0
    #load values from dataset:
    X_test = df.values[:, 3:]
    Y_test = df.values[:, 2:3].ravel()

    #DETERMINE RULESIZE BY THE NUMBER OF UNIQUE Y's
    rulesize = len(np.unique(Y_test))

    #integer encode with sklearn
    label_encoder = LabelEncoder()
    integer_encoded_Y = label_encoder.fit_transform(Y_test)
    #one hot encode with keras:
    onehot_Y_test = to_categorical(integer_encoded_Y)
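    # e.g. to_categorical([0, 2, 1]) -> [[1., 0., 0.], [0., 0., 1.], [0., 1., 0.]]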

    # #reverse encoding...
    target_names = np.unique(
        label_encoder.inverse_transform(argmax(onehot_Y_test, axis=1)))
    #convert to strings just in case...
    target_names = [str(i) for i in target_names]

    #def top 5, 10 accuracy:
    def top_5_categorical_accuracy(y_true, y_pred):
        return top_k_categorical_accuracy(y_true, y_pred, k=5)

    def top_10_categorical_accuracy(y_true, y_pred):
        return top_k_categorical_accuracy(y_true, y_pred, k=10)
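    # These wrappers can be passed to Keras via, e.g.,
    # model.compile(..., metrics=[top_5_categorical_accuracy, top_10_categorical_accuracy])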

    #save model
    #load from json
    print "Loading your model..."
    model_json = open(
        os.path.join(
예제 #58
0
    #df = df.query("rating_type == @target")  # filter rating type
    df = df.query("rating != 'Geen van allen'")

    with pd.option_context('mode.chained_assignment',
                           None):  # suppress SettingWithCopyWarning
        df.loc[:, 'rating'] = df.loc[:, 'rating'].replace(DU2EN)  # translate
        df_emo = df.query("rating_type == 'emotion'")
        df_emo.loc[:, 'rating'] = le.transform(df_emo['rating'])

    df_emo = convert_doubles_to_single_labels(df_emo['rating'],
                                              soft=False,
                                              keepdims=False)
    df_emo = pd.DataFrame(df_emo.values.argmax(axis=1),
                          columns=['rating'],
                          index=df_emo.index)
    df_emo['rating'] = le.inverse_transform(df_emo['rating'])
    df_emo = df_emo.rename({'rating': 'emotion'}, axis=1)

    df_tmp = df.query("rating_type == 'valence'")
    df_val = pd.DataFrame()
    for idx in df_emo.index:
        tmp = df_tmp.loc[idx, 'rating']
        if isinstance(tmp, pd.Series):
            val = df_tmp.loc[idx, 'rating'].astype(float).values.mean()
        else:
            val = tmp

        df_val.loc[idx, 'valence'] = val

    df_tmp = df.query("rating_type == 'arousal'")
    df_aro = pd.DataFrame()
예제 #59
0
class DefaultPreprocessor(AbstractPreprocessor):
    def __init__(self, config: ModelConfig, cache_home=None, use_cache=False):
        super().__init__(config)
        self.reset()
        self.X_types = None
        self.y_type = None
        self.cache_dir = self._prepare_cache_dir(cache_home)
        self.use_cache = use_cache

    def reset(self):
        self.metainfo = None
        self.categorical_columns = None
        self.var_len_categorical_columns = None
        self.continuous_columns = None
        self.y_lable_encoder = None
        self.X_transformers = collections.OrderedDict()

    def prepare_X(self, X):
        if not isinstance(X, pd.DataFrame):
            X = pd.DataFrame(X)
        if len(set(X.columns)) != len(list(X.columns)):
            cols = [
                item for item, count in collections.Counter(X.columns).items()
                if count > 1
            ]
            raise ValueError(f'Columns with duplicate names in X: {cols}')
        if X.columns.dtype != 'object':
            X.columns = ['x_' + str(c) for c in X.columns]
            logger.warn(f"Column index of X has been converted: {X.columns}")
        return X

    def fit_transform(self, X, y, copy_data=True):
        sign = self.get_X_y_signature(X, y)
        if self.use_cache:
            logger.info('Try to load (X, y) from cache')
            X_t, y_t = self.get_transformed_X_y_from_cache(sign)
            if X_t is not None and y_t is not None:
                if self.load_transformers_from_cache():
                    return X_t, y_t
            else:
                logger.info('Load failed')

        start = time.time()
        self.reset()
        if X is None:
            raise ValueError('X cannot be None.')
        if y is None:
            raise ValueError('y cannot be None.')
        if len(X.shape) != 2:
            raise ValueError('X must be a 2D dataset.')
        # if len(y.shape) != 1:
        #    raise ValueError(f'y must be a 1D datasets.')
        if X.shape[0] != y.shape[0]:
            raise ValueError(
                f"The number of samples of X and y must be the same. X.shape:{X.shape}, y.shape:{y.shape}"
            )

        y_df = pd.DataFrame(y)
        if y_df.isnull().sum().sum() > 0:
            raise ValueError("Missing values in y.")

        if copy_data:
            X = copy.deepcopy(X)
            y = copy.deepcopy(y)

        y = self.fit_transform_y(y)

        X = self.prepare_X(X)
        X = self.__prepare_features(X)
        if self.config.auto_imputation:
            X = self._imputation(X)
        if self.config.auto_encode_label:
            X = self._categorical_encoding(X)
        if self.config.auto_discrete:
            X = self._discretization(X)
        if self.config.apply_gbm_features and y is not None:
            X = self._apply_gbm_features(X, y)
        var_len_categorical_columns = self.config.var_len_categorical_columns
        if var_len_categorical_columns is not None and len(
                var_len_categorical_columns) > 0:
            X = self._var_len_encoder(X, var_len_categorical_columns)

        self.X_transformers['last'] = PassThroughEstimator()

        cat_cols = self.get_categorical_columns()
        cont_cols = self.get_continuous_columns()
        if len(cat_cols) > 0:
            X[cat_cols] = X[cat_cols].astype('category')
        if len(cont_cols) > 0:
            X[cont_cols] = X[cont_cols].astype('float')

        logger.info(f'fit_transform taken {time.time() - start}s')

        if self.use_cache:
            logger.info('Put (X, y) into cache')
            self.save_transformed_X_y_to_cache(sign, X, y)
            self.save_transformers_to_cache()
        return X, y

    def fit_transform_y(self, y):
        if self.config.task == consts.TASK_AUTO:
            self.task_, self.labels_ = deeptable.infer_task_type(y)
        else:
            self.task_ = self.config.task
        if self.task_ in [consts.TASK_BINARY, consts.TASK_MULTICLASS]:
            self.y_lable_encoder = LabelEncoder()
            y = self.y_lable_encoder.fit_transform(y)
            self.labels_ = self.y_lable_encoder.classes_
        elif self.task_ == consts.TASK_MULTILABEL:
            self.labels_ = list(range(y.shape[-1]))
        else:
            self.labels_ = []
        return y

    def transform(self, X, y, copy_data=True):
        sign = self.get_X_y_signature(X, y)
        if self.use_cache:
            logger.info('Try to load (X, y) from cache')
            X_t, y_t = self.get_transformed_X_y_from_cache(sign)
            if X_t is not None and y_t is not None:
                return X_t, y_t
            else:
                logger.info('Load failed')

        X_t = self.transform_X(X, copy_data)
        y_t = self.transform_y(y, copy_data)

        cat_cols = self.get_categorical_columns()
        cont_cols = self.get_continuous_columns()
        if len(cat_cols) > 0:
            X_t[cat_cols] = X_t[cat_cols].astype('category')
        if len(cont_cols) > 0:
            X_t[cont_cols] = X_t[cont_cols].astype('float')

        if self.use_cache:
            logger.info('Put (X, y) into cache')
            self.save_transformed_X_y_to_cache(sign, X_t, y_t)

        return X_t, y_t

    def transform_y(self, y, copy_data=True):
        logger.info("Transform [y]...")
        start = time.time()
        if copy_data:
            y = copy.deepcopy(y)
        if self.y_lable_encoder is not None:
            y = self.y_lable_encoder.transform(y)
        logger.info(f'transform_y taken {time.time() - start}s')
        y = np.array(y)
        return y

    def transform_X(self, X, copy_data=True):
        start = time.time()
        logger.info("Transform [X]...")
        if copy_data:
            X = copy.deepcopy(X)
        X = self.prepare_X(X)
        steps = [step for step in self.X_transformers.values()]
        pipeline = make_pipeline(*steps)
        X_t = pipeline.transform(X)
        logger.info(f'transform_X taken {time.time() - start}s')
        return X_t

    def inverse_transform_y(self, y_indicator):
        if self.y_lable_encoder is not None:
            return self.y_lable_encoder.inverse_transform(y_indicator)
        else:
            return y_indicator

    def __prepare_features(self, X):
        start = time.time()

        logger.info(f'Preparing features...')
        num_vars = []
        convert2cat_vars = []
        cat_vars = []
        excluded_vars = []

        if self.config.cat_exponent >= 1:
            raise ValueError(
                f'"cat_expoent" must be less than 1, not {self.config.cat_exponent} .'
            )

        var_len_categorical_columns = self.config.var_len_categorical_columns
        var_len_column_names = []
        if var_len_categorical_columns is not None and len(
                var_len_categorical_columns) > 0:
            # check items
            for v in var_len_categorical_columns:
                if not isinstance(v, (tuple, list)) or len(v) != 3:
                    raise ValueError(
                        "Var len column config should be a tuple of length 3: (name, sep, pooling_strategy).")
                else:
                    var_len_column_names.append(v[0])
            var_len_col_sep_dict = {
                v[0]: v[1]
                for v in var_len_categorical_columns
            }
            var_len_col_pooling_strategy_dict = {
                v[0]: v[2]
                for v in var_len_categorical_columns
            }
        else:
            var_len_col_sep_dict = {}
            var_len_col_pooling_strategy_dict = {}

        unique_upper_limit = round(X.shape[0]**self.config.cat_exponent)
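        # e.g. with 10,000 rows and cat_exponent=0.5 the limit is 100, so numeric
        # columns with fewer than 100 unique values may be auto-categorized below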
        for c in X.columns:
            nunique = X[c].nunique()
            dtype = str(X[c].dtype)

            if nunique <= 1 and self.config.auto_discard_unique:
                continue

            if c in self.config.exclude_columns:
                excluded_vars.append((c, dtype, nunique))
                continue

            # handle var len feature
            if c in var_len_column_names:
                self.__append_var_len_categorical_col(
                    c, nunique, var_len_col_sep_dict[c],
                    var_len_col_pooling_strategy_dict[c])
                continue

            if self.config.categorical_columns is not None and isinstance(
                    self.config.categorical_columns, list):
                if c in self.config.categorical_columns:
                    cat_vars.append((c, dtype, nunique))
                else:
                    if np.issubdtype(dtype, np.number):
                        num_vars.append((c, dtype, nunique))
                    else:
                        print(
                            f'Column [{c}] has been discarded. It is not numeric and not in [config.categorical_columns].'
                        )
            else:
                if dtype == 'object' or dtype == 'category' or dtype == 'bool':
                    cat_vars.append((c, dtype, nunique))
                elif self.config.auto_categorize and nunique < unique_upper_limit:
                    convert2cat_vars.append((c, dtype, nunique))
                else:
                    num_vars.append((c, dtype, nunique))

        if len(convert2cat_vars) > 0:
            ce = CategorizeEncoder([c for c, d, n in convert2cat_vars],
                                   self.config.cat_remain_numeric)
            X = ce.fit_transform(X)
            self.X_transformers['categorize'] = ce
            if self.config.cat_remain_numeric:
                cat_vars = cat_vars + ce.new_columns
                num_vars = num_vars + convert2cat_vars
            else:
                cat_vars = cat_vars + convert2cat_vars

        logger.debug(
            f'{len(cat_vars)} categorical variables and {len(num_vars)} continuous variables found. '
            f'{len(convert2cat_vars)} of them are from continuous to categorical.'
        )

        self.__append_categorical_cols([(c[0], c[2] + 2) for c in cat_vars])
        self.__append_continuous_cols([c[0] for c in num_vars],
                                      consts.INPUT_PREFIX_NUM + 'all')
        print(f'Preparing features taken {time.time() - start}s')
        return X

    def _imputation(self, X):
        start = time.time()
        logger.info('Data imputation...')
        continuous_vars = self.get_continuous_columns()
        categorical_vars = self.get_categorical_columns()
        var_len_categorical_vars = self.get_var_len_categorical_columns()

        transformers = [
            ('categorical',
             SimpleImputer(missing_values=np.nan,
                           strategy='constant'), categorical_vars),
            ('continuous', SimpleImputer(missing_values=np.nan,
                                         strategy='mean'), continuous_vars),
        ]

        if len(var_len_categorical_vars) > 0:
            transformers.append(
                ('var_len_categorical',
                 SimpleImputer(missing_values=np.nan, strategy='constant'),
                 var_len_categorical_vars), )

        ct = ColumnTransformer(transformers)
        dfwrapper = DataFrameWrapper(
            ct, categorical_vars + continuous_vars + var_len_categorical_vars)
        X = dfwrapper.fit_transform(X)
        self.X_transformers['imputation'] = dfwrapper
        print(f'Imputation taken {time.time() - start}s')
        return X

    def _categorical_encoding(self, X):
        start = time.time()
        logger.info('Categorical encoding...')
        vars = self.get_categorical_columns()
        mle = MultiLabelEncoder(vars)
        X = mle.fit_transform(X)
        self.X_transformers['label_encoder'] = mle
        print(f'Categorical encoding taken {time.time() - start}s')
        return X

    def _discretization(self, X):
        start = time.time()
        logger.info('Data discretization...')
        vars = self.get_continuous_columns()
        mkbd = MultiKBinsDiscretizer(vars)
        X = mkbd.fit_transform(X)
        self.__append_categorical_cols([
            (new_name, bins + 1) for name, new_name, bins in mkbd.new_columns
        ])
        self.X_transformers['discreter'] = mkbd
        print(f'Discretization taken {time.time() - start}s')
        return X

    def _var_len_encoder(self, X, var_len_categorical_columns):
        start = time.time()
        logger.info('Encoding var-length features...')
        transformer = MultiVarLenFeatureEncoder(var_len_categorical_columns)
        X = transformer.fit_transform(X)

        # update var_len_categorical_columns
        for c in self.var_len_categorical_columns:
            _encoder: VarLenFeatureEncoder = transformer._encoders[c.name]
            c.max_elements_length = _encoder.max_element_length

        self.X_transformers['var_len_encoder'] = transformer
        print(f'Var-length encoding taken {time.time() - start}s')
        return X

    def _apply_gbm_features(self, X, y):
        start = time.time()
        logger.info('Extracting GBM features...')
        cont_vars = self.get_continuous_columns()
        cat_vars = self.get_categorical_columns()
        gbmencoder = LgbmLeavesEncoder(cat_vars, cont_vars, self.task_,
                                       **self.config.gbm_params)
        X = gbmencoder.fit_transform(X, y)
        self.X_transformers['gbm_features'] = gbmencoder
        if self.config.gbm_feature_type == consts.GBM_FEATURE_TYPE_EMB:
            self.__append_categorical_cols([
                (name, X[name].max() + 1) for name in gbmencoder.new_columns
            ])
        else:
            self.__append_continuous_cols(
                [name for name in gbmencoder.new_columns],
                consts.INPUT_PREFIX_NUM + 'gbm_leaves')
        print(f'Extracting gbm features taken {time.time() - start}s')
        return X

    def __append_var_len_categorical_col(self, name, voc_size, sep,
                                         pooling_strategy):
        logger.debug(f'Var len categorical variables {name} appended.')

        if self.config.fixed_embedding_dim:
            embedding_output_dim = self.config.embeddings_output_dim if self.config.embeddings_output_dim > 0 else consts.EMBEDDING_OUT_DIM_DEFAULT
        else:
            embedding_output_dim = 0

        if self.var_len_categorical_columns is None:
            self.var_len_categorical_columns = []

        vc = \
            VarLenCategoricalColumn(name,
                                    voc_size,
                                    embedding_output_dim if embedding_output_dim > 0 else min(4 * int(pow(voc_size, 0.25)), 20),
                                    sep=sep,
                                    pooling_strategy=pooling_strategy)

        self.var_len_categorical_columns.append(vc)

    def __append_categorical_cols(self, cols):
        logger.debug(f'{len(cols)} categorical variables appended.')

        if self.config.fixed_embedding_dim:
            embedding_output_dim = self.config.embeddings_output_dim if self.config.embeddings_output_dim > 0 else consts.EMBEDDING_OUT_DIM_DEFAULT
        else:
            embedding_output_dim = 0
            #

        if self.categorical_columns is None:
            self.categorical_columns = []

        if cols is not None and len(cols) > 0:
            self.categorical_columns = self.categorical_columns + \
                                       [CategoricalColumn(name,
                                                          voc_size,
                                                          embedding_output_dim
                                                          if embedding_output_dim > 0
                                                          else min(4 * int(pow(voc_size, 0.25)), 20))
                                        for name, voc_size in cols]
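        # Heuristic embedding size when no fixed dimension is configured:
        # min(4 * int(voc_size ** 0.25), 20), e.g. voc_size=100 -> 12, voc_size=10000 -> 20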

    def __append_continuous_cols(self, cols, input_name):
        if self.continuous_columns is None:
            self.continuous_columns = []
        if cols is not None and len(cols) > 0:
            self.continuous_columns = self.continuous_columns + [
                ContinuousColumn(name=input_name,
                                 column_names=[c for c in cols])
            ]

    def get_categorical_columns(self):
        return [c.name for c in self.categorical_columns]

    def get_var_len_categorical_columns(self):
        if self.var_len_categorical_columns is not None:
            return [c.name for c in self.var_len_categorical_columns]
        else:
            return []

    def get_continuous_columns(self):
        cont_vars = []
        for c in self.continuous_columns:
            cont_vars = cont_vars + c.column_names
        return cont_vars

    def _prepare_cache_dir(self, cache_home, clear_cache=False):
        if cache_home is None:
            cache_home = 'cache'
        if cache_home[-1] == '/':
            cache_home = cache_home[:-1]

        cache_home = os.path.expanduser(f'{cache_home}')
        if not os.path.exists(cache_home):
            os.makedirs(cache_home)
        else:
            if clear_cache:
                shutil.rmtree(cache_home)
                os.makedirs(cache_home)
        cache_dir = f'{cache_home}/{self.signature}'
        if not os.path.exists(cache_dir):
            os.makedirs(cache_dir)
        return cache_dir

    def get_transformed_X_y_from_cache(self, sign):
        file_x_y = f'{self.cache_dir}/X_y_{sign}.h5'
        X_t, y_t = None, None
        if os.path.exists(file_x_y):
            global h5
            try:
                h5 = pd.HDFStore(file_x_y)
                df = h5['data']
                y_t = df.pop('saved__y__')
                X_t = df
            except Exception as e:
                logger.error(e)
                h5.close()
                os.remove(file_x_y)
        return X_t, y_t

    def save_transformed_X_y_to_cache(self, sign, X, y):
        filepath = f'{self.cache_dir}/X_y_{sign}.h5'
        try:
            # x_t = X.copy(deep=True)
            X.insert(0, 'saved__y__', y)
            X.to_hdf(filepath, key='data', mode='w', format='t')
            return True
        except Exception as e:
            logger.error(e)
            if os.path.exists(filepath):
                os.remove(filepath)
        return False

    def load_transformers_from_cache(self):
        transformer_path = f'{self.cache_dir}/transformers.pkl'
        if os.path.exists(transformer_path):
            try:
                with open(transformer_path, 'rb') as input:
                    preprocessor = pickle.load(input)
                    self.__dict__.update(preprocessor.__dict__)
                    return True
            except Exception as e:
                logger.error(e)
                os.remove(transformer_path)
        return False

    def save_transformers_to_cache(self):
        transformer_path = f'{self.cache_dir}/transformers.pkl'
        with open(transformer_path, 'wb') as output:
            pickle.dump(self, output, protocol=2)

    def clear_cache(self):
        shutil.rmtree(self.cache_dir)
        os.makedirs(self.cache_dir)
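
# A minimal usage sketch for DefaultPreprocessor (not part of the original source;
# `config` stands in for a deeptables ModelConfig-like object and the data names are assumptions):
#
#   prep = DefaultPreprocessor(config, use_cache=False)
#   X_t, y_t = prep.fit_transform(X_train, y_train)
#   X_val_t, y_val_t = prep.transform(X_val, y_val)
#   labels = prep.inverse_transform_y(predicted_label_ids)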
예제 #60
0
    print("The full size of the synthetic data set is", synthetic_df.shape)
    print("\n Displaying the first five rows of the *real* data set:\n")
    print(real_df.head(5))
    print("The full size of the real data set is", synthetic_df.shape)
    print("\n The features are described as the following: \n")
    print(features_description.to_csv(index=False))

### preprocess the data for machine learning

# y: readmitted
# substitute "NO" with 2, "<30" with 0, ">30" with 1
# real_df.readmitted = real_df.readmitted.replace(["NO", "<30", ">30"], [2, 0, 1])
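# Note: LabelEncoder sorts classes lexicographically, which here gives the same mapping
# as the commented-out replacement above: '<30' -> 0, '>30' -> 1, 'NO' -> 2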
real_df.readmitted.value_counts()
le_real = LabelEncoder()
real_y = le_real.fit_transform(real_df.readmitted)  # numpy.ndarray
le_real.inverse_transform(real_y)  # maybe for visualisation

synthetic_df.readmitted.value_counts()
le_synthetic = LabelEncoder()
synthetic_y = le_synthetic.fit_transform(synthetic_df.readmitted)  # numpy.ndarray
le_synthetic.inverse_transform(synthetic_y)  # maybe for visualisation

# x
real_df = real_df.drop(labels="readmitted", axis=1)  # axis 1 means columns
real_x = pd.get_dummies(real_df)  # one-hot encode, non-categorical variables will be left unchanged
real_x_columns = real_x.columns  # maybe for visualisation

synthetic_df = synthetic_df.drop(labels="readmitted", axis=1)  # axis 1 means columns
synthetic_x = pd.get_dummies(synthetic_df)  # non-categorical variables will be left unchanged
synthetic_x_columns = synthetic_x.columns  # maybe for visualisation