Example #1
File: runs.py Project: Sourge/udacity
from sklearn.svm import LinearSVR
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split as tts


def linearSVR(data):
    X = data.drop(["id", "date", "price","long","lat", "zipcode","yr_renovated", "sqft_above", "sqft_basement"], axis=1)
    y = data["price"]
    X_train, X_test, y_train, y_test = tts(X, y, test_size=0.10, random_state=42)
    svr = LinearSVR(random_state=42)
    svr.fit(X_train, y_train)
    y_predict = svr.predict(X_test)
    print "r2-score for LinearSVR: %f" % r2_score(y_test, y_predict)
Example #2
def linearSVR(X, c_param, norm=2):
    if norm == 2:
        XX = normalizeL2(X)
    else:
        XX = X  # fall back to the raw features; XX was undefined for norm != 2

    T = X.shape[0] # temporal length
    clf = LinearSVR(C=c_param, dual=False, loss='squared_epsilon_insensitive', \
                    epsilon=0.1, tol=0.001, verbose=False)  # epsilon is "-p" in C's liblinear and tol is "-e"
    clf.fit(XX, np.linspace(1,T,T))

    return clf.coef_
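normalizeL2 is a helper defined elsewhere in the project; a minimal sketch under the assumption that it performs row-wise L2 normalization:

import numpy as np

def normalizeL2(X):
    # assumed behavior: scale each row to unit L2 norm
    norms = np.linalg.norm(X, axis=1, keepdims=True)
    return X / np.maximum(norms, 1e-12)

w = linearSVR(np.random.rand(30, 8), c_param=1.0)  # returns clf.coef_: one weight per feature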
Example #3
    def train(self, trainSet):
        pntNum = trainSet.meanShape.shape[0]
        treeNum = int(self.maxTreeNum/pntNum)
        
        ### Train the random forests
        begTime = time.time()
        for i in range(pntNum):
            rf = RandForest(treeDepth = self.treeDepth,
                            treeNum   = treeNum,
                            feaNum    = self.feaNum,
                            radius    = self.radius,
                            binNum    = self.binNum,
                            feaRange  = self.feaRange)
            rf.train(trainSet, i)
            self.rfs.append(rf)
        elapse = getTimeByStamp(begTime, time.time(), 'min')
        print("\t\tRandom Forest     : %f mins"%elapse)

        ### Extract the local binary features
        begTime = time.time()
        feas = self.genFeaOnTrainset(trainSet)
        elapse = getTimeByStamp(begTime, time.time(), 'min')
        print("\t\tExtract LBFs      : %f mins"%elapse)

        ### Global regression
        begTime = time.time()
        y = trainSet.residuals
        y = y.reshape(y.shape[0], y.shape[1]*y.shape[2])
        for i in range(pntNum*2):
            ### TODO Show the training result 
            reg=LinearSVR(epsilon=0.0, 
                          C = 1.0/feas.shape[0],
                          loss='squared_epsilon_insensitive',
                          fit_intercept = True)
            reg.fit(feas, y[:, i])
            self.regs.append(reg)
        elapse = getTimeByStamp(begTime, time.time(), 'min')
        print("\t\tGlobal Regression : %f mins"%elapse)

        ### Update the initshapes
        begTime = time.time()
        for i in range(pntNum):
            regX = self.regs[2*i]
            regY = self.regs[2*i+1]
            
            x = regX.predict(feas)
            y = regY.predict(feas)
            delta = NP.squeeze(NP.dstack((x,y)))
            delta = Affine.transPntsForwardWithDiffT(delta, 
                                                     trainSet.ms2reals)
            delta = NP.multiply(delta, 
                                trainSet.bndBoxs[:,[2,3]])
            trainSet.initShapes[:,i,:] = trainSet.initShapes[:,i,:] + delta
        elapse = getTimeByStamp(begTime, time.time(), 'min')
        print("\t\tUpdate Shape      : %f mins"%elapse)
Example #4
def main(train_file, model_file):
    train_x, train_y = load_trainingData(train_file)
    #LR = LinearRegression(normalize = True)
    #LR = Ridge(alpha=0.5)
    #LR = SVR(C=1.0, epsilon=0.2, verbose = True)
    LR = LinearSVR(verbose = 1, epsilon = 0.1)
    logging("training model...")
    starttime = datetime.now()
    LR.fit(train_x, train_y)
    logging("training model, eplased time:%s" % str(datetime.now() - starttime))
    logging("saving model")
    joblib.dump(LR, model_file)
Example #5
    def GlobalRegression(self, lbf, shape_residual):
        m = K  # K, landmark_n and img_o_width are module-level globals in the source project
        n, f = lbf.shape
        # prepare linear regression X, Y
        X = lbf
        Y = shape_residual / img_o_width
        # parallel
        for i in range(landmark_n*2):
            reg = LinearSVR(epsilon=0.0, C=1.0/n, loss='squared_epsilon_insensitive', fit_intercept=True)
            reg.fit(X, Y[:, i])
            self.w[i] = reg.coef_
        self.w = self.w * img_o_width
Example #6
class SVRR(object):

    def __init__(self, C):
        self.regression = LinearSVR(C=C)

    def fit(self, xs, ys):
        xs = xs.values
        ys = ys['y']
        self.regression.fit(xs, ys)

    def predict(self, xs):
        xs = xs.values
        ys = self.regression.predict(xs)
        return ys
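SVRR assumes pandas inputs: xs must expose .values and ys must be indexable by 'y'. A minimal sketch with synthetic data:

import numpy as np
import pandas as pd

xs = pd.DataFrame(np.random.rand(20, 3), columns=['a', 'b', 'c'])
ys = pd.DataFrame({'y': np.random.rand(20)})
model = SVRR(C=1.0)
model.fit(xs, ys)
print(model.predict(xs)[:5])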
Example #7
    def globalRegress(self, posSet, negSet):
        self.feaDim = self.getFeaDim()
        ### Extract the local binary features
        begTime = time.time()
        posFeas = self.genFeaOnTrainset(posSet)
        negFeas = self.genFeaOnTrainset(negSet)
        t = getTimeByStamp(begTime, time.time(), 'min')
        print("\t\tExtract LBFs      : %f mins"%t)

        ### Global regression
        begTime = time.time()
        y = posSet.residuals
        y = y.reshape(y.shape[0], y.shape[1]*y.shape[2])
        for i in range(posSet.pntNum*2):
            ### TODO Show the training result 
            reg=LinearSVR(epsilon=0.0, 
                          C = 1.0/posFeas.shape[0],
                          loss='squared_epsilon_insensitive',
                          fit_intercept = True)
            reg.fit(posFeas, y[:, i])
            self.globalReg.append(reg)
        t = getTimeByStamp(begTime, time.time(), 'min')
        print("\t\tGlobal Regression : %f mins"%t)

        ### Update the initshapes
        begTime = time.time()
        for i in range(posSet.pntNum):
            regX = self.globalReg[2*i]
            regY = self.globalReg[2*i+1]
            
            x = regX.predict(posFeas)
            y = regY.predict(posFeas)
            delta = NP.squeeze(NP.dstack((x,y)))
            delta = NP.multiply(delta,
                                posSet.winSize)
            posSet.initShapes[:,i,:] = posSet.initShapes[:,i,:] + delta
            x = regX.predict(negFeas)
            y = regY.predict(negFeas)
            delta = NP.squeeze(NP.dstack((x,y)))
            delta = NP.multiply(delta,
                                negSet.winSize)
            negSet.initShapes[:,i,:] = negSet.initShapes[:,i,:] + delta
        t = getTimeByStamp(begTime, time.time(), 'min')

        self.applyPntOffsetIntoTree()
        print("\t\tUpdate Shape      : %f mins"%t)
Example #8
    class LinearSVRPermuteCoef:
        def __init__(self, **kwargs):
            self.model = LinearSVR(**kwargs)

        def fit(self, X, y):
            self.model.fit(X, y)

            self.coef_ = self.model.coef_
            self.intercept_ = self.model.intercept_

            def add_coef(arr, fn):
                arr.append(fn(self.coef_))

            add_coef(coeffs_state['max'], np.max)
            add_coef(coeffs_state['min'], np.min)

            return self

        def get_params(self, deep=True):
            return self.model.get_params(deep)

        def set_params(self, **kwargs):
            self.model.set_params(**kwargs)
            return self

        def predict(self, X):
            return self.model.predict(X)

        def score(self, X, y, sample_weight=None):
            if sample_weight is not None:
                return self.model.score(X, y, sample_weight)
            else:
                return self.model.score(X, y)

        @staticmethod
        def permute_min_coefs():
            return coeffs_state['min']

        @staticmethod
        def permute_max_coefs():
            return coeffs_state['max']

        @staticmethod
        def reset_perm_coefs():
            coeffs_state['min'] = []
            coeffs_state['max'] = []
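The class reads and writes a module-level coeffs_state dict that the surrounding code is expected to provide; a sketch of that contract, assuming the class is reachable at module scope (the dict and data here are illustrative):

import numpy as np

coeffs_state = {'min': [], 'max': []}  # assumed module-level state

est = LinearSVRPermuteCoef(random_state=0)
est.fit(np.random.rand(50, 3), np.random.rand(50))
print(LinearSVRPermuteCoef.permute_max_coefs())  # one max-coefficient entry per fit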
Example #9
def meta_model_fit(X_train, y_train, svm_hardness, fit_intercept, number_of_threads, regressor_type="LinearSVR"):
    """
    Trains meta-labeler for predicting number of labels for each user.

    Based on: Tang, L., Rajan, S., & Narayanan, V. K. (2009, April).
              Large scale multi-label classification via metalabeler.
              In Proceedings of the 18th international conference on World wide web (pp. 211-220). ACM.
    """
    if regressor_type == "LinearSVR":
        if X_train.shape[0] > X_train.shape[1]:
            dual = False
        else:
            dual = True

        model = LinearSVR(C=svm_hardness, random_state=0, dual=dual,
                          fit_intercept=fit_intercept)
        y_train_meta = y_train.sum(axis=1)
        model.fit(X_train, y_train_meta)
    else:
        print("Invalid regressor type.")
        raise RuntimeError

    return model
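A usage sketch on synthetic data; the meta-target is the per-row label count, which the function derives with y_train.sum(axis=1). Note that with more rows than columns the function sets dual=False, which LinearSVR's default 'epsilon_insensitive' loss does not support, so this sketch keeps rows <= columns:

import numpy as np

X = np.random.rand(50, 60)
Y = (np.random.rand(50, 5) > 0.5).astype(int)  # binary label-indicator matrix
meta = meta_model_fit(X, Y, svm_hardness=1.0, fit_intercept=True, number_of_threads=1)
print(meta.predict(X[:3]))  # estimated label counts for the first three rows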
Example #10
def build_svm(x_train, y_train, x_test, y_test, n_features):
    """
    Constructing a support vector regression model from input dataframe
    :param x_train: features dataframe for model training
    :param y_train: target dataframe for model training
    :param x_test: features dataframe for model testing
    :param y_test: target dataframe for model testing
    :param n_features: number of features; used to name the output pickle file
    :return: None
    """

    clf = LinearSVR(random_state=1, dual=False, epsilon=0,
                    loss='squared_epsilon_insensitive')
    # A fixed integer random_state makes the run reproducible
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)

    # Mean absolute error regression loss
    mean_abs = sklearn.metrics.mean_absolute_error(y_test, y_pred)
    # Mean squared error regression loss
    mean_sq = sklearn.metrics.mean_squared_error(y_test, y_pred)
    # Median absolute error regression loss
    median_abs = sklearn.metrics.median_absolute_error(y_test, y_pred)
    # R^2 (coefficient of determination) regression score function
    r2 = sklearn.metrics.r2_score(y_test, y_pred)
    # Explained variance regression score function
    exp_var_score = sklearn.metrics.explained_variance_score(y_test, y_pred)

    with open('../trained_networks/svm_%d_data.pkl' % n_features, 'wb') as results:
        pickle.dump(clf, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(mean_abs, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(mean_sq, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(median_abs, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(r2, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(exp_var_score, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(y_pred, results, pickle.HIGHEST_PROTOCOL)

    return
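A hypothetical call on synthetic data; the ../trained_networks/ directory must already exist, and sklearn and pickle are assumed imported as the function body uses them:

import numpy as np
from sklearn.model_selection import train_test_split

X = np.random.rand(200, 10)
y = np.random.rand(200)
x_tr, x_te, y_tr, y_te = train_test_split(X, y, test_size=0.25, random_state=1)
build_svm(x_tr, y_tr, x_te, y_te, n_features=10)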
Example #11
del globals()['unqLikesLIDs']
del globals()['profilesDF']
del globals()['profiles']
del globals()['profilesLSo']
del globals()['profilesLS']
del globals()['row']
del globals()['tmpLS']
del globals()['tmpAGE']
del globals()['profsTOlikes']
del globals()['i']
del globals()['tmpIND']

seed = 7
np.random.seed(seed)  # np.random.seed returns None, so there is nothing useful to assign
X_train, X_test, y_train, y_test = train_test_split(likesMAT,
                                                    consARR,
                                                    test_size=1500)

myTOL = float(sys.argv[1])
mySVM = LinearSVR(tol=myTOL)
#mySVM.fit(likesMAT, consARR)
mySVM.fit(X_train, y_train)

y_pred = mySVM.predict(X_test)
import math
myRMSE = math.sqrt(metrics.mean_squared_error(y_test, y_pred))
print("cons, Linear SVM:  ", str(myTOL), " ", myRMSE)

# joblib.dump(mySVM, "/Users/jamster/LinearSVM-A-cons.xz", compress=9)

# impSVM = joblib.load("/Users/jamster/LinearSVM-A-cons.xz")
Example #12
def submit(feature_files, training_dates, feature_set_folder):
    train_set1 = pd.concat([
        dfs(0, len(feature_files), feature_files + ['y'], 'dataset/' + date)
        for date in training_dates
    ])

    train_set = train_set1[train_set1.time_diff > 15]
    test_set = train_set1[train_set1.time_diff <= 15]

    train_set = train_set.fillna(-1, downcast='infer')
    test_set = test_set.fillna(-1, downcast='infer')

    train_set['y_log'] = train_set['y'].apply(lambda x: np.log(1 + x))
    test_set['y_log'] = test_set['y'].apply(lambda x: np.log(1 + x))

    feature_set = [
        x for x in train_set.columns
        if x not in ['y', 'time', 'province', 'market', 'name', 'type', 'y_log']
    ]

    scaler = StandardScaler()
    scaler.fit(train_set[feature_set].as_matrix())
    #
    # model1
    model1 = LinearRegression(normalize=True)
    model1.fit(scaler.transform(train_set[feature_set].as_matrix()),
               train_set['y'].as_matrix(),
               sample_weight=[1.0 / x / x for x in train_set['y'].as_matrix()])
    print(list(zip(feature_set, model1.coef_)))
    test_set['predictY'] = model1.predict(
        scaler.transform(test_set[feature_set].as_matrix()))
    test_set.to_csv('result/' + feature_set_folder +
                    '/model1_online_stacking1.csv')
    print(test_set)

    # model2
    model2 = XGBRegressor(n_estimators=600,
                          learning_rate=0.01,
                          max_depth=6,
                          colsample_bytree=0.7,
                          subsample=0.7,
                          colsample_bylevel=0.7)
    model2.fit(train_set[feature_set].as_matrix(),
               train_set['y'].as_matrix(),
               sample_weight=[1.0 / x / x for x in train_set['y'].as_matrix()])
    test_set['predictY'] = model2.predict(test_set[feature_set].as_matrix())
    test_set.to_csv('result/' + feature_set_folder +
                    '/model2_online_stacking1.csv')

    # model3
    model3 = LinearSVR(tol=1e-7)
    model3.fit(scaler.transform(train_set[feature_set].as_matrix()),
               train_set['y'].as_matrix(),
               sample_weight=[1.0 / x / x for x in train_set['y'].as_matrix()])
    test_set['predictY'] = model3.predict(
        scaler.transform(test_set[feature_set].as_matrix()))
    test_set.to_csv('result/' + feature_set_folder + '/model3_offline.csv')

    # model4
    model4 = RandomForestRegressor(n_estimators=1000,
                                   max_depth=7,
                                   max_features=0.2,
                                   max_leaf_nodes=100)
    model4.fit(train_set[feature_set].as_matrix(),
               train_set['y'].as_matrix(),
               sample_weight=np.array(
                   [1.0 / x / x for x in train_set['y'].as_matrix()]))
    test_set['predictY'] = model4.predict(test_set[feature_set].as_matrix())
    test_set.to_csv('result/' + feature_set_folder +
                    '/model4_online_stacking1.csv')

    # model5
    model5 = XGBRegressor(n_estimators=600,
                          learning_rate=0.01,
                          max_depth=6,
                          colsample_bytree=0.7,
                          subsample=0.7,
                          colsample_bylevel=0.7,
                          seed=10000)
    model5.fit(train_set[feature_set].as_matrix(),
               train_set['y'].as_matrix(),
               sample_weight=[1.0 / x / x for x in train_set['y'].as_matrix()])
    test_set['predictY'] = model5.predict(test_set[feature_set].as_matrix())
    test_set.to_csv('result/' + feature_set_folder +
                    '/model5_online_stacking1.csv')

    # model6
    model6 = XGBRegressor(n_estimators=600,
                          learning_rate=0.01,
                          max_depth=5,
                          colsample_bytree=0.7,
                          subsample=0.7,
                          colsample_bylevel=0.7)
    model6.fit(train_set[feature_set].as_matrix(),
               train_set['y'].as_matrix(),
               sample_weight=[1.0 / x / x for x in train_set['y'].as_matrix()])
    test_set['predictY'] = model6.predict(test_set[feature_set].as_matrix())
    test_set.to_csv('result/' + feature_set_folder +
                    '/model6_online_stacking1.csv')

    pass
Example #13
import csv
import numpy as np
from sklearn.svm import LinearSVR as SVR
from PIL import Image
from hand_gen import get_eigenvalues

# Train a linear SVR

npzfile = np.load('hand_data.npz')
X = npzfile['X']
y = npzfile['y']

# we already normalize these values in gen.py
# X /= X.max(axis=0, keepdims=True)

svr = SVR(C=1)
svr.fit(X, y)

# svr.get_params() to save the parameters
# svr.set_params() to restore the parameters

# predict
# testdata = np.load('data.npz')

testdata = []
ratio = 0.05
for i in range(1, 482):
    img = Image.open("data/hand/hand.seq%s.png" % str(i))
    width = int(img.size[0] * ratio)
    height = int(img.size[1] * ratio)
    img = img.resize((width, height), Image.BILINEAR)
    img_data = np.array(img).flatten()
Example #14
class Learner:
    """ Class responsible for training models, finding the best fit and making
        rate predictions based on the best fit model.
    """
    def __init__(self, instrument, predictor):
        """ Initialize the Learner class based on a predictor and instrument.

            Args:
                instrument: Instrument object.
                predictor: Predictor object.
        """
        self.instrument = instrument
        self.predictor = predictor
        self.init_learning_model()

    def init_learning_model(self):
        """ Initialize the learning model according to the given predictor.

            Args:
                None.
        """
        if self.predictor.name == 'treeRegressor':
            self.model = DecisionTreeRegressor()
        if self.predictor.name == 'linearSVMRegressor':
            self.model = LinearSVR()

    def get_training_samples(self, end_date):
        """ Retrieve all training samples before the end date.

            Args:
                before: Date object. Retrieve training samples before end_date.

            Returns:
                all_samples: List of TrainingSample.
        """
        last_date = None
        if end_date is not None:
            last_date = end_date - datetime.timedelta(1)
        all_samples = ts.get_samples(instrument=self.instrument,
                                     end=last_date,
                                     order_by=['date'])
        return all_samples

    def learn(self, **kwargs):
        """ Use the training samples for the given instrument to build a
            learning model for the learner.

            Args:
                Named arguments.
                    cv_fold: Integer. Number of folds for cross validation.
                    before: Date object. Use samples before this date.

            Returns:
                best_score: float. Best cross validation score from learning.
        """
        cv_fold = kwargs.get('cv_fold')
        end_date = kwargs.get('before')

        all_training_samples = self.get_training_samples(end_date)
        features = [x.features for x in all_training_samples]
        targets = [x.target for x in all_training_samples]

        self.model.set_params(**self.predictor.parameters)
        scores = cross_val_score(self.model, features, targets, cv=cv_fold)
        ave_score = sum(scores) / len(scores)

        self.model.fit(features, targets)

        return ave_score

    def predict(self, features):
        """ Use trained model to predict profitable change given the features.

            Args:
                features: List of floats.

            Returns:
                Decimal. Predicted profitable change.
        """
        features = np.asarray(features).reshape(1, -1)
        predicted = self.model.predict(features)
        return decimal.Decimal(float(predicted[0])).quantize(TWO_PLACES)  # predict returns a 1-element array
Example #15
# for the loss: 'epsilon_insensitive' is the L1 loss and 'squared_epsilon_insensitive' is the L2 loss
# for L1 we tune the epsilon value; for L2 we tune the C value.
# everything else is left at its default.
regr = LinearSVR(epsilon=0.0,
                 tol=0.0001,
                 C=1.0,
                 loss='squared_epsilon_insensitive',
                 fit_intercept=True,
                 intercept_scaling=1.0,
                 dual=True,
                 verbose=0,
                 random_state=None,
                 max_iter=1000)

# fit the model
regr.fit(X_train, y_train)

# get the prediction
prediction_svm_p = regr.predict(X_test)

# revert the prediction value
prediction_svm_p_ori = prediction_svm_p * (y.max() - y.min()) + y.min()
y_test_ori = np.array(y_test * (y.max() - y.min()) + y.min())

# get the score for this model
score = regr.score(X_test, y_test)
# calculate the mse value for the prediciton.
mse_svm_p = np.mean((prediction_svm_p_ori - y_test_ori)**2)
print("MSE with penalized SVM:", mse_svm_p)
# plot the figure to see the difference between prediction and y_test.
plt.plot(y_test_ori, label='y_test_ori')
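For contrast with the L2 setup above, a minimal sketch of the L1 variant, where epsilon is the knob to tune (the 'epsilon_insensitive' loss requires dual=True):

from sklearn.svm import LinearSVR
l1_svr = LinearSVR(loss='epsilon_insensitive', epsilon=0.5, dual=True)   # tune epsilon
l2_svr = LinearSVR(loss='squared_epsilon_insensitive', C=0.1, dual=True)  # tune C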
Example #16
class TextLearner(object):
    def __init__(self,data_path,model_path = "./",name = ""):
        self.name = name
        self.data_path = data_path
        self.model_path = model_path
        self.DesignMatrix = []
        self.TestMatrix = []
        self.X_train = []
        self.y_train = [] # not only train but general purpose too
        self.X_test = []
        self.y_test  = []
        self.y_pred = []
        self.vectorizer = None
        self.feature_names = None
        self.chi2 = None
        self.mlModel = None
        self.F = Filter()

    def __enter__(self):
        return self

    def __exit__(self, type, value, traceback):
        self.DesignMatrix = []
        self.TestMatrix = []
        self.X_train = []
        self.y_train = []
        self.X_test = []
        self.y_test  = []
        self.y_pred = []
        self.vectorizer = None
        self.feature_names = None
        self.chi2 = None
        self.mlModel = None
        self.F = None

    def addModelDetails(self,model_p,name = ""):
        self.name = name
        self.model_path = model_p


    def load_data(self,TrTe = 0):               #TrTe => 0-Train  1-Test # returns the dimensions of vectors
        with open( self.data_path, 'rb') as f:
            if TrTe == 0:
                self.DesignMatrix = pickle.load(f)
                return len(self.DesignMatrix[1])
            if TrTe == 1:
                self.TestMatrix = pickle.load(f)
                return len(self.TestMatrix[1])

    def clearOld(self):
        self.X_train = []
        self.y_train = []
        self.X_test = []
        self.y_test  = []
        self.y_pred = []
        self.vectorizer = None
        self.feature_names = None
        self.chi2 = None
        self.mlModel = None


    def process(self,text,default = 0):
        if default == 0:
            text = text.strip().lower().encode("utf-8")
        else:
            text = self.F.process(text)
        return text


    def loadXY(self,TrTe = 0,feature_index = 0,label_index = 1):     #TrTe => 0-Train  1-Test
        if TrTe == 0:
            for i in self.DesignMatrix:
                self.X_train.append(self.process(i[feature_index]))
                self.y_train.append(i[label_index])
            self.X_train = np.array(self.X_train)
            self.y_train = np.array(self.y_train)

        elif TrTe == 1:
            for i in self.TestMatrix:
                self.X_test.append(self.process(i[feature_index]))
                self.y_test.append(i[label_index])
            self.X_test = np.array(self.X_test)
            self.y_test = np.array(self.y_test)


    def featurizeXY(self,only_train = 1):      # Extracts Features
        sw = ['a', 'across', 'am', 'an', 'and', 'any', 'are', 'as', 'at', 'be', 'been', 'being', 'but', 'by', 'can', 'could', 'did', 'do', 'does', 'each', 'for', 'from', 'had', 'has', 'have', 'in', 'into', 'is', "isn't", 'it', "it'd", "it'll", "it's", 'its', 'of', 'on', 'or', 'that', "that's", 'thats', 'the', 'there', "there's", 'theres', 'these', 'this', 'those', 'to', 'under', 'until', 'up', 'were', 'will', 'with', 'would']
        self.vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,stop_words=sw)
        self.X_train = self.vectorizer.fit_transform(self.X_train)
        self.feature_names = self.vectorizer.get_feature_names()
        if only_train == 0:
            self.X_test = self.vectorizer.transform(self.X_test)


    def reduceDimension(self,only_train = 1, percent = 50):      # Reduce dimensions / self best of features
        n_samples, n_features = self.X_train.shape
        k = int(n_features*(percent/100))

        self.chi2 = SelectKBest(chi2, k=k)
        self.X_train = self.chi2.fit_transform(self.X_train, self.y_train)
        self.feature_names = [self.feature_names[i] for i in self.chi2.get_support(indices=True)]
        self.feature_names = np.asarray(self.feature_names)
        if only_train == 0:
            self.X_test = self.chi2.transform(self.X_test)


    def trainModel(self,Model = "default"):
        if Model == "default":
            self.mlModel = LinearSVR(loss='squared_epsilon_insensitive',dual=False, tol=1e-3)
        else:
            self.mlModel = Model
        self.mlModel.fit(self.X_train, self.y_train)


    def testModel(self,approx = 1):        # returns score ONLY
        self.y_pred = np.array(self.mlModel.predict(self.X_test))

        if approx == 1:
            ### To convert real valued results to binary for scoring
            temp = []
            for y in self.y_pred:
                if y > 0.0:
                    temp.append(1.0)
                else:
                    temp.append(-1.0)
            self.y_pred = temp

        return metrics.accuracy_score(self.y_test, self.y_pred)


    def getReport(self,save = 1, get_top_words = 0):       # returns report
        report = ""
        if get_top_words == 1:
            if hasattr(self.mlModel, 'coef_'):
                    report += "Dimensionality: " + str(self.mlModel.coef_.shape[1])
                    report += "\nDensity: " +  str(density(self.mlModel.coef_))

                    rank = np.argsort(self.mlModel.coef_[0])
                    top20 = rank[-20:]
                    bottom20 = rank[:20]
                    report += "\n\nTop 20 keywords: "
                    report += "\nPositive: " + (" ".join(self.feature_names[top20]))
                    report += "\nNegative: " + (" ".join(self.feature_names[bottom20]))

        score = metrics.accuracy_score(self.y_test, self.y_pred)
        report += "\n\nAccuracy: " + str(score)
        report += "\nClassification report: "
        report += "\n\n" + str(metrics.classification_report(self.y_test, self.y_pred,target_names=["Negative","Positive"]))
        report += "\nConfusion matrix: "
        report += "\n\n" + str(metrics.confusion_matrix(self.y_test, self.y_pred)) + "\n\n"

        if save == 1:
            with open(self.model_path + "report.txt", "w") as text_file:
                text_file.write(report)

        return report


    def crossVal(self,folds = 5, dim_red = 50,full_iter = 0, save = 1):        # returns report # Caution: resets train and test X,y
        skf = cross_validation.StratifiedKFold(self.y_train, n_folds = folds,shuffle=True)
        print(skf)
        master_report = ""

        X_copy = self.X_train
        y_copy = self.y_train

        for train_index, test_index in skf:
            self.X_train, self.X_test = X_copy[train_index], X_copy[test_index]
            self.y_train, self.y_test = y_copy[train_index], y_copy[test_index]
            self.featurizeXY(0)
            self.reduceDimension(0,dim_red)
            self.trainModel()
            self.testModel()
            master_report += self.getReport(save = 0,get_top_words = 0)
            if full_iter == 1:
                continue
            else:
                break

        if save == 1:
            with open(self.model_path + "master_report.txt", "w") as text_file:
                text_file.write(master_report)

        return master_report


    def save_obj(self,obj, name ):
        with open(self.model_path + name + '.pkl', 'wb') as f:
            pickle.dump(obj, f,  protocol=2)


    def saveModel(self):        # saves in model path
        self.save_obj(self.mlModel, self.name + "_model")
        self.save_obj(self.vectorizer, self.name + "_vectorizer")
        self.save_obj(self.chi2, self.name + "_feature_selector")


    def plot(self):
        '''
        beta (Just plotting the model) (Not working)
        '''

        h = .02  # step size in the mesh
        # create a mesh to plot in
        x_min, x_max = self.X_train[:, 0].min() - 1, self.X_train[:, 0].max() + 1
        y_min, y_max = self.X_train[:, 1].min() - 1, self.X_train[:, 1].max() + 1
        xx, yy = np.meshgrid(np.arange(x_min, x_max, h),np.arange(y_min, y_max, h))

        # Plot the decision boundary. For that, we will assign a color to each
        # point in the mesh [x_min, m_max]x[y_min, y_max].
        Z = self.mlModel.predict(np.c_[xx.ravel(), yy.ravel()])

        # Put the result into a color plot
        Z = Z.reshape(xx.shape)
        plt.contour(xx, yy, Z, cmap=plt.cm.Paired)

        plt.xlim(xx.min(), xx.max())
        plt.ylim(yy.min(), yy.max())
        plt.xticks(())
        plt.yticks(())
        plt.title(self.name)
        plt.savefig(self.model_path + 'plot.png')
Example #17
                                            coef0=1,
                                            C=5))))
    poly_kernel_svm_clf.fit(X, y)

    rbf_kernel_svm_clf = Pipeline(
        (("scaler", StandardScaler()), ("svm_clf",
                                        SVC(kernel="rbf", gamma=5, C=0.001))))
    rbf_kernel_svm_clf.fit(X, y)
    """
    LinearSVC比SVC快得多(ker nel =“linear”)),特别是如果训练集非常大或者它有很多特征。
    如果训练集不太大,则应该尝试高斯RBF内核;它在大多数情况下运作良好。
    
    """

if False:
    from sklearn.svm import LinearSVR
    """
    epsilon -> street width
    C large regularization small
    
    """
    svm_reg = LinearSVR(epsilon=1.5)
    svm_reg.fit(X, y)
    """
    SVR类是SVC类的回归等价物,LinearSVR类是LinearSVC类的回归等价物。 
    LinearSVR类与训练集的大小成线性关系(就像LinearSVC类一样),而当训练集变大时SVR类变得太慢(就像SVC类一样)
    """
    from sklearn.svm import SVR

    svm_poly_reg = SVR(kernel="poly", degree=2, C=100, epsilon=0.1)
    svm_poly_reg.fit(X, y)
Example #18
# tuning:
# increasing gamma makes the bell curves narrower, each instance less influencing - the decision boundary is wiggly
# decreasing gamma broadens the bell shape, the decision boundary is smoother
# if overfitting, reduce gamma; if underfitting, increase gamma
# string kernel for text data

########################### SVM REGRESSION #########################################################
## it reverses the objective of the SVM classifier:
## instead of fitting the widest possible street between two classes while limiting margin violations,
## it tries to fit as many instances as possible on the "street" while limiting margin violations (instances off the street)
# the width of the street is controlled by epsilon (margin)
# epsilon-insensitive regression: adding more training instances within the margin doesn't influence the model's predictions

from sklearn.svm import LinearSVR
svm_reg = LinearSVR(epsilon = 1.5)
svm_reg.fit(x,y)

# C is the regularization hyperparameter: decreasing C applies more regularization
# nonlinearity with polynomial

from sklearn.svm import SVR
svm_poly_reg = SVR(kernel = "poly", degree = 2, epsilon=0.1, C=100)
svm_poly_reg.fit(x,y)

############## SVM in detail ####################
# notation:

# b - the bias term
# w - the feature weights

# 1. linear SVM CLF
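A sketch of the decision rule this notation leads to: a linear SVM classifier predicts the class from the sign of w.x + b.

import numpy as np

def linear_svm_predict(w, b, X):
    # predict class 1 when w.x + b >= 0, else class 0
    return (X @ w + b >= 0).astype(int)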
Example #19
import numpy as np
import pandas as pd
from sklearn.svm import LinearSVR
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import SGDRegressor


#Importing the dataset

df = pd.read_csv("finalEncoded.csv")
y = df['price']
X = df.drop(columns=['price'])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)

svr = LinearSVR(epsilon=0.01, C=0.01, fit_intercept=True)

svr.fit(X_train, y_train)


def svr_results(y_test, X_test, fitted_svr_model):
    
    print("C: {}".format(fitted_svr_model.C))
    print("Epsilon: {}".format(fitted_svr_model.epsilon))
    
    print("Intercept: {:,.3f}".format(fitted_svr_model.intercept_[0]))
    print("Coefficient: {:,.3f}".format(fitted_svr_model.coef_[0]))
    
    mae = mean_absolute_error(y_test, fitted_svr_model.predict(X_test))
    print("MAE = ${:,.2f}".format(1000*mae))
    
    perc_within_eps = 100*np.sum(y_test - fitted_svr_model.predict(X_test) < 5) / len(y_test)
    print("Percentage within Epsilon = {:,.2f}%".format(perc_within_eps))
Example #20
from sklearn.svm import LinearSVR  # import the linear SVR class
from sklearn.datasets import load_boston  # import the Boston housing dataset loader
from pandas import DataFrame  # import DataFrame
boston = load_boston()  # load the Boston housing data
# build a DataFrame from the Boston housing data
df = DataFrame(boston.data, columns=boston.feature_names)
df.insert(0, 'target', boston.target)  # add the price as the target column
data_mean = df.mean()  # column means
data_std = df.std()  # column standard deviations
data_train = (df - data_mean) / data_std  # standardize the data
x_train = data_train[boston.feature_names].values  # feature data
y_train = data_train['target'].values  # target data
linearsvr = LinearSVR(C=0.1)  # create the LinearSVR() object
linearsvr.fit(x_train, y_train)  # train the model
# predict, then undo the standardization
x = ((df[boston.feature_names] - data_mean[boston.feature_names]) /
     data_std[boston.feature_names]).values
# add a column with the predicted prices
df[u'y_pred'] = linearsvr.predict(x) * data_std['target'] + data_mean['target']
print(df[['target', 'y_pred']].head())  # print actual vs. predicted prices
Example #21
File: 4.SVM.py Project: aoji-tjut/ML
plt.title("SVC_RBF")
boundary(svc_rbf, [4, 8.5, 1.75, 4.75])
plt.scatter(X[y == 0, 0], X[y == 0, 1])
plt.scatter(X[y == 1, 0], X[y == 1, 1])
plt.scatter(X[y == 2, 0], X[y == 2, 1])
plt.axis([4, 8.5, 1.75, 4.75])

# Regression ----------------------------------------------------------------------------------------------------------
# Linear regression
x = np.linspace(0, 100, 100)
X = x.reshape(-1, 1)
y = 2 * x + 5 + np.random.uniform(-10, 10, 100)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

svr_line = LinearSVR(max_iter=int(1e5))  # max_iter must be an int
svr_line.fit(X_train, y_train)
y_predict = svr_line.predict(X_test)
print("svr_line r2_score =", r2_score(y_test, y_predict))

plt.sca(ax3)
plt.title("LinearSVR")
plt.scatter(X_train, y_train, c='b')
plt.plot(X_test, y_predict, c='r')

# Nonlinear regression
x = np.linspace(-2, 2, 100)
X = x.reshape(-1, 1)
y = 0.5 * x ** 2 + 2 * x + 3 + np.random.normal(0, 0.5, 100)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

svr_rbf = SVR(kernel="rbf", gamma=0.5)
Example #22
def linearSVR(XTrain, YTrain, XTest, YTest, **options):
    
    import numpy as np
    from sklearn.svm import LinearSVR
#    from sklearn import svm
#    linear_svr = [];
    # Create a classifier: a support vector classifier
    '''
    if options.get('l1'):
        l1 = options.get('l1');
        #print 'running l1 svm classification\r' 
        linear_svr = svm.LinearSVR(C = l1, loss='squared_hinge', penalty='l1', dual=False)
    elif options.get('l2'):
        l2 = options.get('l2');        
        #print 'running l2 svm classification\r' 
        linear_svr = svm.LinearSVR(C = l2, loss='squared_hinge', penalty='l2', dual=True)
        '''        
    l = options.get('l');   
    linear_svr = LinearSVR(C=l, epsilon=0.0, dual = True, tol = 1e-9, fit_intercept = True)        
    linear_svr.fit(XTrain, np.squeeze(YTrain))    


    #%%
#    def perClassError_sr(Y,Yhat,eps0=10**-5):    
#        ce = np.mean(np.logical_and(abs(Y-Yhat) > eps0 , ~np.isnan(Yhat - Y)))*100
#        return ce
    def perClassError_sr(y,yhat):
        err = np.linalg.norm(yhat - y)**2
        maxerr = np.linalg.norm(y+1e-10)**2
#        err = (np.linalg.norm(yhat - y)**2)/len(y)
#        maxerr = np.linalg.norm(y)**2
#        ce = err
        ce = err/ maxerr    
    #    ce = np.linalg.norm(yhat - y)**2 / len(y)
        return ce
    
    perClassErrorTest = perClassError_sr(YTest, linear_svr.predict(XTest));
    perClassErrorTrain = perClassError_sr(YTrain, linear_svr.predict(XTrain));
    
    
    #%%
    class summaryClass:
        perClassErrorTrain = [];
        perClassErrorTest = [];
        model = [];
        XTest = []
        XTrain = []
        YTest = []
        YTrain = []
        
    summary = summaryClass();
    summary.perClassErrorTrain = perClassErrorTrain;
    summary.perClassErrorTest = perClassErrorTest;
    summary.model = linear_svr;
    summary.XTest = XTest
    summary.XTrain = XTrain
    summary.YTest = YTest
    summary.YTrain = YTrain
    
    return summary
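A sketch of invoking the function above; the regularization strength is passed through the 'l' keyword (synthetic arrays for illustration):

import numpy as np
s = linearSVR(np.random.rand(80, 10), np.random.rand(80),
              np.random.rand(20, 10), np.random.rand(20), l=1.0)
print(s.perClassErrorTest)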

    print "----------- Fold %d -----------------------" %i
    print "--------------------------------------------"
    
    val_id = fold_ids.iloc[:, i].dropna()
    idx = train["Id"].isin(list(val_id))
    
    trainingSet = train[~idx]
    validationSet = train[idx]
    
    tr_X = np.matrix(trainingSet[feature_names])
    tr_Y = np.array(trainingSet["Response"])
    val_X = np.matrix(validationSet[feature_names])
    val_Y = np.array(validationSet["Response"])
    
    regm = LinearSVR(C = 0.06, epsilon = 0.45, tol = 1e-5,
                     dual = True, verbose = True, random_state = 133)
                     
    regm.fit(tr_X, tr_Y)    
    preds = regm.predict(val_X)
    
    df = pd.DataFrame(dict({"Id" : validationSet["Id"], "ground_truth" : validationSet["Response"], 
                            "linsvr_preds" : preds}))
    
    linsvr_val = linsvr_val.append(df, ignore_index = True)
    
    tpreds = regm.predict(test_X)
    cname = "Fold" + `i`
    linsvr_test[cname] = tpreds
    
linsvr_val.to_csv("ensemble2/linsvr_val.csv")
linsvr_test.to_csv("ensemble2/linsvr_test.csv")
Example #24
train_data.drop(['嗜碱细胞%'], axis=1, inplace=True)
test_dataA.drop(['嗜碱细胞%'], axis=1, inplace=True)
# fill missing values in each column with the column mean
for i in train_data.columns:
    train_data[i].fillna(train_data[i].mean(), inplace=True)
for i in test_dataA:
    test_dataA[i].fillna(test_dataA[i].mean(), inplace=True)
train_data_y = train_data['血糖']
train_data.drop(['血糖'], axis=1, inplace=True)
# standardize the features
scaler = StandardScaler()
train_data = scaler.fit_transform(train_data.astype(float))
test_dataA = scaler.transform(test_dataA.astype(float))
# build the model
lin_svr = LinearSVR(random_state=42, max_iter=5000)
lin_svr.fit(train_data, train_data_y)
test_features_labers = lin_svr.predict(test_dataA)
# evaluate the model (test_labels must hold the true targets for test_dataA)
mse = mean_squared_error(test_labels, test_features_labers)
print("MSE: %f" % mse)
print("RMSE: %f" % np.sqrt(mse))
# hyperparameter search with RandomizedSearchCV
param_distributions = {
    'gamma': reciprocal(0.001, 0.1),
    'C': uniform(1, 10)
}
rnd_search_cv = RandomizedSearchCV(SVR(),
                                   param_distributions,
                                   n_iter=10,
                                   verbose=2,
                                   cv=3,
Example #25
    data,target = [],[]
    for row in csv.reader(data_file):
        data += [[row[0],row[4],row[6],row[10]]]
        target += [row[9]]

data,target = Lin_clean_data(data[1:],target[1:],2)

point = 2000
X_train = data[:point-1]
X_test = data[point:point+int(point*0.2)]
y_train = target[:point-1]
y_test = target[point:point+int(point*0.2)]


svr = LinearSVR(C=0.1)
svr_model = svr.fit(X_train,y_train)
lin = svr.predict(X_train)
lin_test = svr.predict(X_test)

lin,lin_test = data_normalize(y_train,y_test,lin,lin_test)

print("Train score : ",score(y_train,lin))
print("Train average error : ",sum(abs(y_train-lin)) / float(len(y_train)))

print("Fit score : ",score(y_test,lin_test))
print("Fit average error : ",sum(abs(y_test-lin_test)) / float(len(y_test)))

figure1 = plt.figure(1,figsize=[20,10])
draw_pic(range(len(X_train)),range(len(X_test)),lin,lin_test,y_train,y_test,label='lin',figure=figure1)
figure1.savefig("C:/Users/sean/Desktop/SVR_DATA/linSVR.png",dpi=300,format="png")
plt.close(1)
Example #26
class AllRegressionModels:
    """
    Wrapper class around all supported regression models: LinearRegression, RandomForest, SVR, NuSVR, LinearSVR, and
    XGBRegressor.
    AllRegressionModels runs every available regression algorithm on the given dataset and outputs the coefficient of
    determination and execution time of each successful model when all_regression_models() is run.
    """
    def __init__(self, attributes=None, labels=None, test_size=0.25, verbose=False):
        """
        Initializes an AllRegressionModels object.

        The following parameters are needed to use an AllRegressionModels object:

            – attributes: a numpy array of the desired independent variables (Default is None)
            – labels: a numpy array of the desired dependent variables (Default is None)
            – test_size: the proportion of the dataset to be used for testing the model;
            the proportion of the dataset to be used for training will be the complement of test_size (Default is 0.25)
            – verbose: specifies whether or not to output any and all logging during model training (Default is False)

            Note: These are the only parameters allowed. All other parameters for each model will use their default
            values. For more granular control, please instantiate each model individually.

        The following instance data is found after running all_regression_models() successfully:

            – linear_regression: a reference to the LinearRegression model
            – random_forest: a reference to the RandomForest model
            – SVR: a reference to the SVR model
            – nu_SVR: a reference to the NuSVR model
            – linear_SVR: a reference to the LinearSVR model
            – XGB_regressor: a reference to the XGBRegressor model
        
        After running all_regression_models(), the coefficient of determination and execution time for each model that
        ran successfully will be displayed in tabular form. Any models that failed to run will be listed.
        """
        self.attributes = attributes
        self.labels = labels
        self.test_size = test_size
        self.verbose = verbose

        self.linear_regression = LinearRegression()
        self.random_forest = RandomForestRegressor(verbose=self.verbose)
        self.SVR = SVR(verbose=self.verbose)
        self.nu_SVR = NuSVR(verbose=self.verbose)
        self.linear_SVR = LinearSVR(verbose=self.verbose)
        self.XGB_regressor = XGBRegressor(verbosity=int(self.verbose))

        self._regression_models = {"Model": ["R2 Score", "Time"]}
        self._failures = []

    # Accessor methods

    def get_attributes(self):
        """
        Accessor method for attributes.

        If an AllRegressionModels object is initialized without specifying attributes, attributes will be None.
        all_regression_models() cannot be called until attributes is a populated numpy array of independent variables;
        call set_attributes(new_attributes) to fix this.
        """
        return self.attributes

    def get_labels(self):
        """
        Accessor method for labels.

        If an AllRegressionModels object is initialized without specifying labels, labels will be None.
        all_regression_models() cannot be called until labels is a populated numpy array of dependent variables;
        call set_labels(new_labels) to fix this.
        """
        return self.labels

    def get_test_size(self):
        """
        Accessor method for test_size.

        Should return a number or None.
        """
        return self.test_size

    def get_verbose(self):
        """
        Accessor method for verbose.

        Will default to False if not set by the user.
        """
        return self.verbose

    def get_all_regression_models(self):
        """
        Accessor method that returns a list of all models.

        All models in the list are unfitted until all_regression_models() has been called.
        """
        return [self.linear_regression, self.random_forest, self.SVR, self.nu_SVR, self.linear_SVR, self.XGB_regressor]

    def get_linear_regression(self):
        """
        Accessor method for linear_regression.

        Will return an unfitted model if all_regression_models() hasn't been called yet.
        """
        return self.linear_regression

    def get_random_forest(self):
        """
        Accessor method for random_forest.

        Will return an unfitted model if all_regression_models() hasn't been called yet.
        """
        return self.random_forest

    def get_SVR(self):
        """
        Accessor method for SVR.

        Will return an unfitted model if all_regression_models() hasn't been called yet.
        """
        return self.SVR

    def get_nu_SVR(self):
        """
        Accessor method for nu_SVR.

        Will return an unfitted model if all_regression_models() hasn't been called yet.
        """
        return self.nu_SVR

    def get_linear_SVR(self):
        """
        Accessor method for linear_SVR.

        Will return an unfitted model if all_regression_models() hasn't been called yet.
        """
        return self.linear_SVR

    def get_XGB_regressor(self):
        """
        Accessor method for XGB_regressor.

        Will return an unfitted model if all_regression_models() hasn't been called yet.
        """
        return self.XGB_regressor

    # Modifier methods

    def set_attributes(self, new_attributes=None):
        """
        Modifier method for attributes.

        Input should be a numpy array of independent variables. Defaults to None.
        """
        self.attributes = new_attributes

    def set_labels(self, new_labels=None):
        """
        Modifier method for labels.

        Input should be a numpy array of dependent variables. Defaults to None.
        """
        self.labels = new_labels

    def set_test_size(self, new_test_size=0.25):
        """
        Modifier method for test_size.

        Input should be a number or None. Defaults to 0.25.
        """
        self.test_size = new_test_size

    def set_verbose(self, new_verbose=False):
        """
        Modifier method for verbose.

        Input should be a truthy/falsy value. Defaults to False.
        """
        self.verbose = new_verbose

    # Regression functionality

    def all_regression_models(self):
        """
        Driver method for running all regression models with given attributes and labels.
        all_regression_models() first trains the models and determines their coefficients of determination and
        execution time via _all_regression_models_runner(). Then, all_regression_models() calls _print_results() to
        format and print each successful model's measurements, while also listing any failed models.

        If verbose is True, all verbose logging for each model will be enabled.
        If verbose is False, all logging to stdout and stderr will be suppressed.
        """

        # Call helper method for running all regression models; suppress output, if needed
        if not self.verbose:
            suppress_output = io.StringIO()
            with redirect_stderr(suppress_output), redirect_stdout(suppress_output):
                self._all_regression_models_runner()
        else:
            self._all_regression_models_runner()
        
        # Print results
        self._print_results()
        
    # Helper methods

    def _all_regression_models_runner(self):
        """
        Helper method that runs all models using the given dataset and all default parameters.
        After running all models, each model is determined to be either a success or failure, and relevant data
        (R2 score, execution time) is recorded.

        _all_regression_models_runner() may only be called by all_regression_models().
        """

        # Split dataset
        dataset_X_train, dataset_X_test, dataset_y_train, dataset_y_test =\
            train_test_split(self.attributes, self.labels, test_size=self.test_size)

        # Run and time all models; identify each as success or failure
        try:
            start_time = time.time()
            self.linear_regression.fit(dataset_X_train, dataset_y_train)
            end_time = time.time()
            self._regression_models["LinearRegression"] =\
                [self.linear_regression.score(dataset_X_test, dataset_y_test), end_time - start_time]
        except:
            self._failures.append("LinearRegression")

        try:
            start_time = time.time()
            self.random_forest.fit(dataset_X_train, dataset_y_train)
            end_time = time.time()
            self._regression_models["RandomForest"] =\
                [self.random_forest.score(dataset_X_test, dataset_y_test), end_time - start_time]
        except:
            self._failures.append("RandomForest")

        try:        
            start_time = time.time()
            self.SVR.fit(dataset_X_train, dataset_y_train)
            end_time = time.time()
            self._regression_models["SVR"] = [self.SVR.score(dataset_X_test, dataset_y_test), end_time - start_time]
        except:
            self._failures.append("SVR")
        
        try:
            start_time = time.time()
            self.nu_SVR.fit(dataset_X_train, dataset_y_train)
            end_time = time.time()
            self._regression_models["NuSVR"] = [self.nu_SVR.score(dataset_X_test, dataset_y_test), end_time - start_time]
        except:
            self._failures.append("NuSVR")

        try:
            start_time = time.time()
            self.linear_SVR.fit(dataset_X_train, dataset_y_train)
            end_time = time.time()
            self._regression_models["LinearSVR"] =\
                [self.linear_SVR.score(dataset_X_test, dataset_y_test), end_time - start_time]
        except:
            self._failures.append("LinearSVR")

        try:
            start_time = time.time()
            self.XGB_regressor.fit(dataset_X_train, dataset_y_train)
            end_time = time.time()
            self._regression_models["XGBRegressor"] =\
                [self.XGB_regressor.score(dataset_X_test, dataset_y_test), end_time - start_time]
        except:
            self._failures.append("XGBRegressor")
        
    def _print_results(self):
        """
        Helper method that prints results of _all_regression_models_runner() in tabular form.

        _print_results() may only be called by all_regression_models() after all models have attempted to run.
        """

        # Print models that didn't fail
        print("\nResults:\n")

        for model, data in self._regression_models.items():
            print("{:<20} {:<20} {:<20}".format(model, data[0], data[1]))

        print()

        # Print failures, if any
        if len(self._failures) > 0:
            print("The following models failed to run:\n")

            for entry in self._failures:
                print(entry)
        
        print()
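A hypothetical end-to-end run on synthetic data:

import numpy as np
X = np.random.rand(200, 5)
y = X @ np.arange(1.0, 6.0) + 0.1 * np.random.randn(200)
models = AllRegressionModels(attributes=X, labels=y)
models.all_regression_models()  # prints the R2-score/time table and lists any failures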
Example #27
import numpy as np
import pandas as pd
from sklearn.svm import LinearSVR
from sklearn.datasets import make_regression
import matplotlib.pyplot as plt
import random
from matplotlib.pyplot import figure
test_data = pd.read_csv("Sample.csv")
reference = pd.read_csv("Reference.csv")
train = reference.drop("MF_name",1)
test_data = test_data.drop("gene",1)

score_adj = []
for o in range(len(test_data.columns)):
    test = test_data.loc[:,test_data.columns[o]]
    im_name = train.columns
    svr = LinearSVR(random_state=0)
    model = svr.fit(train, test)
    score = model.coef_
    score[np.where(score<0)] = 0 
    score_adj.append((score/sum(score)))
score_adj = pd.DataFrame(score_adj)
score_adj.columns = im_name
score_adj.plot(kind='bar', stacked=True,legend=False)
plt.legend(loc='center left', bbox_to_anchor=(1.0, 0.5))
plt.suptitle("Flow&Estimate")
plt.rcParams['figure.figsize'] = (6.69,8.86)
plt.rcParams['figure.dpi'] = 300
name = "bar.pdf"
plt.savefig(name,bbox_inches="tight" )
plt.close()
plt.boxplot(score_adj.T,patch_artist = True)
plt.suptitle("Flow&Estimate")
Example #28
def train_linear(x, y):
    model_linear = LinearSVR(C=1, tol=1e-5, max_iter=1000)
    model_linear.fit(x, y)
    return model_linear
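A quick sketch with synthetic linear data (illustrative only):

import numpy as np
x = np.random.rand(100, 4)
y = x @ np.array([1.0, -2.0, 0.5, 3.0]) + 0.1 * np.random.randn(100)
model = train_linear(x, y)
print(model.coef_)  # should land near [1, -2, 0.5, 3]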
Example #29
print('')

lsvc = LinearSVC()
print('LinearSVC config:')
print(lsvc.get_params())
lsvc.fit(smr_train.feature_matrix, smr_train.labels)
lsvc_score_train = lsvc.score(smr_train.feature_matrix, smr_train.labels)
print('LinearSVC precision train: {}'.format(lsvc_score_train))
lsvc_score_test = lsvc.score(smr_test.feature_matrix, smr_test.labels)
print('LinearSVC precision test: {}'.format(lsvc_score_test))
print('')

lsvr = LinearSVR()
print('LinearSVR config:')
print(lsvr.get_params())  # was svc.get_params(), a copy-paste slip
lsvr.fit(smr_train.feature_matrix, smr_train.labels)
lsvr_score_train = lsvr.score(smr_train.feature_matrix, smr_train.labels)  # was svc.score
print('LinearSVR precision train: {}'.format(lsvr_score_train))
lsvr_score_test = lsvr.score(smr_test.feature_matrix, smr_test.labels)
print('LinearSVR precision test: {}'.format(lsvr_score_test))
print('')

nusvc = NuSVC()
print('NuSVC config:')
print(nusvc.get_params())
nusvc.fit(smr_train.feature_matrix, smr_train.labels)
nusvc_score_train = nusvc.score(smr_train.feature_matrix, smr_train.labels)
print('NuSVC precision train: {}'.format(nusvc_score_train))
nusvc_score_test = nusvc.score(smr_test.feature_matrix, smr_test.labels)
print('NuSVC precision test: {}'.format(nusvc_score_test))
print('')
Example #30
#dictvalues ={}
#for coldata in data_new.columns:
#    dictvalues[coldata] = datatest[coldata].mean()

#print('dictvalues values')
#print(dictvalues)

##print('sorted output')
#from operator import itemgetter
#print(sorted(dictvalues.items(), key=itemgetter(1),reverse=True))

#regr = linear_model.Lasso(alpha=0.1)
regr = LinearSVR(C=1.0, epsilon=0.2)
#regr = RandomForestRegressor()
#regr = AdaBoostRegressor(n_estimators=80)
regr.fit(data_new[features], y)

predictions = regr.predict(datatest)
print('predictions')
print(predictions)

datatest_result = pd.read_csv('test.csv',header=0)
datatest_result['loss'] = np.exp(predictions)
header = ["id","loss"]
datatest_result.to_csv("Results_AllState_SVR_81.csv", sep=',', columns = header,index=False)

Example #31
for col in data.columns[:-1]:
    print(data[col].unique())

    ## KneighborsRegressor
    from sklearn.neighbors import KNeighborsRegressor
    knreg = KNeighborsRegressor(n_neighbors=5)
    knreg.fit(X_train, y_train)
    score_list.append(knreg.score(X_test, y_test))

    ##  Support Vector Regressor
    from sklearn.svm import SVR
    svm_reg = SVR(kernel='poly', gamma='auto', degree=2, C=5, epsilon=0.1)
    svm_reg.fit(X_train, y_train)
    score_list.append(svm_reg.score(X_test, y_test))

    ## linearSVR
    from sklearn.svm import LinearSVR
    sv_reg = LinearSVR(max_iter=1000)
    sv_reg.fit(X_train, y_train)
    score_list.append(sv_reg.score(X_test, y_test))

    ## random forest
    from sklearn.ensemble import RandomForestRegressor
    rf_reg = RandomForestRegressor(max_depth=5)
    rf_reg.fit(X_train, y_train)
    score_list.append(rf_reg.score(X_test, y_test))
    '''
    ## LightGBM
    import lightgbm as lgb
    lgb_reg=lgb.LGBMRegressor(objective='regression')
    lgb_reg.fit(X_train, y_train)
    score_list.append(lgb_reg.score(X_test, y_test))
    '''
    '''
Example #32
def default_datasets(carrier, id_airport):
    # # **Predicting flight delays**

    # In this notebook, we developed the model aimed at predicting flight delays at take-off.

    # During the EDA, we intended to create good quality figures

    # This notebook is composed of three parts:
    # Cleaning
    #   *  Date and Times
    #   *  Missing Values

    # Exploration
    #   * Graphs
    #   * Impact of Departure Vs Arrival Delays

    # Modeling
    # The model is developed for one airport and one airline
    #   * Linear
    #   * Ridge
    #   * Random Forest
    #   * Neural Networks
    #   * SVM

    # In[2]:

    import datetime, warnings, scipy
    import pandas as pd
    import numpy as np
    import seaborn as sns
    import matplotlib as mpl
    import matplotlib.pyplot as plt
    import matplotlib.patches as patches
    from matplotlib.patches import ConnectionPatch
    from collections import OrderedDict
    from matplotlib.gridspec import GridSpec
    from sklearn import metrics, linear_model
    from sklearn.preprocessing import PolynomialFeatures, StandardScaler
    from sklearn.preprocessing import LabelEncoder, OneHotEncoder
    from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict
    from scipy.optimize import curve_fit
    from sklearn.metrics import r2_score
    from random import sample
    import matplotlib.patches as mpatches
    from sklearn.linear_model import Ridge
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.metrics import r2_score
    from scipy.stats import spearmanr, pearsonr
    from sklearn.svm import SVR
    plt.rcParams["patch.force_edgecolor"] = True
    plt.style.use('fivethirtyeight')
    mpl.rc('patch', edgecolor='dimgray', linewidth=1)
    from IPython.core.interactiveshell import InteractiveShell
    InteractiveShell.ast_node_interactivity = "last_expr"
    pd.options.display.max_columns = 50
    #get_ipython().magic('matplotlib inline')
    warnings.filterwarnings("ignore")

    # In[2]:

    df = pd.read_csv(
        '/Users/sarveshprattipati/Downloads/flight-delays/flights.csv',
        low_memory=False)
    print('Dataframe dimensions:', df.shape)

    airports = pd.read_csv(
        "/Users/sarveshprattipati/Downloads/flight-delays/airports.csv")

    airlines_names = pd.read_csv(
        '/Users/sarveshprattipati/Downloads/flight-delays/airlines.csv')
    airlines_names

    abbr_companies = airlines_names.set_index('IATA_CODE')['AIRLINE'].to_dict()

    # NOTE: these assignments override the function's carrier/id_airport arguments
    carrier = 'AA'
    id_airport = 'DFW'

    # %%

    # # 1. Cleaning

    # # 1.1 Dates and times
    #
    # **YEAR, MONTH, DAY**, is merged into date column

    df['DATE'] = pd.to_datetime(df[['YEAR', 'MONTH', 'DAY']])

    # Moreover, in the **SCHEDULED_DEPARTURE** variable, the hour of the take-off is coded as a float where the first two digits indicate the hour and the last two the minutes. This format is not convenient, so I convert it and then merge the take-off hour with the flight date. To proceed with these transformations, I define a few functions:

    # Function that converts the 'HHMM' string to datetime.time
    def format_heure(chaine):
        if pd.isnull(chaine):
            return np.nan
        else:
            if chaine == 2400: chaine = 0
            chaine = "{0:04d}".format(int(chaine))
            heure = datetime.time(int(chaine[0:2]), int(chaine[2:4]))
            return heure
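
    # A quick sanity check of the conversion (illustrative values):
    assert format_heure(1435) == datetime.time(14, 35)
    assert format_heure(2400) == datetime.time(0, 0)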

    # Function that combines a date and time to produce a datetime.datetime
    def combine_date_heure(x):
        if pd.isnull(x[0]) or pd.isnull(x[1]):
            return np.nan
        else:
            return datetime.datetime.combine(x[0], x[1])

    # Function that combine two columns of the dataframe to create a datetime format
    def create_flight_time(df, col):
        liste = []
        for index, cols in df[['DATE', col]].iterrows():
            if pd.isnull(cols[1]):
                liste.append(np.nan)
            elif float(cols[1]) == 2400:
                cols[0] += datetime.timedelta(days=1)
                cols[1] = datetime.time(0, 0)
                liste.append(combine_date_heure(cols))
            else:
                cols[1] = format_heure(cols[1])
                liste.append(combine_date_heure(cols))
        return pd.Series(liste)

    df['SCHEDULED_DEPARTURE'] = create_flight_time(df, 'SCHEDULED_DEPARTURE')
    df['DEPARTURE_TIME'] = df['DEPARTURE_TIME'].apply(format_heure)
    df['SCHEDULED_ARRIVAL'] = df['SCHEDULED_ARRIVAL'].apply(format_heure)
    df['ARRIVAL_TIME'] = df['ARRIVAL_TIME'].apply(format_heure)
    # __________________________________________________________________________
    # df.loc[:5, ['SCHEDULED_DEPARTURE', 'SCHEDULED_ARRIVAL', 'DEPARTURE_TIME',
    #             'ARRIVAL_TIME', 'DEPARTURE_DELAY', 'ARRIVAL_DELAY']]

    # The content of the **DEPARTURE_TIME** and **ARRIVAL_TIME** variables can be a bit misleading:
    # for the first entry of the dataframe, the scheduled departure is at 0h05 on the 1st of January.
    # ### 1.2 Filling factor
    #
    # Finally, the data frame is cleaned and a few columns are dropped
    variables_to_remove = [
        'TAXI_OUT', 'TAXI_IN', 'WHEELS_ON', 'WHEELS_OFF', 'YEAR', 'MONTH',
        'DAY', 'DAY_OF_WEEK', 'DATE', 'AIR_SYSTEM_DELAY', 'SECURITY_DELAY',
        'AIRLINE_DELAY', 'LATE_AIRCRAFT_DELAY', 'WEATHER_DELAY', 'DIVERTED',
        'CANCELLED', 'CANCELLATION_REASON', 'FLIGHT_NUMBER', 'TAIL_NUMBER',
        'AIR_TIME'
    ]
    df.drop(variables_to_remove, axis=1, inplace=True)
    df = df[[
        'AIRLINE', 'ORIGIN_AIRPORT', 'DESTINATION_AIRPORT',
        'SCHEDULED_DEPARTURE', 'DEPARTURE_TIME', 'DEPARTURE_DELAY',
        'SCHEDULED_ARRIVAL', 'ARRIVAL_TIME', 'ARRIVAL_DELAY', 'SCHEDULED_TIME',
        'ELAPSED_TIME'
    ]]
    # df[:5]

    missing_df = df.isnull().sum(axis=0).reset_index()
    missing_df.columns = ['variable', 'missing values']
    missing_df['filling factor (%)'] = (
        df.shape[0] - missing_df['missing values']) / df.shape[0] * 100
    missing_df.sort_values('filling factor (%)').reset_index(drop=True)

    # The filling factor is quite good (> 97%), so dropping the rows with NAs is a reasonable option
    df.dropna(inplace=True)

    # %%
    # # 2. Exploration
    # # 2.1 Basic statistical description of airlines

    # Function computing statistical parameters from a groupby object:
    def get_stats(group):
        return {
            'min': group.min(),
            'max': group.max(),
            'count': group.count(),
            'mean': group.mean()
        }

    global_stats = df['DEPARTURE_DELAY'].groupby(
        df['AIRLINE']).apply(get_stats).unstack()
    global_stats = global_stats.sort_values('count')
    global_stats

    # In[15]:

    # # 2.1 Graphs

    # Pie chart of the mean departure delay per airline

    font = {'family': 'normal', 'weight': 'bold', 'size': 15}
    mpl.rc('font', **font)

    # __________________________________________________________________
    # I extract a subset of columns and redefine the airlines labeling
    df2 = df.loc[:, ['AIRLINE', 'DEPARTURE_DELAY']]
    df2['AIRLINE'] = df2['AIRLINE'].replace(abbr_companies)
    # ________________________________________________________________________
    colors = [
        'royalblue', 'grey', 'wheat', 'c', 'firebrick', 'seagreen',
        'lightskyblue', 'lightcoral', 'yellowgreen', 'gold', 'tomato',
        'violet', 'aquamarine', 'chartreuse'
    ]
    # ___________________________________
    fig = plt.figure(1, figsize=(16, 15))
    gs = GridSpec(2, 1)
    ax1 = fig.add_subplot(gs[0, 0])
    ax2 = fig.add_subplot(gs[1, 0])
    labels = [s for s in global_stats.index]
    # ----------------------------------------
    # Pie chart for mean delay at departure
    # ----------------------------------------
    sizes = global_stats['mean'].values
    sizes = [max(s, 0) for s in sizes]
    explode = [
        0.0 if sizes[i] < 20000 else 0.01 for i in range(len(abbr_companies))
    ]
    patches, texts, autotexts = ax1.pie(
        sizes,
        explode=explode,
        labels=labels,
        colors=colors,
        shadow=False,
        startangle=0,
        autopct=lambda p: '{:.0f}'.format(p * sum(sizes) / 100))
    for i in range(len(abbr_companies)):
        texts[i].set_fontsize(14)
    ax1.axis('equal')
    ax1.set_title('Mean delay at origin',
                  bbox={
                      'facecolor': 'midnightblue',
                      'pad': 5
                  },
                  color='w',
                  fontsize=18)
    # ------------------------------------------------------
    # striplot with all the values for the delays
    # ___________________________________________________________________
    # Defining the colors for correspondance with the pie charts
    colors = [
        'firebrick', 'gold', 'lightcoral', 'aquamarine', 'c', 'yellowgreen',
        'grey', 'seagreen', 'tomato', 'violet', 'wheat', 'chartreuse',
        'lightskyblue', 'royalblue'
    ]
    # ___________________________________________________________________
    ax2 = sns.stripplot(y="AIRLINE",
                        x="DEPARTURE_DELAY",
                        size=4,
                        palette=colors,
                        data=df2,
                        linewidth=0.5,
                        jitter=True)
    plt.setp(ax2.get_xticklabels(), fontsize=14)
    plt.setp(ax2.get_yticklabels(), fontsize=14)
    ax2.set_xticklabels([
        '{:2.0f}h{:2.0f}m'.format(*[int(y) for y in divmod(x, 60)])
        for x in ax2.get_xticks()
    ])
    plt.xlabel('Departure delay',
               fontsize=18,
               bbox={
                   'facecolor': 'midnightblue',
                   'pad': 5
               },
               color='w',
               labelpad=20)
    ax2.yaxis.label.set_visible(False)
    # ________________________
    plt.tight_layout(w_pad=3)

    # If we exclude Hawaiian Airlines and Alaska Airlines, which have low mean delays, the mean delay would be around 11 ± 7 minutes
    # The second graph shows that even with a mean delay of 11 minutes, some flights can still be delayed by several hours

    # In[16]:

    # # 2.1 Graphs

    # Function defining how delays are grouped: 0 = on time (t <= 5 min),
    # 1 = small delay (5 < t <= 45 min), 2 = large delay (t > 45 min)
    delay_type = lambda x: ((0, 1)[x > 5], 2)[x > 45]
    df['DELAY_LEVEL'] = df['DEPARTURE_DELAY'].apply(delay_type)
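    # Illustrative checks of the mapping (values chosen arbitrarily):
    assert delay_type(3) == 0 and delay_type(20) == 1 and delay_type(90) == 2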

    fig = plt.figure(1, figsize=(10, 7))
    ax = sns.countplot(y="AIRLINE", hue='DELAY_LEVEL', data=df)

    # We replace the abbreviations by the full names of the companies and set the labels
    labels = [abbr_companies[item.get_text()] for item in ax.get_yticklabels()]
    ax.set_yticklabels(labels)
    plt.setp(ax.get_xticklabels(), fontsize=12, weight='normal', rotation=0)
    plt.setp(ax.get_yticklabels(), fontsize=12, weight='bold', rotation=0)
    ax.yaxis.label.set_visible(False)
    plt.xlabel('Flight count', fontsize=16, weight='bold', labelpad=10)

    # Set the legend
    L = plt.legend()
    L.get_texts()[0].set_text('on time (t < 5 min)')
    L.get_texts()[1].set_text('small delay (5 < t < 45 min)')
    L.get_texts()[2].set_text('large delay (t > 45 min)')
    plt.show()

    # %%

    # # 2.2 Impact of Departure Vs Arrival Delays

    mpl.rcParams.update(mpl.rcParamsDefault)
    mpl.rcParams['hatch.linewidth'] = 2.0

    fig = plt.figure(1, figsize=(11, 6))
    ax = sns.barplot(x="DEPARTURE_DELAY",
                     y="AIRLINE",
                     data=df,
                     color="lightskyblue",
                     ci=None)
    ax = sns.barplot(x="ARRIVAL_DELAY",
                     y="AIRLINE",
                     data=df,
                     color="r",
                     hatch='///',
                     alpha=0.0,
                     ci=None)
    labels = [abbr_companies[item.get_text()] for item in ax.get_yticklabels()]
    ax.set_yticklabels(labels)
    ax.yaxis.label.set_visible(False)
    plt.xlabel('Mean delay [min] (@departure: blue, @arrival: hatch lines)',
               fontsize=14,
               weight='bold',
               labelpad=10)

    # This figure shows that arrival delays are lower than departure delays:
    # part of the departure delay can be made up in the air.

    # So for this project we focus on estimating departure delays.

    # %%

    # ### 2.2 Visualization of delays at origin airports

    airport_mean_delays = pd.DataFrame(pd.Series(
        df['ORIGIN_AIRPORT'].unique()))
    airport_mean_delays.set_index(0, drop=True, inplace=True)

    for carrier in abbr_companies.keys():
        df1 = df[df['AIRLINE'] == carrier]
        test = df1['DEPARTURE_DELAY'].groupby(
            df['ORIGIN_AIRPORT']).apply(get_stats).unstack()
        airport_mean_delays[carrier] = test.loc[:, 'mean']

    temp_airports = airports
    identify_airport = temp_airports.set_index('IATA_CODE')['CITY'].to_dict()

    sns.set(context="paper")
    fig = plt.figure(1, figsize=(8, 8))

    ax = fig.add_subplot(1, 2, 1)
    subset = airport_mean_delays.iloc[:50, :].rename(columns=abbr_companies)
    subset = subset.rename(index=identify_airport)
    mask = subset.isnull()
    sns.heatmap(subset,
                linewidths=0.01,
                cmap="Accent",
                mask=mask,
                vmin=0,
                vmax=35)
    plt.setp(ax.get_xticklabels(), fontsize=10, rotation=85)
    ax.yaxis.label.set_visible(False)

    ax = fig.add_subplot(1, 2, 2)
    subset = airport_mean_delays.iloc[50:100, :].rename(columns=abbr_companies)
    subset = subset.rename(index=identify_airport)
    fig.text(0.5,
             1.02,
             "Delays: impact of the origin airport",
             ha='center',
             fontsize=18)
    mask = subset.isnull()
    sns.heatmap(subset,
                linewidths=0.01,
                cmap="Accent",
                mask=mask,
                vmin=0,
                vmax=35)
    plt.setp(ax.get_xticklabels(), fontsize=10, rotation=85)
    ax.yaxis.label.set_visible(False)

    plt.tight_layout()

    # From the above graph, we deduce:
    #   * American Eagle has large delays
    #   * Delta Air Lines has delays of less than 5 minutes
    #   * a few airports favour late departures, like Denver and Chicago

    # In[32]:

    # Common class for graphs
    class Figure_style():
        # _________________________________________________________________
        def __init__(self, size_x=11, size_y=5, nrows=1, ncols=1):
            sns.set_style("white")
            sns.set_context("notebook",
                            font_scale=1.2,
                            rc={"lines.linewidth": 2.5})
            self.fig, axs = plt.subplots(nrows=nrows,
                                         ncols=ncols,
                                         figsize=(
                                             size_x,
                                             size_y,
                                         ))
            # ________________________________
            # convert self.axs to 2D array
            if nrows == 1:
                self.axs = np.reshape(axs, (1, -1))
            elif ncols == 1:
                self.axs = np.reshape(axs, (-1, 1))
            else:
                self.axs = axs

        # _____________________________
        def pos_update(self, ix, iy):
            self.ix, self.iy = ix, iy

        # _______________
        def style(self):
            self.axs[self.ix, self.iy].spines['right'].set_visible(False)
            self.axs[self.ix, self.iy].spines['top'].set_visible(False)
            self.axs[self.ix, self.iy].yaxis.grid(color='lightgray',
                                                  linestyle=':')
            self.axs[self.ix, self.iy].xaxis.grid(color='lightgray',
                                                  linestyle=':')
            self.axs[self.ix, self.iy].tick_params(axis='both',
                                                   which='major',
                                                   labelsize=10,
                                                   size=5)

        # ________________________________________
        def draw_legend(self, location='upper right'):
            legend = self.axs[self.ix, self.iy].legend(loc=location,
                                                       shadow=True,
                                                       facecolor='g',
                                                       frameon=True)
            legend.get_frame().set_facecolor('whitesmoke')

        # _________________________________________________________________________________
        def cust_plot(self,
                      x,
                      y,
                      color='b',
                      linestyle='-',
                      linewidth=1,
                      marker=None,
                      label=''):
            if marker:
                markerfacecolor, marker, markersize = marker[:]
                self.axs[self.ix,
                         self.iy].plot(x,
                                       y,
                                       color=color,
                                       linestyle=linestyle,
                                       linewidth=linewidth,
                                       marker=marker,
                                       label=label,
                                       markerfacecolor=markerfacecolor,
                                       markersize=markersize)
            else:
                self.axs[self.ix, self.iy].plot(x,
                                                y,
                                                color=color,
                                                linestyle=linestyle,
                                                linewidth=linewidth,
                                                label=label)
            self.fig.autofmt_xdate()

        # ________________________________________________________________________
        def cust_plot_date(self,
                           x,
                           y,
                           color='lightblue',
                           linestyle='-',
                           linewidth=1,
                           markeredge=False,
                           label=''):
            markeredgewidth = 1 if markeredge else 0
            self.axs[self.ix,
                     self.iy].plot_date(x,
                                        y,
                                        color='lightblue',
                                        markeredgecolor='grey',
                                        markeredgewidth=markeredgewidth,
                                        label=label)

        # ________________________________________________________________________
        def cust_scatter(self,
                         x,
                         y,
                         color='lightblue',
                         markeredge=False,
                         label=''):
            markeredgewidth = 1 if markeredge else 0
            self.axs[self.ix, self.iy].scatter(x,
                                               y,
                                               color=color,
                                               edgecolor='grey',
                                               linewidths=markeredgewidth,
                                               label=label)
            #

        def set_xlabel(self, label, fontsize=14):
            self.axs[self.ix, self.iy].set_xlabel(label, fontsize=fontsize)

        def set_ylabel(self, label, fontsize=14):
            self.axs[self.ix, self.iy].set_ylabel(label, fontsize=fontsize)

        # ____________________________________
        def set_xlim(self, lim_inf, lim_sup):
            self.axs[self.ix, self.iy].set_xlim([lim_inf, lim_sup])

        # ____________________________________
        def set_ylim(self, lim_inf, lim_sup):
            self.axs[self.ix, self.iy].set_ylim([lim_inf, lim_sup])

    # Sample the data into an 80:20 train/test split
    df_train = df.sample(frac=0.8)
    df_test = df.loc[~df.index.isin(df_train.index)]
    df = df_train

    # In[37]:
    # Defining dataframe creation function
    ###########################################################################
    def get_flight_delays(df, carrier, id_airport, extrem_values=False):
        df2 = df[(df['AIRLINE'] == carrier)
                 & (df['ORIGIN_AIRPORT'] == id_airport)]
        # _______________________________________
        # remove extreme values before fitting
        if extrem_values:
            df2['DEPARTURE_DELAY'] = df2['DEPARTURE_DELAY'].apply(
                lambda x: x if x < 60 else np.nan)
            df2.dropna(how='any', inplace=True)
        # __________________________________

        df2.sort_values('SCHEDULED_DEPARTURE', inplace=True)
        df2['schedule_depart'] = df2['SCHEDULED_DEPARTURE'].apply(
            lambda x: x.time())
        # ___________________________________________________________________

        test2 = df2['DEPARTURE_DELAY'].groupby(
            df2['schedule_depart']).apply(get_stats).unstack()
        test2.reset_index(inplace=True)
        # ___________________________________

        fct = lambda x: x.hour * 60 + x.minute
        test2['schedule_depart_mnts'] = test2['schedule_depart'].apply(fct)
        return test2

    def create_df(df, carrier, id_airport, extrem_values=False):
        df2 = df[(df['AIRLINE'] == carrier)
                 & (df['ORIGIN_AIRPORT'] == id_airport)]
        df2.dropna(how='any', inplace=True)
        df2['weekday'] = df2['SCHEDULED_DEPARTURE'].apply(
            lambda x: x.weekday())
        # ____________________
        # delete delays > 1h
        df2['DEPARTURE_DELAY'] = df2['DEPARTURE_DELAY'].apply(
            lambda x: x if x < 60 else np.nan)
        df2.dropna(how='any', inplace=True)
        # _________________
        # formating times
        fct = lambda x: x.hour * 60 + x.minute
        df2['schedule_depart'] = df2['SCHEDULED_DEPARTURE'].apply(
            lambda x: x.time())
        df2['schedule_depart_mnts'] = df2['schedule_depart'].apply(fct)
        df2['schedule_arrivee'] = df2['SCHEDULED_ARRIVAL'].apply(fct)
        df3 = df2.groupby(['schedule_depart_mnts', 'schedule_arrivee'],
                          as_index=False).mean()
        return df3

    #
    # In[39]:
    # Linear Regression
    ####### Linear_Train #######

    test2 = get_flight_delays(df, carrier, id_airport, False)
    test2.to_csv('Model_dataset.csv', sep=',')

    test = test2[['mean', 'schedule_depart_mnts']].dropna(how='any', axis=0)
    X_L_train = np.array(test['schedule_depart_mnts'])
    Y_L_train = np.array(test['mean'])
    X_L_train = X_L_train.reshape(len(X_L_train), 1)
    Y_L_train = Y_L_train.reshape(len(Y_L_train), 1)
    regr = linear_model.LinearRegression()
    regr.fit(X_L_train, Y_L_train)
    result_L_train = regr.predict(X_L_train)
    score_L_train = regr.score(X_L_train, Y_L_train)

    # print("R^2 for Linear Train= ",score_L_train)
    print("MSE Linear Train=",
          metrics.mean_squared_error(result_L_train, Y_L_train))

    # The coefficient R^2 is defined as (1 - u/v), where u is the residual sum of squares
    # ((y_true - y_pred) ** 2).sum() and v is the
    # total sum of squares ((y_true - y_true.mean()) ** 2).sum().
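
    # The same quantity computed by hand, as a quick check of the definition:
    u = ((Y_L_train - result_L_train) ** 2).sum()
    v = ((Y_L_train - Y_L_train.mean()) ** 2).sum()
    assert np.isclose(score_L_train, 1 - u / v)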

    ####### Linear_Test #######
    test2 = get_flight_delays(df_test, carrier, id_airport, False)

    test = test2[['mean', 'schedule_depart_mnts']].dropna(how='any', axis=0)
    X_L_test = np.array(test['schedule_depart_mnts'])
    Y_L_test = np.array(test['mean'])
    X_L_test = X_L_test.reshape(len(X_L_test), 1)
    Y_L_test = Y_L_test.reshape(len(Y_L_test), 1)
    result_L_test = regr.predict(X_L_test)
    score_L_test = regr.score(X_L_test, Y_L_test)

    # print("R^2 for Linear Test= ",score_L_test)
    print("MSE Linear Test=",
          metrics.mean_squared_error(result_L_test, Y_L_test))
    fig1 = Figure_style(8, 4, 1, 1)
    fig1.pos_update(0, 0)
    # fig1.cust_scatter(df1['heure_depart'], df1['DEPARTURE_DELAY'], markeredge = True)
    fig1.cust_plot(X_L_test,
                   Y_L_test,
                   color='b',
                   linestyle=':',
                   linewidth=2,
                   marker=('b', 's', 10))
    fig1.cust_plot(X_L_test, result_L_test, color='g', linewidth=3)
    fig1.style()
    fig1.set_ylabel('Delay (minutes)', fontsize=14)
    fig1.set_xlabel('Departure time', fontsize=14)
    # ____________________________________
    # convert and set the x ticks labels
    fct_convert = lambda x: divmod(int(x), 60)  # x ticks are minutes after midnight
    fig1.axs[fig1.ix, fig1.iy].set_xticklabels([
        '{:2.0f}h{:2.0f}m'.format(*fct_convert(x))
        for x in fig1.axs[fig1.ix, fig1.iy].get_xticks()
    ])

    # In[77]:
    # Ridge Regression
    ####### Ridge_Training #######
    df3 = get_flight_delays(df, carrier, id_airport)
    df3[:5]
    # df1 = df[(df['AIRLINE'] == carrier) & (df['ORIGIN_AIRPORT'] == id_airport)]
    # df1['heure_depart'] =  df1['SCHEDULED_DEPARTURE'].apply(lambda x:x.time())
    # df1['heure_depart'] = df1['heure_depart'].apply(lambda x:x.hour*60+x.minute)
    df3 = df3[['mean', 'schedule_depart_mnts']].dropna(how='any', axis=0)
    X = np.array(df3['schedule_depart_mnts'])
    Y = np.array(df3['mean'])
    X = X.reshape(len(X), 1)
    Y = Y.reshape(len(Y), 1)

    parameters = [0.2, 1]
    ridgereg = Ridge(alpha=parameters[0], normalize=True)
    poly = PolynomialFeatures(degree=parameters[1])
    X_ = poly.fit_transform(X)
    ridgereg.fit(X_, Y)
    result_R_train = ridgereg.predict(X_)
    score_R_train = metrics.mean_squared_error(result_R_train, Y)
    r2_R_train = r2_score(Y, result_R_train)
    # print("R^2 for Ridge Train:",r2_R_train )
    print('MSE Ridge Train= {}'.format(round(score_R_train, 2)))

    ####### Ridge_Test #######

    df3 = get_flight_delays(df_test, carrier, id_airport)
    df3[:5]

    test = df3[['mean', 'schedule_depart_mnts']].dropna(how='any', axis=0)
    X_L_test = np.array(test['schedule_depart_mnts'])
    Y_L_test = np.array(test['mean'])
    X_testt = X_L_test.reshape(len(X_L_test), 1)
    Y_testt = Y_L_test.reshape(len(Y_L_test), 1)

    X_ = poly.fit_transform(X_testt)
    result_test = ridgereg.predict(X_)

    score_R_test = metrics.mean_squared_error(result_test, Y_testt)

    r2_ridge_test = r2_score(Y_testt, result_test)
    # print("R^2 for Ridge Test is: ", r2_ridge_test)
    print('RMSE Ridge Test = {}'.format(round(np.sqrt(score_R_test), 2)))
    # 'Ecart = {:.2f} min'.format(np.sqrt(score_R_test))

    fig1 = Figure_style(8, 4, 1, 1)
    fig1.pos_update(0, 0)
    # fig1.cust_scatter(df1['heure_depart'], df1['DEPARTURE_DELAY'], markeredge = True)
    fig1.cust_plot(X_testt,
                   Y_testt,
                   color='b',
                   linestyle=':',
                   linewidth=2,
                   marker=('b', 's', 10))
    fig1.cust_plot(X_testt, result_test, color='g', linewidth=3)
    fig1.style()
    fig1.set_ylabel('Delay (minutes)', fontsize=14)
    fig1.set_xlabel('Departure time', fontsize=14)
    # ____________________________________
    # convert and set the x ticks labels
    fct_convert = lambda x: divmod(int(x), 60)  # x ticks are minutes after midnight
    fig1.axs[fig1.ix, fig1.iy].set_xticklabels([
        '{:2.0f}h{:2.0f}m'.format(*fct_convert(x))
        for x in fig1.axs[fig1.ix, fig1.iy].get_xticks()
    ])

    # %%
    ###########################################################################
    ####### Random Forest_Train #######
    df4 = create_df(df, carrier, id_airport)
    # X_rf_Train = np.array(df3[['schedule_depart','schedule_arrivee', 'ARRIVAL_DELAY', 'SCHEDULED_TIME','ELAPSED_TIME','weekday']])
    # X_rf_Train = np.hstack((X_rf_Train))
    df4 = df4[['DEPARTURE_DELAY', 'schedule_depart_mnts']].dropna(how='any',
                                                                  axis=0)
    X_rf_Train = np.array(df4['schedule_depart_mnts'])
    Y_rf_Train = np.array(df4['DEPARTURE_DELAY'])

    X_rf_Train = X_rf_Train.reshape(len(X_rf_Train), 1)
    Y_rf_Train = Y_rf_Train.reshape(len(Y_rf_Train), 1)

    rf = RandomForestRegressor(n_estimators=100,
                               oob_score=True,
                               random_state=123456)
    rf.fit(X_rf_Train, Y_rf_Train)

    predicted_train = rf.predict(X_rf_Train)

    test_score = r2_score(Y_rf_Train, predicted_train)
    spearman = spearmanr(Y_rf_Train, predicted_train)
    # pearson = pearsonr(Y_rf_Train, predicted_train)

    # print(f'Out-of-bag R-2 score estimate: {rf.oob_score_:>5.3}')
    # print(f'Test data R-2 score: {test_score:>5.3}')
    # print(f'Test data Spearman correlation: {spearman[0]:.3}')

    # print("R^2 for RF Train:",test_score )
    print('MSE RF Train= {}'.format(
        round(metrics.mean_squared_error(predicted_train, Y_rf_Train), 2)))
    # print(f'Test data Pearson correlation: {pearson[0]:.3}')

    ####### Random Forest_Test #######
    df41 = create_df(df_test, carrier, id_airport)
    # X_rf_Train = np.array(df3[['schedule_depart','schedule_arrivee', 'ARRIVAL_DELAY', 'SCHEDULED_TIME','ELAPSED_TIME','weekday']])
    # X_rf_Train = np.hstack((X_rf_Train))
    df41 = df41[['DEPARTURE_DELAY', 'schedule_depart_mnts']].dropna(how='any',
                                                                    axis=0)
    X_rf_Test = np.array(df41['schedule_depart_mnts'])
    Y_rf_Test = np.array(df41['DEPARTURE_DELAY'])

    X_rf_Test = X_rf_Test.reshape(len(X_rf_Test), 1)
    Y_rf_Test = Y_rf_Test.reshape(len(Y_rf_Test), 1)

    predicted_test = rf.predict(X_rf_Test)

    test_score = r2_score(Y_rf_Test, predicted_test)
    spearman = spearmanr(Y_rf_Test, predicted_test)
    # pearson = pearsonr(Y_rf_Train, predicted_train)

    # print(f'Out-of-bag R-2 score estimate: {rf.oob_score_:>5.3}')
    # print(f'Test data R-2 score: {test_score:>5.3}')
    # print(f'Test data Spearman correlation: {spearman[0]:.3}')

    score_rf_test = r2_score(Y_rf_Test, predicted_test)
    # print("R^2 for RF Test: ", score_rf_test)
    score_RF_test = metrics.mean_squared_error(predicted_test, Y_rf_Test)
    print('MSE RF Test = {}'.format(round(score_RF_test, 2)))

    fig1 = Figure_style(8, 4, 1, 1)
    fig1.pos_update(0, 0)
    # fig1.cust_scatter(df1['heure_depart'], df1['DEPARTURE_DELAY'], markeredge = True)
    fig1.cust_plot(X_rf_Test,
                   Y_rf_Test,
                   color='b',
                   linestyle=':',
                   linewidth=2,
                   marker=('b', 's', 10))
    fig1.cust_plot(X_rf_Test, predicted_test, color='g', linewidth=3)
    fig1.style()
    fig1.set_ylabel('Delay (minutes)', fontsize=14)
    fig1.set_xlabel('Departure time', fontsize=14)
    # ____________________________________
    # convert and set the x ticks labels
    fct_convert = lambda x: divmod(int(x), 60)  # x ticks are minutes after midnight
    fig1.axs[fig1.ix, fig1.iy].set_xticklabels([
        '{:2.0f}h{:2.0f}m'.format(*fct_convert(x))
        for x in fig1.axs[fig1.ix, fig1.iy].get_xticks()
    ])

    # %%
    ###########################################################################
    ####### Neural Network_Train #######

    df5 = create_df(df, carrier, id_airport)
    # X_rf_Train = np.array(df3[['schedule_depart','schedule_arrivee', 'ARRIVAL_DELAY', 'SCHEDULED_TIME','ELAPSED_TIME','weekday']])
    # X_rf_Train = np.hstack((X_rf_Train))
    df5 = df5[['DEPARTURE_DELAY', 'schedule_depart_mnts']].dropna(how='any',
                                                                  axis=0)
    X_nn_Train = np.array(df5['schedule_depart_mnts'])
    Y_nn_Train = np.array(df5['DEPARTURE_DELAY'])

    X_nn_Train = X_nn_Train.reshape(len(X_nn_Train), 1)
    Y_nn_Train = Y_nn_Train.reshape(len(Y_nn_Train), 1)

    # NOTE: the sknn neural network below is left commented out;
    # a LinearSVR is used as a stand-in regressor.
    regr = LinearSVR(random_state=0)
    #    from sknn.mlp import Classifier, Layer
    #    #regr = LinearSVR(random_state=0)
    #    regr = Classifier(
    #    layers=[
    #        Layer("Rectifier", units=10),
    #        Layer("Linear")],
    #    learning_rate=0.02,
    #    n_iter=5)
    regr.fit(X_nn_Train, Y_nn_Train)

    predict_train_NN = regr.predict(X_nn_Train)

    r2_NN_train = r2_score(Y_nn_Train, predict_train_NN)
    # print("R^2 for NN Train:",r2_NN_train )
    print('MSE NN Train= {}'.format(
        round(metrics.mean_squared_error(predict_train_NN, Y_nn_Train), 2)))

    ####### Neural Network_Test #######
    df51 = create_df(df_test, carrier, id_airport)
    # X_rf_Train = np.array(df3[['schedule_depart','schedule_arrivee', 'ARRIVAL_DELAY', 'SCHEDULED_TIME','ELAPSED_TIME','weekday']])
    # X_rf_Train = np.hstack((X_rf_Train))
    df51 = df51[['DEPARTURE_DELAY', 'schedule_depart_mnts']].dropna(how='any',
                                                                    axis=0)
    X_NN_Test = np.array(df51['schedule_depart_mnts'])
    Y_NN_Test = np.array(df51['DEPARTURE_DELAY'])

    X_NN_Test = X_NN_Test.reshape(len(X_NN_Test), 1)
    Y_NN_Test = Y_NN_Test.reshape(len(Y_NN_Test), 1)

    predict_test_NN = regr.predict(X_NN_Test)

    score_NN_test = r2_score(Y_NN_Test, predict_test_NN)
    # print("R^2 for NN Test: ",score_NN_test )
    MSE_NN_test = metrics.mean_squared_error(predict_test_NN, Y_NN_Test)
    print('MSE NN Test = {}'.format(round(MSE_NN_test, 2)))

    fig1 = Figure_style(8, 4, 1, 1)
    fig1.pos_update(0, 0)
    # fig1.cust_scatter(df1['heure_depart'], df1['DEPARTURE_DELAY'], markeredge = True)
    fig1.cust_plot(X_NN_Test,
                   Y_NN_Test,
                   color='b',
                   linestyle=':',
                   linewidth=2,
                   marker=('b', 's', 10))
    fig1.cust_plot(X_NN_Test, predict_test_NN, color='g', linewidth=3)
    fig1.style()
    fig1.set_ylabel('Delay (minutes)', fontsize=14)
    fig1.set_xlabel('Departure time', fontsize=14)

    # convert and set the x ticks labels
    fct_convert = lambda x: divmod(int(x), 60)  # x ticks are minutes after midnight
    fig1.axs[fig1.ix, fig1.iy].set_xticklabels([
        '{:2.0f}h{:2.0f}m'.format(*fct_convert(x))
        for x in fig1.axs[fig1.ix, fig1.iy].get_xticks()
    ])

    # %%

    ###########################################################################
    ####### SVM_Train #######

    df6 = create_df(df, carrier, id_airport)
    df6 = df6[['DEPARTURE_DELAY', 'schedule_depart_mnts']].dropna(how='any',
                                                                  axis=0)
    X_svm_Train = np.array(df6['schedule_depart_mnts'])
    Y_svm_Train = np.array(df6['DEPARTURE_DELAY'])

    X_svm_Train = X_svm_Train.reshape(len(X_svm_Train), 1)
    Y_svm_Train = Y_svm_Train.reshape(len(Y_svm_Train), 1)

    regr = SVR(kernel='linear')

    regr.fit(X_svm_Train, Y_svm_Train)

    predict_train_svm = regr.predict(X_svm_Train)
    r2_svm_train = r2_score(Y_svm_Train, predict_train_svm)
    # print("R^2 for svm Train:",r2_svm_train )
    print('MSE svm Train= {}'.format(
        round(metrics.mean_squared_error(predict_train_svm, Y_svm_Train), 2)))

    ####### SVM_Test #######
    df61 = create_df(df_test, carrier, id_airport)
    # X_rf_Train = np.array(df3[['schedule_depart','schedule_arrivee', 'ARRIVAL_DELAY', 'SCHEDULED_TIME','ELAPSED_TIME','weekday']])
    # X_rf_Train = np.hstack((X_rf_Train))
    df61 = df61[['DEPARTURE_DELAY', 'schedule_depart_mnts']].dropna(how='any',
                                                                    axis=0)
    X_svm_Test = np.array(df61['schedule_depart_mnts'])
    Y_svm_Test = np.array(df61['DEPARTURE_DELAY'])

    X_svm_Test = X_svm_Test.reshape(len(X_svm_Test), 1)
    Y_svm_Test = Y_svm_Test.reshape(len(Y_svm_Test), 1)

    predict_test_svm = regr.predict(X_svm_Test)

    r2_svm_test = r2_score(Y_svm_Test, predict_test_svm)
    # print("R^2 for svm Test: ",r2_svm_test )
    mse_svm_test = metrics.mean_squared_error(predict_test_svm, Y_svm_Test)
    print('MSE svm Test= {}'.format(round(mse_svm_test, 2)))

    fig1 = Figure_style(8, 4, 1, 1)
    fig1.pos_update(0, 0)
    # fig1.cust_scatter(df1['heure_depart'], df1['DEPARTURE_DELAY'], markeredge = True)
    fig1.cust_plot(X_svm_Test,
                   Y_svm_Test,
                   color='b',
                   linestyle=':',
                   linewidth=2,
                   marker=('b', 's', 10))
    fig1.cust_plot(X_svm_Test, predict_test_svm, color='g', linewidth=3)
    fig1.style()
    fig1.set_ylabel('Delay (minutes)', fontsize=14)
    fig1.set_xlabel('Departure time', fontsize=14)
    # ____________________________________
    # convert and set the x ticks labels
    fct_convert = lambda x: divmod(int(x), 60)  # x ticks are minutes after midnight
    fig1.axs[fig1.ix, fig1.iy].set_xticklabels([
        '{:2.0f}h{:2.0f}m'.format(*fct_convert(x))
        for x in fig1.axs[fig1.ix, fig1.iy].get_xticks()
    ])

    return np.mean(result_L_test), np.mean(result_test), np.mean(
        predicted_test), np.mean(predict_test_NN), np.mean(predict_test_svm)
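
# Example call (illustrative): returns the mean test prediction of each of the
# five models for American Airlines departures from DFW
# means = default_datasets('AA', 'DFW')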
Example #33
0
## Create K folds
k_fold = KFold(Y_train_raw.shape[0], n_folds=10)
for train, test in k_fold:
    X1 = X_train_reduced[train]
    Y1 = Y_train_raw[train]

    X2 = X_train_reduced[test]
    Y2 = Y_train_raw[test]

    ## Train Classifiers on fold
    rdg_clf = Ridge(alpha=0.5)
    rdg_clf.fit(X1, Y1)
    lso_clf = Lasso(alpha=0.6257)
    lso_clf.fit(X1, Y1)
    svr_clf = LinearSVR(C=1e3)
    svr_clf.fit(X1, Y1)

    ## Score Classifiers on fold
    rdg_clf_score = rdg_clf.score(X2, Y2)
    lso_clf_score = lso_clf.score(X2, Y2)
    svr_clf_score = svr_clf.score(X2, Y2)

    print "Ridge:  ", rdg_clf_score
    print "Lasso:  ", lso_clf_score
    print "SVR_RBF:  ", svr_clf_score


## Train final Classifiers
# clf = Ridge(alpha=.5)
clf = LinearSVR(C=1e3)  # LinearSVR has no gamma parameter
clf.fit(X_train_reduced, Y_train_raw)
Example #34
0
def linear_svr(dataframe,
               target=None,
               drop_features=[],
               without_outliers=False,
               split=0.2):
    warnings.filterwarnings("ignore",
                            category=ConvergenceWarning,
                            message="^Liblinear failed to converge")

    # Remove non-numerical and undesired features from dataframe
    dataframe = dataframe.loc[:, dataframe.dtypes != 'object']
    dataframe = dataframe.drop(drop_features, axis=1)

    # Transform data into columns and define target variable
    numerical_features = dataframe.loc[:, dataframe.columns != target]
    X = np.nan_to_num(
        numerical_features.to_numpy())  # .reshape(numerical_features.shape)
    y = np.nan_to_num(dataframe[target].to_numpy()
                      )  # .reshape(dataframe[target].shape[0], 1)

    # Split the data into training/testing sets
    testsplit = round(split * X.shape[0])
    X_train = X[:-testsplit]
    X_test = X[-testsplit:]
    y_train = y[:-testsplit]
    y_test = y[-testsplit:]

    # Train a linear SVR model
    reg = LinearSVR(random_state=0, tol=1e-5)
    reg.fit(X_train, y_train)
    feature_importance = pd.Series(
        reg.coef_,  # coef_ is 1-D for LinearSVR; meaningful only for a linear model
        index=numerical_features.columns)

    # Prediction with trained model
    y_pred = reg.predict(X_test)

    # Collect the metrics in a one-row DataFrame (assigning scalars to an
    # empty DataFrame would produce empty columns)
    cv_r2 = cross_val_score(reg, X, y, cv=10, scoring="r2")
    cv_ev = cross_val_score(reg, X, y, cv=10, scoring="explained_variance")
    results = pd.DataFrame([{
        'Train mean': np.mean(y_train),
        'Train std': np.std(y_train),
        'Test mean': np.mean(y_test),
        'Test std': np.std(y_test),
        'Prediction mean': np.mean(y_pred),
        'Prediction std': np.std(y_pred),
        'Mean Squared Error': mean_squared_error(y_test, y_pred),
        'Mean Absolute Error': mean_absolute_error(y_test, y_pred),
        'R2 score': r2_score(y_test, y_pred),
        'Explained variance score': explained_variance_score(y_test, y_pred),
        'Cross-val R2 score (mean)': cv_r2.mean(),
        'Cross-val R2 scores': cv_r2,
        'Cross-val explained_variance score (mean)': cv_ev.mean(),
        'Cross-val explained_variance scores': cv_ev,
    }])

    y_result = pd.DataFrame({'y_test': y_test, 'y_pred': y_pred})
    return feature_importance, results, y_result, reg
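
# Example usage (illustrative; assumes a dataframe `df` with a numeric
# 'price' column):
# importance, results, y_result, model = linear_svr(df, target='price', split=0.2)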
Example #35
0
class ESN_linear_svr_learner():
    def __init__(self,
                 n_readout=1000,
                 n_components=100,
                 damping=0.5,
                 weight_scaling=0.9,
                 discard_steps=0,
                 random_state=None,
                 epsilon=0.0,
                 C=1.0,
                 max_iter=1000):
        self.n_readout = n_readout
        self.n_components = n_components
        self.damping = damping
        self.weight_scaling = weight_scaling
        self.discard_steps = discard_steps
        self.random_state = random_state
        self.epsilon = epsilon
        self.C = C
        self.max_iter = max_iter

        self.ESN = SimpleESN(n_readout=self.n_readout,
                             n_components=self.n_components,
                             damping=self.damping,
                             weight_scaling=self.weight_scaling,
                             discard_steps=self.discard_steps,
                             random_state=check_random_state(
                                 self.random_state))
        self.Linear_SVR = LinearSVR(epsilon=self.epsilon,
                                    tol=1e-4,
                                    C=self.C,
                                    loss='epsilon_insensitive',
                                    fit_intercept=True,
                                    intercept_scaling=1.,
                                    dual=True,
                                    verbose=0,
                                    random_state=None,
                                    max_iter=self.max_iter)

    def fit(self, X, y):
        self.ESN.fit(X)
        self.Linear_SVR.fit(self.ESN.transform(X), y)
        return self

    def predict(self, X):
        return self.Linear_SVR.predict(self.ESN.transform(X))

    def get_params(self, deep=True):
        if deep:
            params = {
                'n_readout': self.n_readout,
                'n_components': self.n_components,
                'damping': self.damping,
                'weight_scaling': self.weight_scaling,
                'discard_steps': self.discard_steps,
                'random_state': self.random_state,
                'epsilon': self.epsilon,
                'C': self.C,
                'max_iter': self.max_iter
            }
            return params
        else:
            params = {
                'n_readout': self.n_readout,
                'n_components': self.n_components,
                'damping': self.damping,
                'weight_scaling': self.weight_scaling
            }
            return params
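
# Minimal usage sketch (assumes SimpleESN comes from the simple_esn package
# and X is shaped (n_samples, n_features)):
# model = ESN_linear_svr_learner(n_readout=100, n_components=100)
# model.fit(X, y)
# y_hat = model.predict(X)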
Example #36
0
train_X_Headline = hstack(
    [train_vect_2_hst, csr_matrix(train_headline.values)])
test_X_Headline = hstack([test_vect_2_hst, csr_matrix(test_headline.values)])
y2 = train['SentimentHeadline']

np.shape(train_X_Title)

#model for sentiment title
X_train, X_test, y_train, y_test = train_test_split(train_X_Title,
                                                    y1,
                                                    test_size=0.20,
                                                    random_state=42)

LSVR1 = LinearSVR(C=0.2)
LSVR1.fit(X_train, y_train)

y_pred1 = LSVR1.predict(X_test)
mae1 = mean_absolute_error(y_pred1, y_test)
print('1 - MAE:', 1 - mae1)

X_train, X_test, y_train, y_test = train_test_split(train_X_Headline,
                                                    y2,
                                                    test_size=0.20,
                                                    random_state=42)

LSVR2 = LinearSVR(C=0.1)
LSVR2.fit(X_train, y_train)

y_pred2 = LSVR2.predict(X_test)
mae2 = mean_absolute_error(y_pred2, y_test)
print('1 - MAE:', 1 - mae2)
Example #37
0
# # Regression
# The SVM algorithm also supports regression - instead of trying to fit the
# widest possible street between two classes while limiting margin violations,
# SVM regression tries to fit as many instances as possible *on* the street
# while limiting margin violations (instances off the street)

# Let's generate some random linear-looking data
np.random.seed(42)
m = 50
X = 2 * np.random.rand(m, 1)
y = (4 + 3 * X + np.random.randn(m, 1)).ravel()

# Train an SVR algorithm
from sklearn.svm import LinearSVR

svm_reg = LinearSVR(epsilon=1.5, random_state=42)
svm_reg.fit(X, y)

svm_reg1 = LinearSVR(epsilon=1.5, random_state=42)
svm_reg2 = LinearSVR(epsilon=0.5, random_state=42)
svm_reg1.fit(X, y)
svm_reg2.fit(X, y)


def find_support_vectors(svm_reg, X, y):
    y_pred = svm_reg.predict(X)
    off_margin = (np.abs(y - y_pred) >= svm_reg.epsilon)
    return np.argwhere(off_margin)


svm_reg1.support_ = find_support_vectors(svm_reg1, X, y)
svm_reg2.support_ = find_support_vectors(svm_reg2, X, y)
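
# A wider tube (epsilon=1.5) leaves fewer instances off the margin than a
# narrow one (epsilon=0.5); a quick comparison:
print(len(svm_reg1.support_), "off-margin instances with epsilon=1.5")
print(len(svm_reg2.support_), "off-margin instances with epsilon=0.5")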
Example #38
0
    # Exercise 10 P166

    # data set
    housing = fetch_california_housing()
    X = housing["data"]
    y = housing["target"]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # scale
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # build model
    lin_svr = LinearSVR(random_state=42)
    lin_svr.fit(X_train_scaled, y_train)

    y_pred = lin_svr.predict(X_train_scaled)
    mse = mean_squared_error(y_train, y_pred)
    print('LinearSVR MSE: ', mse)  # 0.949968822217229 not good
    print('LinearSVR RMSE: ', np.sqrt(mse))

    # grid search the best estimator with SVR() model which can use kernel skill
    param_distributions = {"gamma": reciprocal(0.001, 0.1), "C": uniform(1, 10)}
    rnd_search_cv = RandomizedSearchCV(SVR(), param_distributions, n_iter=10, verbose=2, random_state=42)
    rnd_search_cv.fit(X_train_scaled, y_train)

    print('best estimator: ', rnd_search_cv.best_estimator_)
    '''SVR(C=4.745401188473625, cache_size=200, coef0=0.0, degree=3, epsilon=0.1,
      gamma=0.07969454818643928, kernel='rbf', max_iter=-1, shrinking=True,
      tol=0.001, verbose=False)'''
Example #39
0
# Impute missing values
imputer = SimpleImputer(strategy="mean")
imputer.fit(X_train)
X_train = imputer.transform(X_train)
X_test = imputer.transform(X_test)

print("Normalizing...")
# Normalize feature values using MinMaxScaler
scaler = MinMaxScaler(feature_range=(-1, 1))
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
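
# The imputation and scaling steps above could equivalently be wrapped in a
# scikit-learn Pipeline so they are always fitted on training data only
# (a sketch, not part of the original script):
# from sklearn.pipeline import make_pipeline
# pipe = make_pipeline(SimpleImputer(strategy="mean"),
#                      MinMaxScaler(feature_range=(-1, 1)),
#                      LinearSVR(C=10, random_state=42, max_iter=10000))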

print("Training...")
if algorithm == "svr":
    reg = LinearSVR(C=10, random_state=42, verbose=1, max_iter=10000)
elif algorithm == "tree":
    reg = DecisionTreeRegressor(random_state=42, criterion="mse")
elif algorithm == "knn":
    reg = KNeighborsRegressor(n_neighbors=5)
elif algorithm == "forest":
    reg = RandomForestRegressor(n_estimators=100, criterion="mse", n_jobs=12)

reg.fit(X_train, y_train)

y_pred = reg.predict(X_test)

print("MAE: ", mean_absolute_error(y_test, y_pred))
print("MAPE: ", mean_absolute_percentage_error(y_test, y_pred))
print("RMSE: ", mean_squared_error(y_test, y_pred, squared=False))
# 	plot_predictions(svm_clf, [-1.5, 2.5, -1, -1.5])
# 	plot_dataset(X, y, [-1.5, 2.5, -1, 1.5])
# 	gamma, C = hyperparams[i]
# 	plt.title(r"$\gamma = {}, C={}$".format(gamma, C), fontsize=16)

# plt.show()


rnd.seed(42)
m = 50
X = 2 * rnd.rand(m, 1)
y = (4 + 3 * X + rnd.randn(m, 1)).ravel()

svm_reg1 = LinearSVR(epsilon=1.5)
svm_reg2 = LinearSVR(epsilon=0.5)
svm_reg1.fit(X, y)
svm_reg2.fit(X, y)



def find_support_vectors(svm_reg, X, y):
	y_pred = svm_reg.predict(X)
	off_margin = (np.abs(y - y_pred) >= svm_reg.epsilon)
	return np.argwhere(off_margin)

svm_reg1.support_ = find_support_vectors(svm_reg1, X, y)
svm_reg2.support_ = find_support_vectors(svm_reg2, X, y)

eps_x1 = 1
eps_y_pred = svm_reg1.predict([[eps_x1]])
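# eps_y_pred anchors the epsilon-insensitive tube around the regression line
# at x=1; its bounds are eps_y_pred +/- svm_reg1.epsilon (used when plotting
# the margins).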
Example #41
0
File: main.py  Project: vpellegrain/Plume
ps = PredefinedSplit(NO2_test_fold)
param_grid = {
    # "kernel": ["rbf"],
    "C": [0.1, 1, 10],
    "epsilon": [0.1]
}
svr = LinearSVR(C=10)
gs = GridSearchCV(svr,
                  param_grid,
                  scoring="neg_mean_squared_error",
                  n_jobs=1,
                  iid=False,
                  refit=True,
                  cv=ps)

svr.fit(preprocessing.normalize(X_NO2), Y_NO2)
gs.fit(preprocessing.normalize(X_NO2), Y_NO2)

evaluate_mse(svr, preprocessing.normalize(NO2_train_f),
             preprocessing.normalize(NO2_dev_f), Y_NO2_train, Y_NO2_dev)

# SGD Regressor
ps = PredefinedSplit(NO2_test_fold)
param_grid = {
    "loss": ["squared_loss", "huber"],
    # "penalty": ["l2", "l1"],
    "penalty": ["l2", "l1"],
    "alpha": [0.0001, 0.001, 1, 10],
    "shuffle": [True, False],
    "n_iter": [10]
}
Example #42
0
cat_vars = ['DayOfWeek','Promo','StateHoliday','SchoolHoliday','StoreType','Assortment','CompetitionOpenSinceMonth',
            'CompetitionOpenSinceYear','Promo2','Promo2SinceWeek','Promo2SinceYear','PromoInterval','Day','Month','Year']


num_vars = ['Open','Store','CompetitionDistance','ratio1','ratio2']



X_trn, X_val = train_test_split(train, test_size=0.012, random_state=10)

print 'Training Stage 1 Models'

#train svm
svm1 = LinearSVR(verbose=True)
svm1.fit(X_trn[cat_vars+num_vars],X_trn['Sales'])
svm1_feature = svm1.predict(train[cat_vars+num_vars])
preds = svm1.predict(X_val[cat_vars+num_vars])
print 'svm ',(np.mean(((np.exp(preds)-np.exp(X_val['Sales']))/(np.exp(X_val['Sales'])+1))**2))**0.5
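# The expression above is the RMSPE (root mean squared percentage error)
# computed on the original sales scale: 'Sales' is log-transformed, so
# np.exp undoes the transform and the +1 guards against division by zero.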


#train xgb
dtrain = xgb.DMatrix(X_trn[cat_vars+num_vars],X_trn['Sales'])
dvalid = xgb.DMatrix(X_val[cat_vars+num_vars],X_val['Sales'])
watchlist = [(dtrain, 'train'), (dvalid, 'eval')]

num_boost_round = 50
params1 = {"objective": "reg:linear","booster" : "gbtree",
"eta": 0.5,"max_depth": 2,"subsample": 0.5,"colsample_bytree": 0.4,
"nthread":4,"silent": 1,"seed": 1301}
gbm1 = xgb.train(params1, dtrain, num_boost_round, evals=watchlist,early_stopping_rounds=50, feval=rmspe_xg, verbose_eval=True)
if __name__ == '__main__':

    # NOTE: Make sure that the outcome column is labeled 'target' in the data file
    url = 'https://raw.githubusercontent.com/jbrownlee/Datasets/master/auto-insurance.csv'
    dataframe = pd.read_csv(url, header=None)
    # split into input and output elements
    data = dataframe.values
    data = data.astype('float32')
    X, y = data[:, :-1], data[:, -1]

    training_features, testing_features, training_target, testing_target = \
                train_test_split(X, y, random_state=1)

    # Average CV score on the training set was: -29.116294532472594
    exported_pipeline = LinearSVR(C=15.0,
                                  dual=False,
                                  epsilon=0.0001,
                                  loss="squared_epsilon_insensitive",
                                  tol=0.001)
    # Fix random state in exported estimator
    if hasattr(exported_pipeline, 'random_state'):
        setattr(exported_pipeline, 'random_state', 1)

    exported_pipeline.fit(training_features, training_target)
    results = exported_pipeline.predict(testing_features)

    # make a prediction on a new row of data
    row = [108]
    yhat = exported_pipeline.predict([row])
    print('Predicted: %.3f' % yhat[0])
	combined = np.append(X, np.matrix(Y).T, axis=1) 
	np.random.shuffle(combined)
	tail_size = -1 * size
	last_column = X.shape[1]
	training_labels = combined[:tail_size, last_column]
	training_data = combined[:tail_size, :-2]
	test_data = combined[tail_size:, :-2]
	actual_labels = combined[tail_size:, last_column]
	return training_data, np.ravel(training_labels), test_data, np.ravel(actual_labels)

training = open('author_features')
NO_TRAINING_SAMPLES = 6000
NO_OF_AUTHORS = 10000
matrix = dok_matrix((NO_TRAINING_SAMPLES, NO_OF_AUTHORS), dtype=np.int)
for line in training.readlines():
	values = line.rstrip().split()
	matrix[int(values[0]), int(values[1])] = 1

labels_file = open('year_training_labels')
labels = [int(x) for x in labels_file.readline().rstrip().split()]

training_matrix = matrix[:4498]
training_data, training_labels, test_data, actual_labels = sample(training_matrix, labels)
classifier = LinearSVR()
classifier.fit(training_data, training_labels)
output = classifier.predict(test_data)
for index, predicted in enumerate(output):
	print '%s %s' % (predicted, actual_labels[index])

print metrics.explained_variance_score(actual_labels, output)
(x_input, y_input) = get_training_data(feature_lin_lambda=feature_lin_lambda, feature_lin_var=feature_lin_var, data_exp=data_exp)

# Normalize the features
x_scaler = preprocessing.MinMaxScaler(feature_range=(-1,1))
x_input_minmax = x_scaler.fit_transform(x_input)  # (uncommented: needed below)

###  Does y need to be normalized? There is no principled reason to, but it does affect the results!!!
# y_scaler = preprocessing.MinMaxScaler(feature_range=(-1,1))
# y_input_minmax = y_scaler.fit_transform(y_input.reshape(-1,1))
# y_input_minmax = y_input_minmax.reshape((len(y_input_minmax)))

# Select C via cross-validation
best_cv_score = -1e+30
for log2c in np.arange(-10,30,1):
    clf = LinearSVR(C=2**log2c, epsilon=0.0001)
    clf.fit(x_input_minmax, y_input)
    cv_score = cross_val_score(cv=sample_num, estimator=clf, X=x_input_minmax, y=y_input, scoring='mean_squared_error').mean()  # leave-one-out: cv = number of samples
    print(cv_score)
    if cv_score > best_cv_score:
        best_cv_score = cv_score
        bestc = 2**log2c


# Predict using the selected parameter
clf = LinearSVR(C=bestc, epsilon=0.0001)
clf.fit(x_input_minmax, y_input)
y_pred = clf.predict(x_input_minmax)
# y_pred = y_scaler.inverse_transform(y_pred.reshape(-1,1))

view_point = 5
plt.plot(x_input[:,view_point], y_input, 'bo-', x_input[:,view_point], y_pred, 'rs-')
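# Note: searching C on a log2 grid (2**-10 .. 2**29) is standard liblinear
# practice, and the (old-style) 'mean_squared_error' scorer returns negated
# MSE, so maximizing cv_score selects the smallest cross-validated error.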
Example #46
0
    train_cluster = pd.DataFrame(columns=('x1', 'x2', 'x3', 'x4', 'x5', 'y'))

    for i in range(0, (len(cluster_i) - 5)):
        train_cluster.loc[i] = [cluster_i.iloc[i], cluster_i.iloc[i+1],
                                cluster_i.iloc[i+2], cluster_i.iloc[i+3],
                                cluster_i.iloc[i+4], cluster_i.iloc[i+5]]
                            
    explanatory_features = [col for col in train_cluster.columns if col not in ['y']]
    explanatory_df = np.array(train_cluster[explanatory_features])
    
    response_series = np.array(train_cluster.y)
                            
    ### SUPPORT VECTOR REGRESSION MODEL

    linsvr = LinearSVR(epsilon=0.1, tol=1e-4, C=1.0, loss='squared_epsilon_insensitive')
    linsvr.fit(explanatory_df, response_series)
    linsvr_rsq[c] = linsvr.score(explanatory_df, response_series)
    
    # prediction and linear extrapolation of training data set to get further predictions.
    test_cluster = train_cluster.copy()
    
    explanatory_testdf = test_cluster[explanatory_features]
    response_testseries = test_cluster.y
    
    for i in range(0, (len(cluster_i) - 5)):
        test_cluster.loc[i] = [cluster_i.iloc[i], cluster_i.iloc[i+1],
                               cluster_i.iloc[i+2], cluster_i.iloc[i+3],
                               cluster_i.iloc[i+4],
                               linsvr.predict(explanatory_df)[i]]
    
    # further running time series to predict into the future
Example #47
0
class LinearSvrClass:
    """
    Name      : LinearSVR
    Attribute : None
    Method    : predict, predict_by_cv, save_model
    """
    def __init__(self):
        # Algorithm name
        self._name = 'linearsvr'

        # Base path
        self._f_path = os.path.abspath(
            os.path.join(os.path.dirname(os.path.abspath(__file__)),
                         os.pardir))

        # Suppress warning messages
        warnings.filterwarnings('ignore')

        # Load the raw data
        data = pd.read_csv(self._f_path +
                           "/regression/resource/regression_sample.csv",
                           sep=",",
                           encoding="utf-8")

        # Split into training and test data
        self._x = (data["year"] <= 2017)
        self._y = (data["year"] >= 2018)

        # Training data split
        self._x_train, self._y_train = self.preprocessing(data[self._x])
        # Test data split
        self._x_test, self._y_test = self.preprocessing(data[self._y])

        # Declare the model
        self._model = LinearSVR(max_iter=500, tol=1e-5)

        # Train the model
        self._model.fit(self._x_train, self._y_train)

    # Data preprocessing
    def preprocessing(self, data):
        # Features
        x = []
        # Labels
        y = []
        # Window size (7 days): each sample uses the previous 7 days' temperatures
        base_interval = 7
        # Temperatures
        temps = list(data["temperature"])

        for i in range(len(temps)):
            if i < base_interval:
                continue
            y.append(temps[i])

            xa = []

            for p in range(base_interval):
                d = i + p - base_interval
                xa.append(temps[d])
            x.append(xa)
        return x, y

    # Standard prediction
    def predict(self, save_img=False, show_chart=False):
        # Predict
        y_pred = self._model.predict(self._x_test)

        # Score
        score = r2_score(self._y_test, y_pred)

        # Report
        if hasattr(self._model, 'coef_') and hasattr(self._model,
                                                     'intercept_'):
            print(f'Coef = {self._model.coef_}')
            print(f'intercept = {self._model.intercept_}')

        print(f'Score = {score}')

        # Optionally save the chart image
        if save_img:
            self.save_chart_image(y_pred, show_chart)

        # Predicted values & score
        return [list(y_pred), score]

    # CV prediction (cross-validation)
    def predict_by_cv(self):
        # For regression algorithms, implement cross-validation to suit the actual project
        return False

    # GridSearchCV prediction
    def predict_by_gs(self):
        pass

    # Save or refresh the model
    def save_model(self, renew=False):
        # Save the model
        if not renew:
            # First save
            joblib.dump(self._model,
                        self._f_path + f'/model/{self._name}_rg.pkl')
        else:
            # Replace the existing model
            if os.path.isfile(self._f_path + f'/model/{self._name}_rg.pkl'):
                os.rename(
                    self._f_path + f'/model/{self._name}_rg.pkl',
                    self._f_path +
                    f'/model/{str(self._name) + str(time.time())}_rg.pkl')
            joblib.dump(self._model,
                        self._f_path + f'/model/{self._name}_rg.pkl')

    # Save the regression chart
    def save_chart_image(self, data, show_chart):
        # Figure size
        plt.figure(figsize=(15, 10), dpi=100)

        # Ground-truth labels
        plt.plot(self._y_test, c='r')

        # Predicted values
        plt.plot(data, c='b')

        # Save as an image
        plt.savefig('./chart_images/tenki-kion-lr.png')

        # Show the chart (optional)
        if show_chart:
            plt.show()

    def __del__(self):
        del self._x_train, self._x_test, self._y_train, self._y_test, self._x, self._y, self._model
Example #48
0
def linearSVR(train, trainLable, testData):
    clf = LinearSVR()
    clf.fit(train, trainLable)
    predict = clf.predict(testData)
    return predict
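
# Example usage (illustrative): predictions = linearSVR(X_train, y_train, X_test)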
md = dnn_reg(X_train, y_train, X_test, y_test)
reg_eval(X_test, y_test, md)

### Lasso CV regression

def reg_eval2(y_test, model):
    # note: evaluates on the global X_test defined earlier
    y_pred = model.predict(X_test)
    print("evaluating the results for model:", model)
    print("MSE:", mean_squared_error(y_test, y_pred))
    print("R2:", r2_score(y_test, y_pred))
    print("EVS:", explained_variance_score(y_test, y_pred))

lasso = LassoCV(cv=5, random_state=0, max_iter=10000)
lasso.fit(X_train, y_train)
reg_eval2(y_test, lasso)

# ElasticNet regression
ela = ElasticNetCV(l1_ratio=0.8, normalize=True, max_iter=5000, random_state=77)
ela.fit(X_train, y_train)
print("R square:", ela.score(X_test, y_test))
reg_eval2(y_test, ela)


# Linear SVR regression
from sklearn.svm import LinearSVR
LSVR = LinearSVR(epsilon=0.1, random_state=0, tol=1e-5, max_iter=10000)
# scaler=RobustScaler()
# pipe=Pipeline(steps=[("scaling",scaler),("rg",LSVR)])
LSVR.fit(X_train, y_train)
reg_eval2(y_test, LSVR)
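
The commented-out lines above hint at a scaled variant; a minimal sketch of that pipeline, assuming the same RobustScaler/LinearSVR pairing (imports added here, names taken from the comments):

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler

# Scale features before the fit; LinearSVR is sensitive to feature scale
pipe = Pipeline(steps=[("scaling", RobustScaler()),
                       ("rg", LinearSVR(epsilon=0.1, random_state=0,
                                        tol=1e-5, max_iter=10000))])
pipe.fit(X_train, y_train)
reg_eval2(y_test, pipe)  # a Pipeline exposes predict(), so reg_eval2 works unchanged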
    trainingSet = train[~idx]
    validationSet = train[idx]

    tr_X = np.matrix(trainingSet[feature_names])
    tr_Y = np.array(trainingSet["Response"])
    val_X = np.matrix(validationSet[feature_names])
    val_Y = np.array(validationSet["Response"])

    regm = LinearSVR(C=0.06,
                     epsilon=0.45,
                     tol=1e-5,
                     dual=True,
                     verbose=True,
                     random_state=133)

    regm.fit(tr_X, tr_Y)
    preds = regm.predict(val_X)

    df = pd.DataFrame({
        "Id": validationSet["Id"],
        "ground_truth": validationSet["Response"],
        "linsvr_preds": preds
    })

    linsvr_val = linsvr_val.append(df, ignore_index=True)

    tpreds = regm.predict(test_X)
    cname = "Fold" + str(i)
    linsvr_test[cname] = tpreds
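
How linsvr_test is consumed is not shown; one common way to finish this kind of fold loop (an assumption, not the original code) is to average the per-fold test predictions:

# Average the FoldN columns into one test-set prediction
fold_cols = [c for c in linsvr_test.columns if c.startswith("Fold")]
linsvr_test["linsvr_pred"] = linsvr_test[fold_cols].mean(axis=1)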
Example #51
0
# Tuning models and testing with all features
# Linear Regression
linreg = LinearRegression()
linreg.fit(X_train, y_train)
acc_model(0,linreg,X_train,X_test)    
print("Done")

# Support Vector Machines
svr = SVR()
svr.fit(X_train, y_train)
acc_model(1,svr,X_train,X_test)
print("Done")

# Linear SVR
linear_svr = LinearSVR()
linear_svr.fit(X_train, y_train)
acc_model(2,linear_svr,X_train,X_test)
print("Done")

# MLPRegressor
mlp = MLPRegressor()
param_grid = {'hidden_layer_sizes': [i for i in range(2,20)],
              'activation': ['relu'],
              'solver': ['adam'],
              'learning_rate': ['constant'],
              'learning_rate_init': [0.01],
              'power_t': [0.5],
              'alpha': [0.0001],
              'max_iter': [1000],
              'early_stopping': [True],
              'warm_start': [False]}
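
The grid above is defined but the search call itself is not shown; a minimal sketch of the usual wiring (cv=5 and r2 scoring are assumptions):

from sklearn.model_selection import GridSearchCV

# Exhaustive search over the hidden-layer sizes defined above
gs = GridSearchCV(mlp, param_grid, cv=5, scoring='r2', n_jobs=-1)
gs.fit(X_train, y_train)
print(gs.best_params_, gs.best_score_)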
Example #52
0
class SVMTextEncoder(BaseEstimator, TransformerMixin):
    # number of jobs to execute in parallel
    NUM_JOBS = 3
    # number of folds to apply to svm fit
    NUM_FOLDS = 3

    # !! add tuning
    def __init__(self, metric, random_seed):
        super().__init__()

        self._vect = TfidfVectorizer(ngram_range=(1, 2), max_features=30000)
        self._random_seed = random_seed

        if metric in classification_metrics:
            self._model = LinearSVC(class_weight="balanced",
                                    random_state=random_seed)
            self.mode = "classification"
        elif metric in regression_metrics:
            self._model = LinearSVR(random_state=random_seed)
            self.mode = "regression"
        else:
            raise AttributeError(
                "metric not in classification or regression metrics")

    def fit(self, X, y):
        # fitting without transforming is not supported; use fit_transform
        raise NotImplementedError

    def transform(self, X):
        X = pd.Series(X.squeeze()).fillna(MISSING_VALUE_INDICATOR).values

        Xv = self._vect.transform(X)
        if self.mode == "classification":
            out = self._model.decision_function(Xv)
        else:
            out = self._model.predict(Xv)

        if len(out.shape) == 1:
            out = out.reshape(-1, 1)

        return out

    def fit_transform(self, X, y=None, **kwargs):
        assert y is not None, "SVMTextEncoder.fit_transform requires y"

        X = pd.Series(X.squeeze()).fillna(MISSING_VALUE_INDICATOR).values
        Xv = self._vect.fit_transform(X)
        self._model = self._model.fit(Xv, y)

        if self.mode == "classification":
            # Aim for NUM_FOLDS and stratified k-fold.  If that doesn't work, fallback to uniform sampling.
            num_folds = min(self.NUM_FOLDS, y.value_counts().min())
            if num_folds < 2:
                cv = KFold(n_splits=self.NUM_FOLDS,
                           shuffle=True,
                           random_state=self._random_seed)
                out = cross_val_predict(
                    self._model,
                    Xv,
                    y,
                    method="decision_function",
                    n_jobs=self.NUM_JOBS,
                    cv=cv,
                )
            else:
                out = cross_val_predict(
                    self._model,
                    Xv,
                    y,
                    method="decision_function",
                    n_jobs=self.NUM_JOBS,
                    cv=num_folds,
                )
        else:
            out = cross_val_predict(self._model,
                                    Xv,
                                    y,
                                    n_jobs=self.NUM_JOBS,
                                    cv=self.NUM_FOLDS)

        if len(out.shape) == 1:
            out = out.reshape(-1, 1)

        return out
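
A usage sketch for the encoder, assuming "accuracy" appears in this module's classification_metrics and that MISSING_VALUE_INDICATOR is defined above; the texts and labels are toy data:

import numpy as np
import pandas as pd

texts = np.array(["great product", "terrible service", "works fine",
                  "broke instantly", "love it", "would not recommend"])
labels = pd.Series([1, 0, 1, 0, 1, 0])

encoder = SVMTextEncoder(metric="accuracy", random_seed=42)
train_feats = encoder.fit_transform(texts, labels)  # out-of-fold decision values
test_feats = encoder.transform(np.array(["awful product"]))  # uses the refit model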