def sample_from_generator(history, nb_samples, latent_dim=12, 
                          valid_split=0.3, random_split=True,
                          hidden_dims=None, **kwargs):
    scaler = MinMaxScaler()
    scaler.fit(history)
    scaled = scaler.transform(history)
    
    nb_train = history.shape[0]    
    if not valid_split:
        nb_valid = 0
    elif isinstance(valid_split, float):
        nb_valid = int(np.floor(nb_train * valid_split))
    else:
        nb_valid = valid_split
        
    if nb_valid > 0:
        if random_split:
            ind = np.arange(nb_train)
            np.random.shuffle(ind)
            x_valid = scaled[ind[-nb_valid:], :]
            x_train = scaled[ind[:-nb_valid], :]
        else:
            x_valid = scaled[-nb_valid:, :]
            x_train = scaled[:-nb_valid, :]
    else:
        x_valid = None
        x_train = scaled
    
    _, generator = build_model(latent_dim, x_train, x_valid=x_valid, 
                               hidden_dims=hidden_dims, **kwargs)
    
    normal_sample = np.random.standard_normal((nb_samples, latent_dim))
    draws = generator.predict(normal_sample)
    return scaler.inverse_transform(draws)
def scale_data(pitchers):
    feature_cols = ['Decisions', 'Wins_Over_Decisions',
                    'Wins_Over_Starts', 'Relief_Appearances',
                    'Shutout_Percentage', 'Outs_Recorded_Per_Appearance',
                    'Hits_Allowed_Per_Appearance', 'Earned_Runs_Per_Appearance',
                    'Runs_Per_Appearance', 'Home_Runs_Per_Appearance',
                    'Walks_Per_Appearance', 'Strikeouts_Per_Appearance',
                    'ERA']
    num_data = pitchers[feature_cols]

    scaler = MinMaxScaler()
    scaler.fit(num_data)
    num_data = pd.DataFrame(scaler.transform(num_data), columns=feature_cols)

    pitchers = pitchers[['Player_and_Year']]

    pitchers = pd.merge(pitchers, num_data, how='inner', left_index=True,
                        right_index=True)

    return pitchers
def data_organizer( instances, outcomes ):
   """
   Operations to organize data as desired
   """
   
   excluded_features = set([])
   #print( "Using only SAT subject tests" )
   #included_features = set(["SATCRDG",	"SATMATH",	"SATWRTG"])
   
   #print( "Using SAT total and HSGPA" )
   #included_features = set(["SATTotal",	"HSGPA"])
   
   #print( "Using gender, firstgen, famincome, firstlang" )
   #included_features = set(["gender", "Firgen", "famincome", "FirstLang"])
   
   print( "Using all features" )
   included_features = set(["gender", "Firgen", "famincome",	"SATCRDG",	"SATMATH",	"SATWRTG",	"SATTotal",	"HSGPA",	"ACTRead",	"ACTMath",	"ACTEngWrit",	"APIScore",	"FirstLang",	"HSGPAunweighted"])

   #print( "SAT subject tests and HSGPA" )
   #included_features = set(["SATCRDG",	"SATMATH",	"SATWRTG", "HSGPA" ])


   # Remove instances without GPA data
   new_instances = []
   new_outcomes = []
   for instance,outcome in zip(instances,outcomes):
      temp={}
      for name,val in zip(ALL_LABELS, instance):
         temp[name] = val
      u1,u2,gpa = outcome
      if not math.isnan( gpa ):
         temp_list = []
         skip = False
         for key in temp.keys():
            if key in included_features:
               if math.isnan(temp[key]):
                  skip = True
               temp_list.append( temp[key] )
         if not skip:
            new_outcomes.append( [value for value in outcome] )
            new_instances.append( temp_list )
         
         
   instances = new_instances
   outcomes = new_outcomes

   
   # Fill in NaN values with median
   instance_list = []
   for idx,instance in enumerate(instances):
      instance_list.append( [ value for value in instance ] ) 
   bandaid = Imputer( strategy='median' )
   instances = bandaid.fit_transform( instance_list )
   
   # Scale to [0,1]
   scaler = MinMaxScaler( feature_range=(0,1), copy=False)
   instances = scaler.fit_transform( instances )

   return instances, outcomes, scaler
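
`Imputer` was removed from scikit-learn in 0.22; below is a minimal sketch of the same median-impute-then-scale step using the current `SimpleImputer` API, on a hypothetical toy list (not the study data).

import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler

# Hypothetical toy instances with a missing value
instance_list = [[3.1, 620.0], [np.nan, 700.0], [3.9, 540.0]]

bandaid = SimpleImputer(strategy='median')      # modern replacement for Imputer
instances = bandaid.fit_transform(instance_list)

scaler = MinMaxScaler(feature_range=(0, 1), copy=False)
instances = scaler.fit_transform(instances)     # a single fit_transform is enough
print(instances)
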
Example #4
    def transform(self, fp):
        fm = FeaturePool(fp).meta()
        x = FeaturePool(fp).array()

        scaler = MinMaxScaler(feature_range = self.feature_range)
        scaler.fit(x)
        for f in FeaturePool.from_array(fm, scaler.transform(x)):
            yield f
def preprocess_datasets(X_train, X_test, args):
    if 'scale' in args.preprocessing:
        print('Scaling features to range [-1,1] ...')
        scaler = MinMaxScaler(feature_range=(-1, 1))
        scaler.fit(np.vstack(X_train))
        X_train = [scaler.transform(X_curr) for X_curr in X_train]
        X_test = [scaler.transform(X_curr) for X_curr in X_test]
    return X_train, X_test
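
A minimal usage sketch of `preprocess_datasets` above, assuming lists of per-sample 2-D feature arrays and an `args` object exposing a `preprocessing` collection (both hypothetical here).

import numpy as np
from argparse import Namespace

X_train = [np.random.rand(10, 4), np.random.rand(8, 4)]  # hypothetical feature arrays
X_test = [np.random.rand(5, 4)]
args = Namespace(preprocessing=['scale'])                 # hypothetical args stand-in

X_train, X_test = preprocess_datasets(X_train, X_test, args)
print(X_train[0].min(), X_train[0].max())                 # roughly within [-1, 1]
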
Example #6
def preprocess_datasets(train, test, args):
    if 'scale' in args.preprocessing:
        print('Scaling features to range [-1,1] ...')
        scaler = MinMaxScaler(feature_range=(-1, 1))
        scaler.fit(np.vstack(train.X))
        processed_train = Dataset([scaler.transform(X_curr) for X_curr in train.X], train.y, train.target_names, train.groups)
        processed_test = Dataset([scaler.transform(X_curr) for X_curr in test.X], test.y, test.target_names, test.groups)
    else:
        processed_train = train
        processed_test = test
    return processed_train, processed_test
def test_minmaxscaler_vs_sklearn():
    # Compare msmbuilder.preprocessing.MinMaxScaler
    # with sklearn.preprocessing.MinMaxScaler

    minmaxscalerr = MinMaxScalerR()
    minmaxscalerr.fit(np.concatenate(trajs))

    minmaxscaler = MinMaxScaler()
    minmaxscaler.fit(trajs)

    y_ref1 = minmaxscalerr.transform(trajs[0])
    y1 = minmaxscaler.transform(trajs)[0]

    np.testing.assert_array_almost_equal(y_ref1, y1)
Example #8
def preprocess_data(X, scaler=None):
    # log-transform the data (applied whether or not a scaler is supplied,
    # since the scaler is fitted on the log-transformed values)
    X = np.log(1 + X)
    if not scaler:
        scaler = MinMaxScaler()
        scaler.fit(X)
    X = scaler.transform(X)
    # add gaussian noise
    mu, sigma = 0, 0.1  # mean and standard deviation
    s = np.random.normal(mu, sigma)
    #X = X + s
    return X, scaler
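
A short usage sketch of the fit-once / reuse pattern above (hypothetical arrays): the scaler returned by the first call is passed back in, so new data is transformed with the statistics learned on the first batch.

import numpy as np

X_train = np.random.rand(100, 5)               # hypothetical non-negative features
X_new = np.random.rand(20, 5)

X_train_p, scaler = preprocess_data(X_train)   # fits MinMaxScaler on log(1 + X_train)
X_new_p, _ = preprocess_data(X_new, scaler)    # reuses the fitted scaler
print(X_train_p.min(), X_train_p.max())        # 0.0 and 1.0 on the fitting batch
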
Example #9
class SerialDataScaler:
    
    def __init__(self, data):
        data = numpy.reshape(data, (len(data), 1))
        data = data.astype("float32")
        self.scaler = MinMaxScaler(feature_range=(0, 1))
        self.scaler.fit(data)
    
    def transform(self, X):
        #return X
        return self.scaler.transform(numpy.reshape(X, (len(X), 1)))

    def inverse_transform(self, x):
        return self.scaler.inverse_transform(x)
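
A quick round-trip sketch of the wrapper above on a hypothetical 1-D series.

data = [10.0, 20.0, 30.0, 40.0]              # hypothetical series
sds = SerialDataScaler(data)
scaled = sds.transform(data)                 # mapped into [0, 1]
restored = sds.inverse_transform(scaled)     # back to the original scale
print(scaled.ravel(), restored.ravel())
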
Example #10
def organize_data(train_size=59872):
    # 59872 is roughly 80% of the training set, rounded so that large mini-batches divide evenly

    with open('dev_df.pkl', 'r') as f:
        dev_df = pd.DataFrame(cPickle.load(f))
    
    # Training/CV set
    gender_age_train = pd.read_csv('gender_age_train.csv', index_col=0).drop(['gender', 'age'], axis=1)
    gender_age_train = gender_age_train.join(dev_df)
    
    # Test set
    gender_age_test = pd.read_csv('gender_age_test.csv', index_col=0)
    gender_age_test = gender_age_test.join(dev_df)
    
    # Labels will be in y array; features will be in X matrix; need to encode labels
    # for phone_brand, device_model, and group
    X = np.array(gender_age_train)
    X_test = np.array(gender_age_test)
    
    # Column 0 is the group to be classified, so put it in the y array, then delete it
    y = X[:,0]
    from sklearn.preprocessing import LabelEncoder
    le_y = LabelEncoder()
    y = le_y.fit_transform(y)
    X = np.delete(X,0,1)
    
    # Reformat all labeled columns with label encoders
    le_phone_brand = LabelEncoder()
    le_phone_brand.fit(np.hstack((X[:,0], X_test[:,0])))
    X[:,0] = le_phone_brand.transform(X[:,0])
    X_test[:,0] = le_phone_brand.transform(X_test[:,0])
    
    le_device_model = LabelEncoder()
    le_device_model.fit(np.hstack((X[:,1], X_test[:,1])))
    X[:,1] = le_device_model.transform(X[:,1])
    X_test[:,1] = le_device_model.transform(X_test[:,1])
    
    # Standardize features
    from sklearn.preprocessing import MinMaxScaler
    scaler = MinMaxScaler()
    scaler.fit(np.vstack((X, X_test)))
    X = scaler.transform(X)
    X_test = scaler.transform(X_test)
    
    # Create CV set
    from sklearn.cross_validation import train_test_split
    
    X_train, X_cv, y_train, y_cv = train_test_split(X, y, train_size=train_size, random_state=0)
    return X_train, X_cv, y_train, y_cv, X_test
Example #11
class log_minmax(sklearn.base.BaseEstimator,
                       sklearn.base.TransformerMixin):
    '''Transformer that first takes log1p(X) then calls the minMaxScaler transformer'''
    def __init__(self):
        self.mm_tran = MinMaxScaler()
    
    def fit(self, X, y=None):
        self.mm_tran.fit(np.log1p(X),y)       
        return self

    def transform(self, X):
        Xt = self.mm_tran.transform(np.log1p(X))
        return Xt
        
    def fit_transform(self, X, y=None):
        self.fit(X)
        return self.transform(X)
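
A minimal sketch of the transformer above on hypothetical skewed, non-negative data; each column of log1p(X) is rescaled into [0, 1].

import numpy as np

X = np.array([[0.0, 1.0], [9.0, 99.0], [99.0, 999.0]])  # hypothetical skewed features
lm = log_minmax()
Xt = lm.fit_transform(X)
print(Xt)   # column minima map to 0 and column maxima to 1 after log1p
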
def pipeline():

        #
        test = data[data.watch==1]
        test_a_b = test[['item_id','store_code','a','b']]
        test_y = test.label
        test_x = test.drop(['label','watch','item_id','store_code','a','b'],axis=1)
        test_x.fillna(test_x.median(),inplace=True)

       
        train = data[(data.watch!=0)&(data.watch!=1)]
        
        train_y = train.label

        
        a = list(train.a)
        b = list(train.b)
        train_weight = []
        for i in range(len(a)):
            #train_weight.append(min(a[i],b[i]))
            train_weight.append(a[i]+b[i])
        train_weight = np.array(train_weight)

        train_x = train.drop(['label','watch','item_id','store_code','a','b'],axis=1)

        train_x.fillna(train_x.median(),inplace=True)

        scaler = MinMaxScaler()
        scaler.fit(train_x)
        train_x = scaler.transform(train_x)
        test_x = scaler.transform(test_x)
        
        #model = SVR(kernel='rbf',cache_size=2000,gamma=0.01,C=3.5)#1
        #model = SVR(kernel='rbf',cache_size=2000,gamma=0.01,C=3.5)#2
        #model = SVR(kernel='rbf',cache_size=2000,gamma=0.01,C=3.5)#3
        model = SVR(kernel='rbf',cache_size=2000,gamma=0.01,C=3.5)

        # train
        model.fit(train_x, train_y, sample_weight=train_weight)

        # predict test set
        test_a_b['pred'] = model.predict(test_x)
        test_a_b['y'] = test_y
        cost = cal_cost(test_y.values, test_a_b.pred.values, test_a_b.a.values, test_a_b.b.values)
        test_a_b.to_csv('test/val_{0}.csv'.format(cost[1]),index=None)
class NumericalFeatureMinMaxScaler(object):
    def __init__(self, feature_names):
        self.feature_names = feature_names
        self.scaler = None

    def fit(self, data):
        self.scaler = MinMaxScaler(copy=True)
        self.scaler.fit(np.asarray(data.loc[:, self.feature_names]))
        return self

    def transform(self, data):
        SCALE_SUFFIX = "%s_SCALED"
        scaled_data = self.scaler.transform(np.asarray(data.loc[:, self.feature_names]))
        scaled_data = pd.DataFrame(
            data=scaled_data, columns=[SCALE_SUFFIX % f for f in self.feature_names], index=data.index
        )
        data = pd.concat([data, scaled_data], axis=1)
        return data
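
A short usage sketch of `NumericalFeatureMinMaxScaler` with a hypothetical DataFrame; the scaled copies are appended as new `*_SCALED` columns next to the originals.

import pandas as pd

df = pd.DataFrame({'age': [20, 30, 40], 'income': [1000.0, 2000.0, 4000.0]})  # hypothetical data
scaler = NumericalFeatureMinMaxScaler(feature_names=['age', 'income'])
out = scaler.fit(df).transform(df)
print(list(out.columns))   # ['age', 'income', 'age_SCALED', 'income_SCALED']
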
Example #14
def feature_scaling(feature_matrix,target,reductor=None,scaler=None):
    lda = LDA(n_components=2)    
    minmax = MinMaxScaler(feature_range=(-1,1))
    if not reductor:
        reductor = lda.fit(feature_matrix,target)
    feature_matrix_lda = reductor.transform(feature_matrix)
    if not scaler:
        scaler = minmax.fit(feature_matrix_lda)
    feature_matrix_scaled = scaler.transform(feature_matrix_lda)
    return feature_matrix_scaled,reductor,scaler
Example #15
def organize_data_kf():
    with open('dev_df.pkl', 'r') as f:
        dev_df = pd.DataFrame(cPickle.load(f))
    
    # Training/CV set
    gender_age_train = pd.read_csv('gender_age_train.csv', index_col=0).drop(['gender', 'age'], axis=1)
    gender_age_train = gender_age_train.join(dev_df)
    
    # Test set
    gender_age_test = pd.read_csv('gender_age_test.csv', index_col=0)
    gender_age_test = gender_age_test.join(dev_df)
    
    # Labels will be in y array; features will be in X matrix; need to encode labels
    # for phone_brand, device_model, and group
    X = np.array(gender_age_train)
    X_test = np.array(gender_age_test)
    
    # Column 0 is the group to be classified, so put it in the y array, then delete it
    y = X[:,0]
    from sklearn.preprocessing import LabelEncoder
    le_y = LabelEncoder()
    y = le_y.fit_transform(y)
    X = np.delete(X,0,1)
    
    # Reformat all labeled columns with label encoders
    le_phone_brand = LabelEncoder()
    le_phone_brand.fit(np.hstack((X[:,0], X_test[:,0])))
    X[:,0] = le_phone_brand.transform(X[:,0])
    X_test[:,0] = le_phone_brand.transform(X_test[:,0])
    
    le_device_model = LabelEncoder()
    le_device_model.fit(np.hstack((X[:,1], X_test[:,1])))
    X[:,1] = le_device_model.transform(X[:,1])
    X_test[:,1] = le_device_model.transform(X_test[:,1])
    
    # Standardize features
    from sklearn.preprocessing import MinMaxScaler
    scaler = MinMaxScaler()
    scaler.fit(np.vstack((X, X_test)))
    X = scaler.transform(X)
    X_test = scaler.transform(X_test)
    
    return X, y, X_test
def features_pca(features_fin, n_comp):
    scaler = MinMaxScaler()
    finance_scaler = scaler.fit(features_fin)
    features_fin = finance_scaler.transform(features_fin)

    pca = PCA(n_components=n_comp).fit(features_fin)
    #print 'pca', pca
    features_fin = pca.transform(features_fin)
    features_fin = np.array(features_fin)
    return pca, features_fin
def get_prediction_of_classifier(
        name, classifier,
        train_features_partial, train_labels_partial,
        test_features_partial,
    ):
    """Calculate prediction and write a result to the csv file
    Use partial or full data depending on situation
    @param train_labels_partial: maybe the same as train_features
    """
    #normalize
    normalizer = MinMaxScaler()
    normalizer.fit(np.concatenate([train_features_partial, test_features_partial])) # todo: check
    train_features_partial = normalizer.transform(train_features_partial)
    test_features_partial = normalizer.transform(test_features_partial)

    #predict
    classifier.fit(train_features_partial, train_labels_partial)
    test_labels_predicted = classifier.predict(test_features_partial)
    write_vector(name, test_labels_predicted)
    return test_labels_predicted
Example #18
    def predict(self, test_X):
        # fitting done here
        # not efficient in the long term
        test_X = np.array(test_X)
        enc = OneHotEncoder()
        scal = MinMaxScaler()
        data = np.vstack((self.train_X, test_X))
        enc.fit(self.get_cal(data))
        scal.fit(self.get_cant(data))

        new_train_X1 = enc.transform(self.get_cal(self.train_X))
        new_train_X2 = scal.transform(self.get_cant(self.train_X))
        new_train_X = scipy.sparse.hstack((new_train_X1, new_train_X2))
        new_test_X1 = enc.transform(self.get_cal(test_X))
        new_test_X2 = scal.transform(self.get_cant(test_X))
        new_test_X = scipy.sparse.hstack((new_test_X1, new_test_X2))

        self.model.fit(new_train_X, self.train_Y)
        R = self.model.predict(new_test_X)
        return R
Example #19
def scale(train, test):
    # fit scaler
    scaler = MinMaxScaler(feature_range=(-1, 1))
    scaler = scaler.fit(train)
    # transform train
    train = train.reshape(train.shape[0], train.shape[1])
    train_scaled = scaler.transform(train)
    # transform test
    test = test.reshape(test.shape[0], test.shape[1])
    test_scaled = scaler.transform(test)
    return scaler, train_scaled, test_scaled
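
A minimal usage sketch of `scale` above with hypothetical 2-D arrays; the fitted scaler is returned so predictions can later be inverse-transformed.

import numpy as np

train = np.random.rand(50, 3) * 10              # hypothetical training features
test = np.random.rand(10, 3) * 10
scaler, train_scaled, test_scaled = scale(train, test)
print(train_scaled.min(), train_scaled.max())   # close to -1 and 1
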
def splitTrainTest(data, Labels, train_percent, random_state, minmax=False):
    random.seed(random_state)
    indexList = list(range(len(data)))
    random.shuffle(indexList)
    trainIndexList = indexList[:int(len(data)*train_percent)]
    testIndexList = indexList[int(len(data)*train_percent):]
    train, trainLabels, test, testLabels = data[trainIndexList], Labels[trainIndexList], data[testIndexList], Labels[testIndexList]
    if minmax:
        scaler = MinMaxScaler().fit(train)   # instantiate the scaler before fitting
        train = scaler.transform(train)
        test = scaler.transform(test)
    return train, trainLabels, test, testLabels
Example #21
def scale_data(train, test):

    scaler = MinMaxScaler(feature_range=(-1, 1))

    # determine max and min values on training set (per feature) (scale training set with it)
    scaler = scaler.fit(train)
    train_scaled = scaler.transform(train)

    # apply the found parameters to test set (DO NOT compute them again)
    test_scaled = scaler.transform(test)

    return train_scaled, test_scaled
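
A small numeric check of the comments above (hypothetical arrays): because the test set is transformed with the training minima and maxima, its scaled values are not squeezed into [-1, 1] and can fall outside that range.

import numpy as np

train = np.array([[0.0], [10.0]])
test = np.array([[15.0]])                 # larger than anything seen in training
train_scaled, test_scaled = scale_data(train, test)
print(train_scaled.ravel())               # [-1.  1.]
print(test_scaled.ravel())                # [2.] -- outside [-1, 1], as expected
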
    def transform_and_split(features, labels):
        ########
        # Takes in two dataframes for the features and labels of a dataset and
        # outputs a dictionary holding the training and testing splits of each
        ########
        print('Performing preliminary data split')
        x_train, x_test, y_train, y_test = train_test_split(features, labels, test_size=0.25, random_state=33)

        scaler = StandardScaler()
        scaler.fit(x_train)
        x_train = scaler.transform(x_train)
        x_test = scaler.transform(x_test)

        scaler2 = MinMaxScaler()
        scaler2.fit(x_train)
        x_train = scaler2.transform(x_train)
        x_test = scaler2.transform(x_test)

        data_dict = {'x_test': x_test, 'x_train': x_train,
                     'y_test': y_test, 'y_train': y_train}
        return data_dict
Example #23
def plot_relative_scaling():
    # make synthetic data
    X, _ = make_blobs(n_samples=50, centers=5, random_state=4, cluster_std=2)
    # split it into training and test set
    X_train, X_test = train_test_split(X, random_state=5, test_size=.1)
    # plot the training and test set
    fig, axes = plt.subplots(1, 3, figsize=(13, 4))
    axes[0].scatter(X_train[:, 0], X_train[:, 1],
                    c='b', label="training set", s=60)
    axes[0].scatter(X_test[:, 0], X_test[:, 1], marker='^',
                    c='r', label="test set", s=60)
    axes[0].legend(loc='upper left')
    axes[0].set_title("original data")

    # scale the data using MinMaxScaler
    scaler = MinMaxScaler()
    scaler.fit(X_train)
    X_train_scaled = scaler.transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # visualize the properly scaled data
    axes[1].scatter(X_train_scaled[:, 0], X_train_scaled[:, 1],
                    c='b', label="training set", s=60)
    axes[1].scatter(X_test_scaled[:, 0], X_test_scaled[:, 1], marker='^',
                    c='r', label="test set", s=60)
    axes[1].set_title("scaled data")

    # rescale the test set separately, so that test set min is 0 and test set max is 1
    # DO NOT DO THIS! For illustration purposes only
    test_scaler = MinMaxScaler()
    test_scaler.fit(X_test)
    X_test_scaled_badly = test_scaler.transform(X_test)

    # visualize wrongly scaled data
    axes[2].scatter(X_train_scaled[:, 0], X_train_scaled[:, 1],
                    c='b', label="training set", s=60)
    axes[2].scatter(X_test_scaled_badly[:, 0], X_test_scaled_badly[:, 1], marker='^',
                    c='r', label="test set", s=60)
    axes[2].set_title("improperly scaled data")
Example #24
class Normalizer(BaseEstimator, TransformerMixin):
    def __init__(self, method='standardize'):
        """ Constructor
        @param method: method of normalization. The ones currently supported are:
                'standardize': (x-mean)/sd
                'rescale': (x-min)/(max-min)
        @return: nothing.
        """
        assert method in ['standardize', 'rescale'], 'Unexpected method %s'%method

        self.method = method

        if method == 'standardize':
            self._scaler = StandardScaler()
        else:
            self._scaler = MinMaxScaler()

    def fit(self, X, y=None, **params):
        """
        @return: the caller itself
        """

        self._scaler.fit(X, y)

        return self

    def transform(self, X, **params):
        """
        @return: transformed data
        """

        return self._scaler.transform(X)

    def fit_transform(self, X, y=None, **params):
        """
        @return: transformed data
        """

        return self._scaler.fit_transform(X, y, **params)
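
A brief sketch checking the two modes documented in the constructor of the `Normalizer` wrapper above (not sklearn's own Normalizer), on a hypothetical single-feature column.

import numpy as np

X = np.array([[1.0], [2.0], [3.0]])   # hypothetical column

print(Normalizer(method='rescale').fit_transform(X).ravel())       # [0.  0.5 1. ]
print(Normalizer(method='standardize').fit_transform(X).ravel())   # zero mean, unit variance
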
def pipeline():

        #
        test = data[data.watch==0]
        test_a_b = test[['item_id','store_code','a','b']]
        test_x = test.drop(['label','watch','item_id','store_code','a','b'],axis=1)
        test_x.fillna(test_x.median(),inplace=True)

       
        train = data[data.watch!=0]
        
        train_y = train.label

        
        a = list(train.a)
        b = list(train.b)
        train_weight = []
        for i in range(len(a)):
            #train_weight.append(min(a[i],b[i]))
            train_weight.append(a[i]+b[i])
        train_weight = np.array(train_weight)

        train_x = train.drop(['label','watch','item_id','store_code','a','b'],axis=1)

        train_x.fillna(train_x.median(),inplace=True)

        scaler = MinMaxScaler()
        scaler.fit(train_x)
        train_x = scaler.transform(train_x)
        test_x = scaler.transform(test_x)
        
        model = SVR(kernel='linear',cache_size=2000)

        # train
        model.fit(train_x, train_y, sample_weight=train_weight)

        # predict test set
        test_a_b['pred'] = model.predict(test_x)
        test_a_b.to_csv('test/test_all.csv',index=None)
    def get_input(self):
        # Input data.
        # Load the training, validation and test data into constants that are
        # attached to the graph.
        self.x_train, self.y_train,self.x_validation,self.y_validation = self.get_train_validationset()
        self.x_train, self.y_train,self.x_validation,self.y_validation = self.x_train.as_matrix(), self.y_train.as_matrix().reshape((-1,1)),\
                                                                         self.x_validation.as_matrix(),self.y_validation.as_matrix().reshape((-1,1))
#         self.x_train, self.y_train,self.x_validation,self.y_validation = self.x_train.astype(np.float32), self.y_train.astype(np.float32),\
#                                                                          self.x_validation.astype(np.float32),self.y_validation.astype(np.float32)
        sc = MinMaxScaler()
        sc.fit(self.x_train)
        self.x_train= sc.transform(self.x_train)
        self.x_validation= sc.transform(self.x_validation)
        
        self.inputlayer_num = len(self.get_used_features())
        self.outputlayer_num = 1
        
        # Input placehoolders
        with tf.name_scope('input'):
            self.x = tf.placeholder(tf.float32, [None, self.inputlayer_num], name='x-input')
            self.y_true = tf.placeholder(tf.float32, [None, self.outputlayer_num ], name='y-input')
        self.keep_prob = tf.placeholder(tf.float32, name='drop_out')
        
        return
Example #27
from sklearn.model_selection import train_test_split,cross_val_score
x_train, x_test, y_train, y_test = train_test_split(x, y, train_size = 0.8, shuffle = True, random_state = 66 )

# 1
from tensorflow.keras.utils import to_categorical
y_train = to_categorical(y_train)
y_test = to_categorical(y_test)

print(x_train.shape)
print(x_test.shape)


from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaler.fit(x_train)
x_train = scaler.transform(x_train).reshape(142, 13)  # transform x_train and store the scaled values back into x_train
x_test = scaler.transform(x_test).reshape(36, 13)  # transform x_test with the same scaler and store it back into x_test
 

# 2
def build_model(drop = 0.5, optimizer = 'adam'):
    inputs = Input(shape=(13,), name='input')
    x = Dense(512, activation='relu', name='hidden1')(inputs)
    x = Dropout(drop)(x)
    x = Dense(256, activation='relu', name='hidden2')(x)
    x = Dropout(drop)(x)
    x = Dense(128,activation='relu', name='hidden3')(x)
    x = Dropout(drop)(x)
    outputs = Dense(3, activation='softmax', name='output')(x)
    model = Model(inputs=inputs, outputs=outputs)
Example #28
import plotly
import plotly.graph_objs as go
import plotly.plotly as py
import tensorflow as tf
from keras.layers import LSTM, Dense
from keras.models import Sequential
from keras.preprocessing.sequence import TimeseriesGenerator
from plotly.offline import plot
from sklearn.preprocessing import MinMaxScaler

filename = 'AggregatedData/Data_Combined.csv'
df = pd.read_csv(filename)
df['Mean'] = df['Mean'].astype('int32')

scaler = MinMaxScaler()
scaler.fit(df['Mean'].values.reshape((-1, 1)))
df['Mean'] = scaler.transform(df['Mean'].values.reshape((-1, 1))).reshape(-1)

# scaler = MinMaxScaler()
# scaler.fit(df['Count'].values.reshape((-1,1)))
# df['Count'] = scaler.transform(df['Count'].values.reshape((-1,1))).reshape(-1)

locs = df['Neighbourhood'].unique()
drop_locations = [
    'HARLEM-WEST', 'JAVITS CENTER', 'SOUTHBRIDGE', 'MANHATTAN-UNKNOWN',
    'ROOSEVELT ISLAND'
]
locations = []
for x in locs:
    if x not in drop_locations:
        locations.append(x)
# In[195]:

# Scatter plots for continuous data
print("Scatter plots for continuous data and the target variable")
import seaborn as sns
sns.set(style="ticks", color_codes=True)
g = sns.pairplot(df_for_plots[continuous_column])

plt.show()
# In[196]:

# Scaling/Normalization
print("Performing Scaling/Normalization of the dataset")
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaler.fit(df[continuous_column])
df[continuous_column] = scaler.transform(df[continuous_column])

# In[222]:

# Probability Distribution of all the continuous values columns
print("Probability Distribution of all the continuous values columns")
df[continuous_column].hist()

plt.show()
# # Correlation based independence check for columns (drop columns > 0.3)

# In[202]:

print("Checking if there are any columns with correlation greater than 0.4")
corr = df.corr().iloc[:, :3]
Example #30
def main():
    # List files available
    print(os.listdir("../input/"))

    # Training data
    app_train = pd.read_csv('../input/application_train.csv')
    print('Training data shape: ', app_train.shape)
    app_train.head()
    # Testing data features
    app_test = pd.read_csv('../input/application_test.csv')
    print('Testing data shape: ', app_test.shape)
    app_test.head()
    # Class imbalance
    target_counts = app_train['TARGET'].value_counts()
    target_ratio0 = target_counts[0] / (target_counts[0] + target_counts[1])
    target_ratio1 = target_counts[1] / (target_counts[0] + target_counts[1])
    print('target_ratio0=', target_ratio0, ', target_ratio1=', target_ratio1)
    app_train['TARGET'].astype(int).plot.hist()
    plt.xlabel('TARGET')
    # Handle missing values
    missing_values = missing_values_table(app_train)
    app_train = fillNanData(app_train, missing_values)
    app_test = fillNanData(app_test, missing_values)
    app_test = fillNanData(app_test, missing_values_table(app_test))
    # Categorical feature analysis
    # Number of each type of column
    print(app_train.dtypes.value_counts())
    print(app_test.dtypes.value_counts())
    # Number of unique classes in each object column
    print(
        app_train.select_dtypes(include=['object']).apply(pd.Series.nunique,
                                                          axis=0))
    print(
        app_test.select_dtypes(include=['object']).apply(pd.Series.nunique,
                                                         axis=0))
    # Create a label encoder object
    le = LabelEncoder()
    le_count = 0

    # Iterate through the columns
    for col in app_train:
        if app_train[col].dtype == 'object':
            print(col, ":", len(list(app_train[col].unique())))
            # If 2 or fewer unique categories
            if len(list(app_train[col].unique())) <= 2:
                # Train on the training data
                le.fit(app_train[col])
                # Transform both training and testing data
                app_train[col] = le.transform(app_train[col])
                app_test[col] = le.transform(app_test[col])

                # Keep track of how many columns were label encoded
                le_count += 1

    print('%d columns were label encoded.' % le_count)
    # one-hot encoding of categorical variables
    app_train = pd.get_dummies(app_train)
    app_test = pd.get_dummies(app_test)
    print('Training Features shape: ', app_train.shape)
    print('Testing Features shape: ', app_test.shape)
    # Feature alignment
    train_labels = app_train['TARGET']
    # Align the training and testing data, keep only columns present in both dataframes
    app_train, app_test = app_train.align(app_test, join='inner', axis=1)
    print('Training Features shape: ', app_train.shape)
    print('Testing Features shape: ', app_test.shape)
    app_train['TARGET'] = train_labels

    # Correlation analysis
    # Find correlations with the target and sort
    correlations = app_train.corr()['TARGET']
    # Display correlations
    abscorrelations = abs(correlations)
    abscorrelations.plot()
    plt.ylabel('correlations')
    plt.show()
    abscorrelations = abscorrelations.sort_values()
    print('Most Positive Correlations: \n', correlations.tail(15))

    # Imputation and normalization
    from sklearn.preprocessing import MinMaxScaler, Imputer
    # Drop the target from the training data
    if 'TARGET' in app_train:
        train = app_train.drop(['TARGET'], axis=1).copy()
    else:
        train = app_train.copy()
    features = list(train.columns)

    # Copy of the testing data
    test = app_test.copy()

    # Median imputation of missing values
    imputer = Imputer(strategy='median')

    # Scale each feature to 0-1
    scaler = MinMaxScaler(feature_range=(0, 1))

    # Fit on the training data
    imputer.fit(train)

    # Transform both training and testing data
    train = imputer.transform(train)
    test = imputer.transform(app_test)

    # Repeat with the scaler
    scaler.fit(train)
    train = scaler.transform(train)
    test = scaler.transform(test)

    print('Training data shape: ', train.shape)
    print('Testing data shape: ', test.shape)

    # LR
    from sklearn.linear_model import LogisticRegression

    # Make the model with the specified regularization parameter
    log_reg = LogisticRegression(penalty='l2',
                                 C=0.0001,
                                 class_weight='balanced',
                                 max_iter=500,
                                 solver='sag',
                                 verbose=1,
                                 n_jobs=-1)

    # Train on the training data

    log_reg.fit(train, train_labels)

    # predict_proba returns two columns: P(class 0) in the first, P(class 1) in the second
    log_reg_pred = log_reg.predict_proba(test)[:, 1]
    submit = app_test[['SK_ID_CURR']]
    submit['TARGET'] = log_reg_pred

    # Save the submission to a csv file
    submit.to_csv('log_reg_baseline.csv', index=False)
    print('save log_reg_baseline.csv to file!')
Example #31
def preprocessing_features(df_train, df_test, process_continuous):
    to_delete_features = ['default','pdays']
    continuous_features = ['age', 'campaign', 'previous', 'emp.var.rate', 'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed']
    categorical_ordered_features = ['education', 'housing', 'loan', 'contact','month', 'day_of_week','poutcome']
    categorical_unordered_features = ['job', 'marital']
    
    unknown_present_features = ['job','marital','education','housing','loan']
    
    ### Delete Features
    for feat in to_delete_features:
        print "\n--------- deleting feature --------- ",feat
        del df_train[feat]
        del df_test[feat]
    
    ### Fill unknowns in the features with feature-mode
    for feat in unknown_present_features:
        print "\n--------- Replacing unknowns in feature --------- ",feat
        feature_value_counts = df_train[feat].value_counts()
        print "Replaced with: ",feature_value_counts.idxmax()
        df_train.loc[df_train[feat] == "unknown",feat] = feature_value_counts.idxmax()
        df_test.loc[df_test[feat] == "unknown",feat] = feature_value_counts.idxmax()
    
    
    ### Label Categorical Ordered Features
    label_dict = {'education':{'illiterate':0, 'basic.4y':4, 'basic.6y':6, 'basic.9y':9, 'high.school':11, 'professional.course':13, 'university.degree':14},
                  'housing':{'no':0,'yes':1},
                  'loan':{'no':0,'yes':1},
                  'contact':{'telephone':0,'cellular':1},
                  'month':{'jan':1,'feb':2,'mar':3,'apr':4,'may':5,'jun':6,'jul':7,'aug':8,'sep':9,'oct':10,'nov':11,'dec':12},
                  'day_of_week':{'mon':1,'tue':2,'wed':3,'thu':4,'fri':5,'sat':6,'sun':7},
                  'poutcome':{'nonexistent':0,'failure':1,'success':2}}
    
    for feat in categorical_ordered_features:
        print "\n--------- Labelling feature --------- ",feat
        df_train = df_train.replace({feat:label_dict[feat]})
        df_test = df_test.replace({feat:label_dict[feat]})
        print "Labelled as: ",label_dict[feat]

        
    ### One hot encoding Categorical Un-ordered Features  
    for feat in categorical_unordered_features:
        print "\n--------- One Hot Encoding feature --------- ",feat
        label_encoder = LabelEncoder()
        label_encoder.fit(df_train[feat])
        df_train[feat] = label_encoder.transform(df_train[feat])
        df_test[feat] = label_encoder.transform(df_test[feat])

    one_hot_encoder = OneHotEncoder(sparse=False)
    one_hot_encoder.fit(df_train[categorical_unordered_features])
    one_hot_encoded_array_train = one_hot_encoder.transform(df_train[categorical_unordered_features])
    one_hot_encoded_df_train = pd.DataFrame(one_hot_encoded_array_train, index=df_train.index)
    one_hot_encoded_array_test = one_hot_encoder.transform(df_test[categorical_unordered_features])
    one_hot_encoded_df_test = pd.DataFrame(one_hot_encoded_array_test, index=df_test.index)
        
    df_train = pd.concat([df_train,one_hot_encoded_df_train], axis=1) #concatenate old columns with new one hot encoded columns
    df_test = pd.concat([df_test,one_hot_encoded_df_test], axis=1) #concatenate old columns with new one hot encoded columns

    df_train = df_train.drop(categorical_unordered_features, axis=1) #Delete columns which were one hot encoded
    df_test = df_test.drop(categorical_unordered_features, axis=1) #Delete columns which were one hot encoded

    ### Normalization or Standardization of Continuous Features
    if process_continuous == "Standardize":
        print "\n--------- Standardizing Continuous Features (Mean=0, Standard Deviation=1) --------- "
        standardization = StandardScaler()
        standardization.fit(df_train[continuous_features])
        df_train[continuous_features] = standardization.transform(df_train[continuous_features])
        df_test[continuous_features] = standardization.transform(df_test[continuous_features])

    elif process_continuous == "Normalize":
        print "\n--------- Normalizing Continuous Features (Min=0, Max=1) --------- "
        min_max_scaling = MinMaxScaler()
        min_max_scaling.fit(df_train[continuous_features])
        df_train[continuous_features] = min_max_scaling.transform(df_train[continuous_features])
        df_test[continuous_features] = min_max_scaling.transform(df_test[continuous_features])

    ### Return pre-processed df
    return df_train, df_test
Example #32
    if using_difference == True:
        Diff = '_Diff'
        # use dataset_difference for training
        dataset = dataset_difference
    else:
        # use dataset for training
        dataset = ts_values_array

    # split into train and test sets
    train_size = int(len(dataset) * 0.8)
    print('train_size: %i' % train_size)

    dataset_2d = atleast_2d(dataset).T
    scaler = MinMaxScaler(feature_range=(0, 1))
    dataset_scaled = scaler.fit_transform(dataset_2d)

    train, test = dataset_scaled[0:train_size, :], dataset_scaled[
        train_size:, :]
    # data shape should be (lens_ts, n_features)
    train_input = train[:-1, :]

    train_target = train[1:, :]

    test_input = test[:-1, :]

    test_target = test[1:, :]

    model_esn = SimpleESN(n_readout=1000,
                          n_components=1000,
    print('#================================================#')
    print(' Training Datasize: '+str(X_train.shape[0])+' and test datasize: ' + str(X_test.shape[0]) + '.  ')
    print('#================================================#')
    
    # # Set First Run to Off
    First_run = False


# #### Pre-Process Data

# In[10]:


# Initialize Scaler
scaler = MinMaxScaler()
scaler.fit(X_train)

# Train Scaler
X_train_scaled = scaler.transform(X_train)
# Map to Test Set
X_test_scaled = scaler.transform(X_test)


# ### Visualize Data

# #### Train

# In[11]:


if is_visuallty_verbose:
Example #34
    plt.xlabel("eso")
    plt.ylabel("poi")
    #plt.show()
else:
    print(
        "outlierCleaner() is returning an empty list, no refitting to be done")

### Task 3: Create new feature(s)
### Store to my_dataset for easy export below.
# create np arrays for the rescaler
salary = np.nan_to_num(np.reshape((np.array(Salary)), (len(Salary), 1)))
bonus = np.nan_to_num(np.reshape((np.array(Bonus)), (len(Bonus), 1)))
eso = np.nan_to_num(np.reshape((np.array(ESO)), (len(ESO), 1)))
#rescale salary, bonus and exercised stock options
scaler = MinMaxScaler()
scaled = scaler.fit(salary)
scaled_salary = scaled.transform(salary)
scaled = scaler.fit(bonus)
scaled_bonus = scaled.transform(bonus)
scaled = scaler.fit(eso)
scaled_eso = scaled.transform(eso)
#append data_dict with rescaled values and create new
#value for the percent of emails received from POI
count = 0
for name in names:
    data_dict[name]['salary'] = scaled_salary[count][0]
    data_dict[name]['bonus'] = scaled_bonus[count][0]
    data_dict[name]['exercised_stock_options'] = scaled_eso[count][0]
    count = count + 1
    data_dict[name]['Percent_Emails_from_POI'] = float(
        data_dict[name]['from_poi_to_this_person']) / float(
def minmax_scaling(df):
    scale = MinMaxScaler()
    df = scale.fit_transform(df)
    return df
class MNIST:
    def __init__(self):
        # Load the dataset
        (self.x_train, self.y_train), (self.x_test, self.y_test) = mnist.load_data()
        self.x_train_ = None
        self.x_val = None
        self.y_train_ = None
        self.y_val = None
        # Convert to float32
        self.x_train = self.x_train.astype(np.float32)  
        self.y_train = self.y_train.astype(np.float32)  
        self.x_test = self.x_test.astype(np.float32)
        self.y_test = self.y_test.astype(np.float32)
        # Reshape the x-Data to shape (num_examples, width, height, depth)
        self.x_train = np.expand_dims(self.x_train, axis=-1)  # add one more dim for the depth info
        self.x_test = np.expand_dims(self.x_test, axis=-1)    
        # Save important data attributes as variables
        self.train_size = self.x_train.shape[0]
        self.test_size = self.x_test.shape[0]
        self.val_size = 0
        self.width = self.x_train.shape[1]
        self.height = self.x_train.shape[2]
        self.depth = self.x_train.shape[3]
        self.num_classes = 10                                 # np.max(self.y_train) +1
        # Reshape the y-Data to One-Hot encoding
        self.y_train = to_categorical(self.y_train, num_classes=self.num_classes)
        self.y_test = to_categorical(self.y_test, num_classes=self.num_classes)
    
    def get_train_set(self):
        return self.x_train, self.y_train
    
    def get_test_set(self):
        return self.x_test, self.y_test
    
    def get_splitted_train_validation_set(self):
        # train = 60,000
        # train_: 40,200, val: 19,800
        self.x_train_, self.x_val, self.y_train_, self.y_val = train_test_split(self.x_train, self.y_train, test_size=0.33)
        self.val_size = self.x_val.shape[0]
        self.train_splitted_size = self.x_train_.shape[0]
        return self.x_train_, self.x_val, self.y_train_, self.y_val
    
    def data_augmentation(self, augment_size=5000):
        # Create an instance of the image data generator class
        image_generator = ImageDataGenerator(
            rotation_range=10,         # rotate up to +/- 10 degrees
            zoom_range=0.05,           # zoom up to 5%
            width_shift_range=0.05,    # horizontal shift up to 5%
            height_shift_range=0.05,   # vertical shift up to 5%
            fill_mode='constant',
            cval=0.0)
        # fit the data generator
        image_generator.fit(self.x_train, augment=True)
        # Get random train images for the data augmentation
        rand_idxs = np.random.randint(self.train_size, size=augment_size)
        x_augmented = self.x_train[rand_idxs].copy()
        y_augmented = self.y_train[rand_idxs].copy()
        x_augmented = image_generator.flow(x_augmented, np.zeros(augment_size),
                                            batch_size=augment_size, shuffle=False).next()[0]  # next() returns a batch; the images are at index 0
        # Append the augmented images to the train set
        self.x_train = np.concatenate((self.x_train, x_augmented))
        self.y_train = np.concatenate((self.y_train, y_augmented))
        self.train_size = self.x_train.shape[0]
    
    def data_preprocessing(self, preprocess_mode='standard'):
        # Preprocess the data
        if preprocess_mode == 'standard':
            self.scaler = StandardScaler()
        else:
            self.scaler = MinMaxScaler(feature_range=(0 ,1))
        self.scaler.fit(self.x_train.reshape(self.train_size, 784))
        self.x_train = self.scaler.transform(self.x_train.reshape(self.train_size, 784))
        self.x_test = self.scaler.transform(self.x_test.reshape(self.test_size, 784))
        self.x_train = self.x_train.reshape(self.train_size, self.width, self.height, self.depth)
        self.x_test = self.x_test.reshape(self.test_size, self.width, self.height, self.depth)
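
A short usage sketch of the class above, assuming the Keras MNIST download and the imports used by the class are available; any `preprocess_mode` other than 'standard' selects the MinMaxScaler branch.

mnist_data = MNIST()                                       # loads and reshapes MNIST
mnist_data.data_preprocessing(preprocess_mode='minmax')    # MinMaxScaler to [0, 1]
x_train, y_train = mnist_data.get_train_set()
print(x_train.shape, x_train.min(), x_train.max())         # (60000, 28, 28, 1) 0.0 1.0
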
Example #37
clf = classifiers[1]
f.write("Random Forest:\n")
print(cross_val_score(clf,x_all,y_all,scoring='accuracy',cv=10))
accuracy=cross_val_score(clf,x_all,y_all,scoring='accuracy',cv=10).mean()*100
f.write("CV accuracy score = {0:.3f}\n".format(accuracy))

clf = classifiers[2]
f.write("Logistic Regression:\n")
print(cross_val_score(clf,x_all,y_all,scoring='accuracy',cv=10))
accuracy=cross_val_score(clf,x_all,y_all,scoring='accuracy',cv=10).mean()*100
f.write("CV accuracy score = {0:.3f}\n".format(accuracy))
#Split off test and train data and normalize data
x_trn, x_tst, y_trn, y_tst = train_test_split(x_all, y_all, test_size=0.4, random_state=42)
print(x_trn.shape)
scaler = MinMaxScaler()
scaler.fit(x_trn)
x_trn_n=scaler.transform(x_trn)
x_tst_n=scaler.transform(x_tst)
#Build and fit the Decision Tree model, then print the weights of the top 10 features contributing to income greater than 50k
clf = classifiers[0]
model=clf.fit(x_trn_n,y_trn)
imp1=model.feature_importances_
var2imp1=dict(zip(list(df1),imp1))
var2imp1_sorted=pd.DataFrame(columns=['variable','weight'])
for key in sorted(var2imp1, key=lambda k:abs(var2imp1[k]),reverse=True):
    temp=pd.DataFrame([[key,var2imp1[key]]],columns=['variable','weight'])
    var2imp1_sorted=var2imp1_sorted.append(temp)
print("Top 10 important variables-Decision Tree:")
print(var2imp1_sorted[0:10])
f.write("Top 10 Weighted Variables - Decision Tree:"+"\n")
f.write("Rank\tVariable\tWeight\n")
Example #38
def data_preprocess(
    file_name: str,
    args: Dict,
    impute_method: str = "mode",
    scaling_method: str = "minmax",
):
    """Load the data and preprocess into 3d numpy array.
    Preprocessing includes:
    1. Remove outliers
    2. Extract sequence length for each patient id
    3. Impute missing data
    4. Normalize data
    5. Sort dataset according to sequence length

    Args:
    - file_name (str): CSV file name
    - args (dict): parameters for preprocessing data
    - impute_method (str): The imputation method ("median" or "mode")
    - scaling_method (str): The scaler method ("standard" or "minmax")

    Returns:
    - data: preprocessed data
    - time: ndarray of ints indicating the length for each data
    - labels: ndarray of labels, one per preprocessed sequence
    - params: the parameters to rescale the data
    """
    padding_value = args.padding_value
    if not isinstance(padding_value, float):
        raise ValueError("Must provide padding value with type `float`")

    #########################
    # Load data
    #########################

    index = 'Idx'
    label = 'Label'

    # Load .csv file, columns are typically as follows:
    # | Index | Time | Feature_1 | ... | Feature_n | Label |
    print("Loading data...\n")
    ori_data = pd.read_csv(file_name)

    # Remove spurious column, so that column 0 is now 'Index'.
    if ori_data.columns[0] == "Unnamed: 0":
        ori_data = ori_data.drop(["Unnamed: 0"], axis=1)

    #########################
    # Remove outliers from dataset
    #########################

    no = ori_data.shape[0]
    z_scores = stats.zscore(ori_data, axis=0, nan_policy='omit')
    z_filter = np.nanmax(np.abs(z_scores), axis=1) < 3
    ori_data = ori_data[z_filter]
    print(f"Dropped {no - ori_data.shape[0]} rows (outliers)\n")

    # Parameters
    uniq_id = np.unique(ori_data[index])
    no = len(uniq_id)
    dim = len(ori_data.columns) - 1  # Ignore index

    #########################
    # Impute, scale and pad data
    #########################

    # Initialize scaler
    if scaling_method == "minmax":
        scaler = MinMaxScaler()
        scaler.fit(ori_data)
        params = [scaler.data_min_, scaler.data_max_]

    elif scaling_method == "standard":
        scaler = StandardScaler()
        scaler.fit(ori_data)
        params = [scaler.mean_, scaler.var_]

    # Imputation values
    if impute_method == "median":
        impute_vals = ori_data.median()
    elif impute_method == "mode":
        impute_vals = stats.mode(ori_data).mode[0]
    else:
        raise ValueError("Imputation method should be `median` or `mode`")

    # TODO: Sanity check for padding value
    # if np.any(ori_data == padding_value):
    #     print(f"Padding value `{padding_value}` found in data")
    #     padding_value = np.nanmin(ori_data.to_numpy()) - 1
    #     print(f"Changed padding value to: {padding_value}\n")
    args.padding_value = padding_value

    # Output initialization
    output = np.empty([no, args.max_seq_len,
                       dim])  # Shape:[no, max_seq_len, dim]
    output.fill(args.padding_value)
    labels = np.empty([no, 1], dtype=int)
    time = np.empty([no], dtype=int)

    print("Preprocessing data...\n")
    # For each uniq id
    for i in tqdm(range(no)):
        # Extract the time-series data and label with a certain admissionid
        curr_data = ori_data[ori_data[index] == uniq_id[i]].to_numpy()
        curr_label = int(curr_data[0, -1])

        # Impute missing data
        curr_data = imputer(curr_data, impute_vals)

        # Normalize data
        curr_data = scaler.transform(curr_data)

        # Extract time and assign to the preprocessed data (Excluding ID)
        curr_no = len(curr_data)

        # Pad data to `max_seq_len`
        if curr_no >= args.max_seq_len:
            output[i, :, :] = curr_data[:args.max_seq_len,
                                        1:]  # Shape: [1, max_seq_len, dim]
            time[i] = args.max_seq_len
        else:
            output[
                i, :curr_no, :] = curr_data[:,
                                            1:]  # Shape: [1, max_seq_len, dim]
            time[i] = curr_no

        # Pad label for data
        labels[i] = curr_label

    return output, time, labels, params
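
A hedged usage sketch of `data_preprocess`, assuming a CSV laid out as in the docstring, an `args` object carrying the attributes the function reads (`padding_value`, `max_seq_len`), and the `imputer` helper defined elsewhere in the same source; the file name here is hypothetical.

from argparse import Namespace

args = Namespace(padding_value=-1.0, max_seq_len=100)   # hypothetical preprocessing parameters
data, time, labels, params = data_preprocess(
    "patients.csv",              # hypothetical CSV: | Idx | Time | Feature_1 | ... | Label |
    args,
    impute_method="median",
    scaling_method="minmax",
)
print(data.shape)                # (num_ids, max_seq_len, num_features)
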
# Initial values
ONE_BATCH_SIZE = 60
BATCH_SIZE = 10
EPOCHS = 30
TRAIN_TEST_SPLIT_POINT = 0.8
X_TRAIN = []
Y_TRAIN = []
X_TEST = []
Y_TEST = []

# Load Dataframe
data = load_standard_data_frame(get_hourly_last_3_months_data('BTC'))

# Scale data
sc = MinMaxScaler(feature_range=(0, 1))
sc.fit(data)
data = sc.transform(data)

# Split into training and test set
training_set, test_set = splitter(data, TRAIN_TEST_SPLIT_POINT)

# Prepare samples
X_TRAIN, X_TEST, Y_TRAIN, Y_TEST = prepare_samples(ONE_BATCH_SIZE,
                                                   training_set, test_set,
                                                   X_TRAIN, Y_TRAIN, X_TEST,
                                                   Y_TEST, True)

# Build LSTM model
model = Sequential()
model.add(
    LSTM(units=16, return_sequences=False, input_shape=(X_TRAIN.shape[1], 6)))
Example #40
y = np.loadtxt('D:/ser/Train/P_Train_27KB.txt')
y = np.array(y)
y = pd.DataFrame(y, columns=['id', 'P'])
s = y.P
id = y.drop('P', axis=1)
Y = np.array(s)
# y=np.loadtxt('D:/ser/Train/P_Train_27KB.txt')
# y=np.array(y)
# y=pd.DataFrame(y,columns=['id','P'])
# s=y.P
# Y=np.array(s)
# print X.shape
Y = Y.reshape(-1, 1)

le = MinMaxScaler(feature_range=(-3, 3))
le.fit(Y)
Y = le.transform(Y)
train_x, valid_x, train_y, valid_y = train_test_split(X,
                                                      Y,
                                                      test_size=0.2,
                                                      random_state=0)
svr_model = SVR(kernel='rbf', C=10.0, epsilon=0.01)
svr_model.fit(train_x, train_y)
pre_test = svr_model.predict(valid_x)
print "the result of 1582 dimensional features"
#metrics model
mse = metrics.mean_squared_error(pre_test, valid_y)
rmse = np.sqrt(mse)
r2 = metrics.r2_score(pre_test, valid_y)
import math
Example #41
shuffle_df = shuffle(df_sample)

df_train = shuffle_df[0:2400]
df_test = shuffle_df[2400:]

train_feature = np.array(df_train.values[:,0:29])
train_label = np.array(df_train.values[:,-1])
test_feature = np.array(df_test.values[:,0:29])
test_label = np.array(df_test.values[:,-1])

from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

scaler.fit(train_feature)
train_feature_trans = scaler.transform(train_feature)
test_feature_trans = scaler.transform(test_feature)

from keras.layers import Dense
from keras.layers import Dropout

import matplotlib.pyplot as plt 
def show_train_history(train_history,train,validation):
    plt.plot(train_history.history[train])
    plt.plot(train_history.history[validation])
    plt.title('Train History')
    plt.ylabel(train)
    plt.xlabel('Epoch')
    plt.legend(['train', 'validation'], loc='best')
    plt.show()
Example #42
class DnnModel(AbstractModel, ABC):
    def __init__(self, params):
        super().__init__(params)
        self.w2v_model_parameter = {
            'max_len': 150,
            'sg': 1,
            'hs': 1,
            'min_count': 0,
            'window': 1,
            'size': 5,
            'iter': 30,
            'workers': 8
        }
        self.w2v = Word2Vector(**self.w2v_model_parameter)
        self.epoch = params.epoch
        self.scaler = None
        self.regression = KerasRegression(encoding_dim=1)
        self.data = None

    def build_word2vector(self, data):
        self.data = list(data)
        if self.w2v.model:
            self.w2v.update(self.data)
        else:
            self.w2v.fit(self.data)

    def fit(self, data):
        self.build_word2vector(data)
        list_vec = []
        list_cost = []
        for sql, duration_time in self.data:
            if check_illegal_sql(sql):
                continue
            filter_template = templatize_sql(sql)
            vector = self.w2v.str2vec(filter_template)
            list_vec.append(vector)
            list_cost.append(duration_time)

        features = np.array(list_vec)
        labels = np.array(list_cost)

        labels = labels.reshape(-1, 1)
        self.scaler = MinMaxScaler(feature_range=(0, 1))
        self.scaler.fit(labels)
        labels = self.scaler.transform(labels)
        self.regression.fit(features, labels, epochs=self.epoch)

    def transform(self, data):

        feature_list = []
        data_backup = list(data)
        error_list = []
        for idx_error, sql in enumerate(data_backup):
            if check_illegal_sql(sql):
                error_list.append(idx_error)
                continue
            filter_template = templatize_sql(sql)
            vector = self.w2v.str2vec(filter_template)
            feature_list.append(vector)

        features = np.array(feature_list)
        predictions = self.regression.predict(features)
        predictions = np.abs(predictions)
        score = self.scaler.inverse_transform(predictions)
        if error_list:
            for item in error_list:
                score = np.insert(score, item, -1)
        score = np.hstack(
            (np.array(data_backup).reshape(-1, 1), score.reshape(-1,
                                                                 1))).tolist()
        return score

    def load(self, filepath):
        realpath = os.path.realpath(filepath)
        if os.path.exists(realpath):
            dnn_path = os.path.join(realpath, 'dnn_model.h5')
            word2vector_path = os.path.join(realpath, 'w2v.model')
            scaler_path = os.path.join(realpath, 'scaler.pkl')
            self.regression.load(dnn_path)
            self.w2v.load(word2vector_path)
            with open(scaler_path, 'rb') as f:
                self.scaler = pickle.load(f)
        else:
            logging.error("{} not exist.".format(realpath))

    def save(self, filepath):
        realpath = os.path.realpath(filepath)
        if not os.path.exists(realpath):
            os.makedirs(realpath, mode=0o700)
        if oct(os.stat(realpath).st_mode)[-3:] != '700':
            os.chmod(realpath, stat.S_IRWXU)
        dnn_path = os.path.join(realpath, 'dnn_model.h5')
        word2vector_path = os.path.join(realpath, 'w2v.model')
        scaler_path = os.path.join(realpath, 'scaler.pkl')
        self.regression.save(dnn_path)
        self.w2v.save(word2vector_path)
        with open(scaler_path, 'wb') as f:
            pickle.dump(self.scaler, f)
        print("DNN model is stored in '{}'".format(realpath))
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np
%matplotlib inline

mm_s = MinMaxScaler()
iris = load_iris()

y = iris.target[(iris.target == 0)|(iris.target == 1)]
X = iris.data[(iris.target == 0)|(iris.target == 1)][:,[2,3]]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state =1, stratify=y)

mm_s.fit(X)
X_train_std = mm_s.transform(X_train)
X_test_std = mm_s.transform(X_test)

legd = LogisticsRegressionGD(eta=0.01, n_iter=1000, random_state=1)
legd.fit(X_train_std, y_train)

plot_decision_regions(X_train_std, y_train, classifier=legd)
plt.ylabel("petal length [standardized]")
plt.ylabel("petal width [standardized]")
plt.legend(loc="upper left")
plt.tight_layout()
plt.show()


from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.datasets import make_blobs
from sklearn.model_selection import train_test_split
# Import the MinMaxScaler preprocessing tool
from sklearn.preprocessing import MinMaxScaler
import numpy as np
import matplotlib.pyplot as plt

# Generate a dataset with 500 samples and 5 classes
X, y = make_blobs(n_samples=500, centers=5, random_state=8)
# Split the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=8)

# Use MinMaxScaler to preprocess the data so that all values are non-negative
scaler = MinMaxScaler()
scaler.fit(X_train)  # fit on the training set only

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fit the data with Multinomial Naive Bayes
mnb = MultinomialNB()
mnb.fit(X_train_scaled, y_train)

# Limit the ranges of the x and y axes
x_min, x_max = X[:, 0].min() - 0.5, X[:, 0].max() + 0.5
y_min, y_max = X[:, 1].min() - 0.5, X[:, 1].max() + 0.5
# Use different background colors for the different classes
xx, yy = np.meshgrid(np.arange(x_min, x_max, .02),
                     np.arange(y_min, y_max, .02))
from sklearn.metrics import mean_squared_error, r2_score

import numpy as np

path = './data/'
name = 'boston'

x =np.load(path+name+'x.npy')
y =np.load(path+name+'y.npy')

print(x.shape) #506,13
print(y.shape) #506

# data preprocessing: scaler
scaler = MinMaxScaler()
scaler.fit(x)
x = scaler.transform(x)

x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.8)

print(x_train.shape, x_test.shape)#(404, 13) (102, 13)
print(y_train.shape, y_test.shape)#(404,) (102,)


# modeling


path = './save/boston/modelSave'
path2 = './save/boston/'

#### 1. load the saved model (see the sketch below)
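# A hedged sketch of what the "load model" step might look like, assuming the model
# was previously saved with Keras; the '.h5' file name and the evaluation below are
# assumptions, not taken from the original.
from tensorflow.keras.models import load_model

model = load_model(path + '.h5')  # hypothetical saved-model file
y_pred = model.predict(x_test)
print('MSE:', mean_squared_error(y_test, y_pred))
print('R2 :', r2_score(y_test, y_pred))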
Example #46
import matplotlib.pyplot as plot
import pandas as pd

from sklearn import datasets

# read the data
housing = pd.read_csv('kc_train.csv')
target = pd.read_csv('kc_train2.csv')  # sale prices
t = pd.read_csv('kc_test.csv')  # test data

# data preprocessing
housing.info()  # check whether there are missing values

# feature scaling
from sklearn.preprocessing import MinMaxScaler
minmax_scaler = MinMaxScaler()
minmax_scaler.fit(housing)  # fit the scaler; its internal parameters are updated
scaler_housing = minmax_scaler.transform(housing)
scaler_housing = pd.DataFrame(scaler_housing, columns=housing.columns)

mm = MinMaxScaler()
mm.fit(t)
scaler_t = mm.transform(t)
scaler_t = pd.DataFrame(scaler_t, columns=t.columns)

# choose a linear regression model
from sklearn.linear_model import LinearRegression
LR_reg = LinearRegression()
# fit the model
LR_reg.fit(scaler_housing, target)

# use mean squared error to evaluate how well the model fits
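# A hedged sketch of the evaluation step the comment above describes; the original
# snippet stops before implementing it, so the scoring below is an assumption.
from sklearn.metrics import mean_squared_error

preds = LR_reg.predict(scaler_housing)
print('training MSE: {:.4f}'.format(mean_squared_error(target, preds)))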
import seaborn as sns
import pandas as pd
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

#The goal is to predict energy consumption by appliances.

Data = pd.read_csv('Energy_data.csv')
#print(Data.head()) 
#print(Data.isnull().sum().sort_values(ascending = True)) #no null values
 
df = Data.drop(columns=['date', 'lights'])
print(df)

from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaler.fit(df)
scaled_feat = scaler.transform(df)
df_MinMaxSc = pd.DataFrame(data = scaled_feat, columns = df.columns)
features_df = df_MinMaxSc.drop(columns=['Appliances'])
target_variable= df_MinMaxSc['Appliances']


x = df.iloc[:, 3].values
y = df.iloc[:, 11].values

from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
# convert the data into a 2-dimensional array before fitting (see the sketch below).

#regressor.fit(X_train,Y_train) #training our machine learning model using these data.
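# A hedged sketch of the steps the comments above describe: reshape the two 1-D
# series into 2-D arrays, split them, and fit the regressor (the split parameters
# are illustrative).
from sklearn.model_selection import train_test_split

X_2d = x.reshape(-1, 1)
Y_2d = y.reshape(-1, 1)
X_train, X_test, Y_train, Y_test = train_test_split(X_2d, Y_2d, test_size=0.3, random_state=42)
regressor.fit(X_train, Y_train)
print('R^2 on the held-out split: {:.3f}'.format(regressor.score(X_test, Y_test)))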
Example #48
train_data, cv_data = train_test_split(selected_data,
                                       test_size=0.3,
                                       random_state=42)

train_x = train_data.drop('Churn Status', axis=1)
train_y = train_data['Churn Status']

cv_x = cv_data.drop('Churn Status', axis=1)
cv_y = cv_data['Churn Status']

train_x.drop(1400, inplace=True)
train_y.drop(1400, inplace=True)

from sklearn.preprocessing import MinMaxScaler
std_scaler = MinMaxScaler()
std_scaler.fit(train_x)

#train_x_std = std_scaler.transform(train_x)
#cv_x_std = std_scaler.transform(cv_x)

train_x_std = train_x
cv_x_std = cv_x

lda = LinearDiscriminantAnalysis(solver="svd", store_covariance=True)
lda.fit(train_x_std, train_y)

train_preds_lda = lda.predict(train_x_std)
cv_preds_lda = lda.predict(cv_x_std)

train_acc_lda = accuracy_score(train_preds_lda, train_y)
cv_acc_lda = accuracy_score(cv_preds_lda, cv_y)
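# The MinMaxScaler above is fitted but its transform is commented out; a hedged
# sketch of how the scaled variant could be swapped in for a quick comparison.
train_x_scaled = std_scaler.transform(train_x)
cv_x_scaled = std_scaler.transform(cv_x)

lda_scaled = LinearDiscriminantAnalysis(solver="svd", store_covariance=True)
lda_scaled.fit(train_x_scaled, train_y)
print("cv accuracy with scaling:", accuracy_score(cv_y, lda_scaled.predict(cv_x_scaled)))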
Example #49
class DealingWithData:
    def __init__(self, df_data):
        self.df_data = df_data
        pass

    def count_na(self):
        return dict(NANs=self.df_data.isna().sum())

    def fillna(self):
        """
        2 of the colums have equal number of nan's
        # TODO : find corrolation
        """
        self.df_data.fillna(value=0, inplace=True)
        pass

    def correct_dates(self):
        self.df_data["time"] = pd.to_datetime(self.df_data["time"], utc=True)
        pass

    def drop_objects_col(self):
        # drop every column with a non-numeric (object) dtype
        for col in self.df_data.columns:
            if self.df_data[col].dtype == "object":
                self.df_data = self.df_data.drop(col, axis=1)

    """
    visualization should have its own class.
    # TODO : decouple.
    """

    def plot_dist(self, x_plot="time", y_plot="generation biomass"):
        self.__viz.plot_distribution(self, x_plot=x_plot, y_plot=y_plot)
        pass

    def plot_dist_by_idx(self, idx_x=0, idx_y=1):
        # same as
        x_plot = self.df_data.columns[idx_x]
        y_plot = self.df_data.columns[idx_y]
        return self.__viz.plot_distribution(self, x_plot=x_plot, y_plot=y_plot)
        pass

    def scale_and_split_Xy(self, y_cols_idx: list, test_size=0.25) -> list:
        # Extract.
        X = self.df_data.copy().drop("time", axis=1)
        self.y_cols = [X.columns[idx] for idx in y_cols_idx]
        y = X[self.y_cols]
        X = X.drop(self.y_cols, axis="columns")
        self.x_cols = X.columns
        print(
            f"""
        X data columns: {self.x_cols}\n
        y data columns: {self.y_cols}\n
        """
        )
        from sklearn.preprocessing import MinMaxScaler

        # Scale data
        self.x_scaler = MinMaxScaler()
        self.x_scaler.fit(X)

        X = self.x_scaler.transform(X)

        self.y_scaler = MinMaxScaler()
        self.y_scaler.fit(y)
        y = self.y_scaler.transform(y)

        # split data
        from sklearn.model_selection import train_test_split

        x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=test_size)

        return [x_train, y_train, x_test, y_test]

    class __viz:
        def plot_distribution(self, x_plot, y_plot):
            df_plot = dict(
                x=[k for k in range(self.df_data[x_plot].count())],
                y=self.df_data[y_plot],
            )
            fig = px.scatter(
                df_plot,
                x="x",
                y="y",
                color="y",
                marginal_y="violin",
                marginal_x="box",
                trendline="ols",
                template="simple_white",
            )

            return fig
            # fig.show()

        def plt_model_scater(self, x, y, y_list=[], params={}):
            plt.figure()
            plt.scatter(x, y, color="red")
            for y in y_list:
                plt.plot(x, y, color="blue")
            # plt.title("Generation Fossil Hard Coal vs Price Day Ahead")
            # plt.xlabel("Generation Fossil Hard Coal")
            # plt.ylabel("Price Day Ahead")
            return plt
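# A hedged usage sketch for the class above; the CSV name and target-column index are
# assumptions, the file is expected to have the 'time' column the class relies on, and
# plotly.express must be available as `px` for the plotting helpers (not called here).
import pandas as pd

energy_df = pd.read_csv("energy_dataset.csv")  # hypothetical input file
data_handler = DealingWithData(energy_df)
print(data_handler.count_na())
data_handler.fillna()
data_handler.correct_dates()
data_handler.drop_objects_col()
x_train, y_train, x_test, y_test = data_handler.scale_and_split_Xy(y_cols_idx=[0])
print(x_train.shape, y_train.shape)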
Example #50
continuous_features = [
    'Fresh', 'Milk', 'Grocery', 'Frozen', 'Detergents_Paper', 'Delicassen'
]

data[continuous_features].describe()

# convert categorical variables to binary dummy columns
for col in categorical_features:
    dummies = pd.get_dummies(data[col], prefix=col)
    data = pd.concat([data, dummies], axis=1)
    data.drop(col, axis=1, inplace=True)
data.head()

#scale features to give equal importance to each
mms = MinMaxScaler()
mms.fit(data)
data_transformed = mms.transform(data)

Sum_of_squared_distances = []
K = range(1, 15)
for k in K:
    km = KMeans(n_clusters=k)
    km = km.fit(data_transformed)
    Sum_of_squared_distances.append(km.inertia_)

plt.plot(K, Sum_of_squared_distances, 'bx-')
plt.xlabel('k')
plt.ylabel('Sum_of_squared_distances')
plt.title('Elbow Method For Optimal k')
plt.show()
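# Once the elbow is read off the curve, a hedged sketch of fitting the final model;
# k=5 here is purely illustrative, not a value taken from the original.
best_k = 5
km_final = KMeans(n_clusters=best_k)
cluster_labels = km_final.fit_predict(data_transformed)
data['cluster'] = cluster_labels
print(data['cluster'].value_counts())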
Example #51
# load feature and labels
feat = np.load('/home/s1820002/atsit/data/feat_34_hfs.npy')
vad = np.load('/home/s1820002/IEMOCAP-Emotion-Detection/y_egemaps.npy')

# clip outlier labels (0.5 and 5.5) back into the [1, 5] range
vad = np.where(vad == 5.5, 5.0, vad)
vad = np.where(vad == 0.5, 1.0, vad)

scaled_vad = True
scaled_feature = False

# scaling (labels) / standardization (features)
if scaled_vad:
    scaler = MinMaxScaler(feature_range=(-1, 1))
    scaler = scaler.fit(
        vad)  #.reshape(vad.shape[0]*vad.shape[1], vad.shape[2]))
    scaled_vad = scaler.transform(
        vad)  #.reshape(vad.shape[0]*vad.shape[1], vad.shape[2]))
    vad = scaled_vad
else:
    vad = vad

if scaled_feature == True:
    scaler = StandardScaler()
    scaler = scaler.fit(
        feat.reshape(feat.shape[0] * feat.shape[1], feat.shape[2]))
    scaled_feat = scaler.transform(
        feat.reshape(feat.shape[0] * feat.shape[1], feat.shape[2]))
    scaled_feat = scaled_feat.reshape(feat.shape[0], feat.shape[1],
                                      feat.shape[2])
    feat = scaled_feat
Example #52
# model.add(LeakyReLU())
# model.add(Dropout(0.5))
# model.add(Conv1D(strides=2, filters=nb_features, kernel_size=2))
model.load_weights(
    'weights/bitcoin2015to2017_close_CNN_2_relu-44-0.00023.hdf5')
model.compile(loss='mse', optimizer='adam')

# In[336]:

predicted = model.predict(validation_datas)
predicted_inverted = []

# In[7]:
for i in range(original_datas.shape[1]):
    scaler.fit(original_datas[:, i].reshape(-1, 1))
    predicted_inverted.append(scaler.inverse_transform(predicted[:, :, i]))
print(np.array(predicted_inverted).shape)
#get only the close data
ground_true = ground_true[:, :, 0].reshape(-1)
ground_true_times = ground_true_times.reshape(-1)
ground_true_times = pd.to_datetime(ground_true_times, unit='s')
# since we are appending in the first dimension
predicted_inverted = np.array(predicted_inverted)[0, :, :].reshape(-1)
print(np.array(predicted_inverted).shape)
validation_output_times = pd.to_datetime(validation_output_times.reshape(-1),
                                         unit='s')

# In[337]:

ground_true_df = pd.DataFrame()
Example #53
import numpy as np
from keras.models import load_model
from sklearn.preprocessing import MinMaxScaler

from datagen import generate_data,normalize

min_max_scalar = MinMaxScaler()

data = generate_data(3000)

np.savetxt('3k.csv',data,delimiter=',')

min_max_scalar.fit(data)

testcase = np.array([
		[4,0,0.8,2,500],
		[0.1,0,0.8,2,500],
		[4,0,0.8,2,250],
		[4,0,0.8,2,550],

	])

# temp = min_max_scalar.transform(testcase).reshape(-1, 1)


norm_testdata = normalize(testcase,3000)
norm_data 	  = normalize(data,3000)
model = load_model('94.h5')

print(np.sum(abs(model.predict(norm_testdata) - norm_testdata),axis=1))
# print(np.mean(np.sum(abs(model.predict(norm_data) - norm_data),axis=1)))
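# The line above prints per-sample reconstruction error; a hedged sketch of turning
# that into an anomaly flag by thresholding against the error distribution of the
# normal data (the percentile choice is illustrative).
recon_error = np.sum(abs(model.predict(norm_testdata) - norm_testdata), axis=1)
baseline_error = np.sum(abs(model.predict(norm_data) - norm_data), axis=1)
threshold = np.percentile(baseline_error, 95)
print("anomalous test cases:", recon_error > threshold)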
Example #54
fun = np.load("f_final.npy")
vel = np.load("u_final.npy")
velBC = vel.copy()
velBC[:, :, :, 1:] = 0
# BC inputs to keep velocities zero and 1 at boundaries
velBCx = velBC[:, 0, :, :].reshape(velBC.shape[0], 1, velBC.shape[-2],
                                   velBC.shape[-1]) / np.max(velBC)
velBCy = velBC[:, 1, :, :].reshape(velBC.shape[0], 1, velBC.shape[-2],
                                   velBC.shape[-1]) / np.max(velBC)

re_scaler = MinMaxScaler(feature_range=(0.2, 0.7))
feq_scaler = MinMaxScaler(feature_range=(0.2, 0.7))
vel_scaler = MinMaxScaler(feature_range=(0.2, 0.7))

Re_scaled = re_scaler.fit_transform(Re.reshape(Re.shape[0], 1))
feq_scaler.fit(feq.ravel().reshape(-1, 1))
vel_scaler.fit(vel.ravel().reshape(-1, 1))

num = vel.shape[0]
print("Shape of inputs : ")
print("Re : " + str(Re.shape))
print("feq : " + str(feq.shape))
print("fun : " + str(fun.shape))
print("vel : " + str(vel.shape))

print("Number of training samples is " + str(num))
print("Original resolution of input/output is " + str(vel.shape[2:]))

#decreasing the resolution by half to make it easier to train
# first removing middle rows and columns
# feq = np.delete(feq,int(num/2),axis=1);feq = np.delete(feq,int(num/2),axis=2)
Example #55
                       interval=inter,
                       auto_adjust=True,
                       prepost=True,
                       threads=True,
                       proxy=None)
    return data[["Open", "High", "Close"]]


df = ts_download_btc("3d")
df.fillna(method="ffill", inplace=True)
df.dropna(inplace=True)
print(df)
cl = df
train = cl[0:int(len(cl) * 0.80)]
scl = MinMaxScaler()
scl.fit(train.values.reshape(-1, 1))
cl = scl.transform(cl.values.reshape(-1, 1))


def processData(data, lb):
    X, Y = [], []
    for i in range(len(data) - lb - 1):
        X.append(data[i:(i + lb), 0])
        Y.append(data[(i + lb), 0])
    return np.array(X), np.array(Y)


lb = 10
X, y = processData(cl, lb)
X_train, X_test = X[:int(X.shape[0] * 0.90)], X[int(X.shape[0] * 0.90):]
y_train, y_test = y[:int(y.shape[0] * 0.90)], y[int(y.shape[0] * 0.90):]
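# The windows built above are 2-D (samples, lookback); a hedged sketch of the reshape
# a recurrent model would usually expect, plus a sanity check of the shapes (the model
# itself is not part of the original snippet).
X_train = X_train.reshape((X_train.shape[0], X_train.shape[1], 1))
X_test = X_test.reshape((X_test.shape[0], X_test.shape[1], 1))
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)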
Example #56
def plot_gender_development_over_time(no_terms_or_topics_to_show=8,
                                      data='topics',
                                      display_selector='most_frequent',
                                      selected_terms_or_topics=None,
                                      show_plot=True,
                                      store_to_filename=None,
                                      title=None):
    """

    :param no_terms_or_topics_to_show: int
    :param data: 'topics', 'terms', 'terms_of_topics'
    :param display_selector: 'most_frequent', 'most_divergent', 'most_variable'
    :param selected_terms_or_topics: topic_id or list of terms
    :param show_plot: bool
    :param store_to_filename: bool or str
    :return:
    """

    if data == 'terms_of_topic':
        if not isinstance(selected_terms_or_topics, int):
            raise ValueError(
                "When displaying 'terms_of_topic', please pass a topic_id for param"
                "selected_terms_or_topics")

    # 0: find terms or topics to display
    d = Dataset()
    if data == 'topics':
        selected_terms_or_topics = [f'topic.{id}' for id in range(1, 71)]
        title_name = 'topics'
    elif data == 'terms':
        vocab = []
        for t in selected_terms_or_topics:
            vocab.append(t)
        d.get_document_term_matrix(vocabulary=vocab, store_in_df=True)
        title_name = 'terms'
    elif data == 'terms_of_topic':
        vocab = []
        topic_id = selected_terms_or_topics
        for term in TOPICS[topic_id]['terms_prob']:
            if term in d.vocabulary:
                vocab.append(term)
        selected_terms_or_topics = vocab
        d.get_document_term_matrix(vocabulary=vocab, store_in_df=True)
        title_name = f'terms of topic {topic_id}'
    else:
        raise ValueError(
            '"data" has to be "terms", "topics", or "terms_of_topic"')

    if not title:
        if display_selector == 'most_frequent':
            title = f'Most frequent {title_name} for female (top) and male authors (bottom)'
        elif display_selector == 'most_divergent':
            title = f'Most divergent {title_name} for female (top) and male authors (bottom)'
        else:
            title = f'Most variable {title_name} for female (top) and male authors (bottom)'

    df = d.df

    # 1: Load data
    data = {}
    for t in selected_terms_or_topics:
        data[t] = defaultdict(list)
    min_freq_total = 1
    max_freq_total = 0

    for idx, year in enumerate(range(1982, 2013)):
        time_slice = df[(df.ThesisYear >= year - 2)
                        & (df.ThesisYear <= year + 2)]
        time_slice_female = time_slice[time_slice.AdviseeGender == 'female']
        time_slice_male = time_slice[time_slice.AdviseeGender == 'male']

        for t in selected_terms_or_topics:
            freq_total = time_slice[t].mean()
            freq_female = time_slice_female[t].mean()
            freq_male = time_slice_male[t].mean()

            #            if t == 'gender' and year == 2008:
            #                embed()

            # if a term doesn't appear, it is neutral
            if (freq_male + freq_female) == 0:
                freq_score = 0.5
            else:
                freq_score = freq_female / (freq_female + freq_male)

            data[t]['year'].append(year)
            data[t]['freq_score'].append(freq_score)
            data[t]['freq_total'].append(freq_total)

            if freq_total < min_freq_total:
                min_freq_total = freq_total
            if freq_total > max_freq_total:
                max_freq_total = freq_total

            data[t]['mean_freq_score'] = np.mean(data[t]['freq_score'])
            data[t]['mean_freq_total'] = np.mean(data[t]['freq_total'])
            data[t]['freq_score_range'] = max(data[t]['freq_score']) - min(
                data[t]['freq_score'])

    # 2: Set up plot
    fig = plt.figure(figsize=(12, 12))
    gs = gridspec.GridSpec(nrows=1,
                           ncols=1,
                           figure=fig,
                           width_ratios=[1],
                           height_ratios=[1],
                           wspace=0.2,
                           hspace=0.05)

    ax = fig.add_subplot(gs[0, 0])
    ax.set_ylim(0, 1)
    ax.set_xlim(1985, 2010)
    ax.set_axisbelow(True)
    ax.grid(which='major', axis='both')

    dot_scaler = MinMaxScaler((0.0, 50.0))
    dot_scaler.fit(np.array([min_freq_total, max_freq_total]).reshape(-1, 1))
    legends = []

    def draw_line(t, t_data, df):
        """
        Draws one line depending on t (term or topic string) and t_data (dict of data belonging
        to t)

        :param t: str
        :param t_data: dict
        :return:
        """
        y = t_data['freq_score']
        x = t_data['year']
        frequencies = t_data['freq_total']
        if t.startswith('topic.'):
            legend = TOPICS[int(t[6:])]['name']
        else:
            legend = '{:10s} ({})'.format(t, df[t].sum())

        x_spline = np.linspace(min(x), max(x), (2010 - 1985 + 1) * 1000)
        spl = make_interp_spline(x, y, k=1)  # BSpline object
        y_spline = spl(x_spline)

        line_interpolater = interp1d(x, frequencies)
        line_widths = line_interpolater(x_spline)
        line_widths = dot_scaler.transform(line_widths.reshape(-1,
                                                               1)).flatten()

        try:
            color = sns.color_palette()[len(legends)]
        except IndexError:
            color = sns.cubehelix_palette(100,
                                          start=2,
                                          rot=0,
                                          dark=0,
                                          light=.95)[len(legends)]

        ax.scatter(x_spline,
                   y_spline,
                   s=line_widths,
                   antialiased=True,
                   color=color)
        legends.append(mpatches.Patch(color=color, label=legend))

    # 3: Plot
    if display_selector == 'most_frequent':
        ax.set_title(title, weight='bold', fontsize=18)
        sorted_items = sorted(data.items(),
                              key=lambda k_v: k_v[1]['mean_freq_total'],
                              reverse=True)
        for t, t_data in sorted_items[:no_terms_or_topics_to_show]:
            draw_line(t, t_data, df)
    elif display_selector == 'most_divergent':
        ax.set_title(title, weight='bold', fontsize=18)
        sorted_items = sorted(data.items(),
                              key=lambda k_v: k_v[1]['mean_freq_score'],
                              reverse=True)
        no_disp = no_terms_or_topics_to_show // 2
        for t, t_data in sorted_items[:no_disp] + sorted_items[::-1][:no_disp]:
            draw_line(t, t_data, df)
    elif display_selector == 'most_variable':
        ax.set_title(title, weight='bold', fontsize=18)
        # sort by mean_freq_range second to preserve colors between plots
        sorted_items = sorted(data.items(),
                              key=lambda k_v: k_v[1]['freq_score_range'],
                              reverse=True)
        sorted_items = sorted_items[:no_terms_or_topics_to_show]
        sorted_items = sorted(sorted_items,
                              key=lambda k_v: k_v[1]['mean_freq_score'],
                              reverse=True)
        for t, t_data in sorted_items:
            draw_line(t, t_data, df)

    else:
        raise ValueError(
            'display_selector has to be most_frequent, most_variable, or most_divergent'
        )

    ax.legend(handles=legends, loc=4)

    if show_plot:
        plt.show()
    if store_to_filename:
        fig.savefig(Path('data', store_to_filename))
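# A hedged usage sketch for the function above; the argument values are illustrative.
plot_gender_development_over_time(no_terms_or_topics_to_show=8,
                                  data='topics',
                                  display_selector='most_variable',
                                  show_plot=True)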
Example #57
class NumericColumn(BaseEstimator, TransformerMixin):
    '''
    Take a numeric value column and rescale it to the [0, 1] range.
    '''

    def __init__(self):
        '''
        Set up the internal transformation.
        '''
        self._transformer = MinMaxScaler()

    def fit(self, X, y=None):
        '''
        Fit the min-max scaling.
        '''
        zeroed = pd.DataFrame(np.array(X).reshape(-1, 1)).fillna(0)
        self._transformer.fit(zeroed)
        return self

    def transform(self, X):
        '''
        Transform a column of data into numerical percentage values.

        Parameters
        ----------
        X : pandas series or numpy array
        '''
        zeroed = pd.DataFrame(np.array(X).reshape(-1, 1)).fillna(0)
        return self._transformer.transform(zeroed).astype(np.float32)
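# A hedged usage sketch for the transformer above on a single messy column; the
# sample values are illustrative.
import pandas as pd

col = pd.Series([10.0, None, 25.0, 40.0])
nc = NumericColumn().fit(col)
print(nc.transform(col))  # NaN is filled with 0 and the values land in [0, 1]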
Example #58
def NB_coefficients(year=2010):
    poi_dist = getFourSquarePOIDistribution(useRatio=False)
    F_taxi = getTaxiFlow(normalization="bydestination")
    W2 = generate_geographical_SpatialLag_ca()
    Y = retrieve_crime_count(year=year)
    C = generate_corina_features()
    D = C[1]

    popul = C[1][:,0].reshape(C[1].shape[0],1)
    Y = np.divide(Y, popul) * 10000
    
    f2 = np.dot(W2, Y)
    ftaxi = np.dot(F_taxi, Y)
    
    f = np.concatenate( (D, f2, ftaxi, poi_dist), axis=1 )
    mms = MinMaxScaler(copy=False)
    mms.fit(f)
    mms.transform(f)
    header = C[0] + [ 'spatiallag', 'taxiflow'] + \
        ['POI food', 'POI residence', 'POI travel', 'POI arts entertainment', 
                       'POI outdoors recreation', 'POI education', 'POI nightlife', 
                       'POI professional', 'POI shops', 'POI event']
    df = pd.DataFrame(f, columns=header)
    
    np.savetxt("Y.csv", Y, delimiter=",")
    df.to_csv("f.csv", sep=",", index=False)
    
    # NB permute
    nbres = subprocess.check_output(['Rscript', 'nbr_eval.R', 'ca', 'coefficient']).decode()
    print(nbres)
    
    ls = nbres.strip().split(" ")
    coef = [float(e) for e in ls]
    print(coef)
    return coef, header
Example #59
 def predict_new(self, input):
     model = self.train_model()
     assert len(input) == 5 and type(input) == list
     scaler = MinMaxScaler(feature_range=(0, 1))
     scaler.fit(self.data)
     inp = scaler.transform([input])
     print(scaler.inverse_transform(model.predict(numpy.array(inp).reshape(1, 1, 5))))
Example #60
    def _scaled_data(self):
        """Load scaled data.

        Args:
            None

        Returns:
            (scaler, train, test): Tuple of list of train and test data

        """
        # Initialize key variables
        (_train, _test) = self._data()

        # Fit scaler
        scaler = MinMaxScaler(feature_range=(-1, 1))
        scaler = scaler.fit(_train)

        # Transform train
        train = _train.reshape(_train.shape[0], _train.shape[1])
        train_scaled = scaler.transform(train)

        # Transform test
        test = _test.reshape(_test.shape[0], _test.shape[1])
        test_scaled = scaler.transform(test)

        # Return
        return scaler, train_scaled, test_scaled