Example #1
# Assumed imports, inferred from the aliases used below:
from sklearn.ensemble import GradientBoostingClassifier as GBDT
from sklearn.linear_model import SGDClassifier as sgd
from sklearn.metrics import roc_auc_score as auc
from sklearn.preprocessing import OneHotEncoder as OHE


def gbdt_lr(para):
    print("gbdt_lr")
    # Unpack the six data splits packed into `para`.
    x_train, x_train_lr, x_test, y_train, y_train_lr, y_test = para
    maxleafnodes = 11
    gbc = GBDT(max_leaf_nodes=maxleafnodes - 1,
               n_estimators=600,
               min_samples_leaf=5,
               max_depth=3,
               learning_rate=0.02,
               subsample=0.2,
               max_features=0.1)
    gbc.fit(x_train, y_train)
    # For a binary classifier, apply() returns leaf indices of shape
    # (n_samples, n_estimators, 1); slice off the class axis so the
    # encoder sees a 2-D array.
    ohe = OHE()
    ohe.fit(gbc.apply(x_train)[:, :, 0])
    x_train_lr_gbc = ohe.transform(gbc.apply(x_train_lr)[:, :, 0])
    x_test_gbc = ohe.transform(gbc.apply(x_test)[:, :, 0])
    lr = sgd(max_iter=50)  # SGDClassifier's `n_iter` was renamed `max_iter`
    lr.fit(x_train_lr_gbc, y_train_lr)
    # Note: AUC on hard predict() labels; decision_function() would give a
    # proper ranking AUC.
    yp = lr.predict(x_test_gbc)
    print("GBDT+SGD: " + str(auc(y_test, yp)))
    return (gbc, yp)
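
For reference, a minimal self-contained sketch of the same GBDT-leaf + linear-model idea, assuming scikit-learn >= 1.0 and synthetic data (all names below are illustrative, and predict_proba is used so the AUC is a proper ranking score):

import numpy as np
from sklearn.datasets import make_classification
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

X, y = make_classification(n_samples=2000, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

gbc = GradientBoostingClassifier(n_estimators=50, max_depth=3, random_state=0)
gbc.fit(X_train, y_train)

# One-hot encode the leaf index each sample lands in, per tree.
enc = OneHotEncoder(handle_unknown='ignore')
lr = LogisticRegression(max_iter=1000)
lr.fit(enc.fit_transform(gbc.apply(X_train)[:, :, 0]), y_train)

proba = lr.predict_proba(enc.transform(gbc.apply(X_test)[:, :, 0]))[:, 1]
print("GBDT+LR AUC:", roc_auc_score(y_test, proba))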
Example #2
def oneHotEncoding():
    print("-----------Try to ONE HOT ENCODING-----------------")
    # `musicdata`, `X` and `Y` are module-level globals in the original code.
    setToCompare = 'abcdefghijklmnopqrstuvwxyz '
    ctoi = dict((c, i) for i, c in enumerate(setToCompare))
    itoc = dict((i, c) for i, c in enumerate(setToCompare))
    # Integer-encode the input string.
    integer_encoded = [ctoi[char] for char in musicdata]
    print(integer_encoded)
    # One-hot encode by hand: one indicator row per character.
    onehot = list()
    for value in integer_encoded:
        letter = [0 for _ in range(len(setToCompare))]
        letter[value] = 1
        onehot.append(letter)
    print(onehot)
    # Invert the encoding of the first character.
    inverted = itoc[np.argmax(onehot[0])]
    print(inverted)

    print(
        "--------------------------ENCODING IN PROGRESS----------------------")
    labelencoder = LE()
    X1 = X
    Y1 = Y
    X1[:, 0] = labelencoder.fit_transform(X1[:, 0])
    # Legacy API: OneHotEncoder(categorical_features=[0]) was removed in
    # scikit-learn 0.22; see the ColumnTransformer sketch after this example.
    onehotencoder = OHE(categorical_features=[0])
    X1 = onehotencoder.fit_transform(X1)

    labelencoderY = LE()
    Y1 = labelencoderY.fit_transform(Y1)
    print(X1)
    print(Y1)
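
On current scikit-learn, where categorical_features no longer exists, the column-0 encoding above is done with ColumnTransformer. A minimal sketch (the array here is illustrative):

import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

data = np.array([['red', 1.0], ['blue', 2.0], ['red', 3.0]], dtype=object)
ct = ColumnTransformer([('onehot', OneHotEncoder(), [0])],
                       remainder='passthrough',  # keep the numeric column
                       sparse_threshold=0)       # force a dense result
print(ct.fit_transform(data))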
Example #3
def graph_to_node_label(graphs, labels):
    # Repeat each graph's label once per node, then one-hot encode the
    # resulting node-level target vector (graphs are networkx-style objects).
    targets = np.array(
        list(
            itertools.chain(*[[labels[i]] * graphs[i].number_of_nodes()
                              for i in range(len(graphs))])))
    enc = OHE(dtype=np.float32)
    return np.asarray(enc.fit_transform(targets.reshape(-1, 1)).todense())
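
A quick usage check with two tiny networkx graphs (illustrative; assumes the function's own imports, i.e. itertools, numpy and OneHotEncoder as OHE, are already in scope):

import networkx as nx

# 2-node and 3-node path graphs, labelled 0 and 1 respectively.
graphs = [nx.path_graph(2), nx.path_graph(3)]
print(graph_to_node_label(graphs, [0, 1]))
# -> shape (5, 2): two rows of [1., 0.] then three rows of [0., 1.]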
Example #4
    def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEncoder:

        self.check_requirements(X, y)

        self.preprocessor['categorical'] = OHE(categories=X['categories'],
                                               sparse=False,
                                               handle_unknown='error')
        return self
Example #5
    def fit(self, X, y):
        """Build a neural network classifier from the training set (X, y).

        
        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Training data, where n_samples is the number of samples
            and n_features is the number of features.

        y : array-like, shape = [n_samples] or [n_samples, n_classes] (one-hot), 
            The target values (class labels) as integers.
        
        Returns
        -------
        self : object
            Returns the instance itself.
        """  
        X = check_array(X, ensure_min_samples=2, ensure_min_features=2)
        y = check_array(y, ensure_2d=False)
        if np.isnan(np.min(X)):
            raise ValueError("X contains NaN.")
        y = np.atleast_1d(y)

        if y.ndim == 1:     # convert to one hot code
            from sklearn.preprocessing import OneHotEncoder as OHE
            y = np.reshape(y, (-1, 1))
            model = OHE(sparse=False)
            y = np.array(model.fit_transform(y))

        self.n_samples, self.n_features = X.shape
        self.n_input = self.n_features

        self.__variables()
        # construct network
        _, cost_op, optimizer_op = self.__network()

        # tf.initialize_all_variables() was removed after TF 0.12;
        # tf.global_variables_initializer() is the TF1 replacement.
        init = tf.global_variables_initializer()
        # create session
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        config.allow_soft_placement = True
        self.__sess = tf.Session(config=config)
        if self.use_gpu:
            self.device = '/gpu:' + str(self.gpu_id)
        else:
            self.device = '/cpu'
        logger.info('Used device: ' + self.device)
        with tf.Graph().as_default():
            with tf.device(self.device):
                self.__sess.run(init)
                for epoch in range(self.training_epochs):
                    for batch_X, batch_y in self.__next_batch(X, y):
                        cost, _ = self.__sess.run([cost_op, optimizer_op], \
                                feed_dict={self.__X: batch_X, self.__y: batch_y})
                    if epoch % self.display_step == 0:
                        logger.info("Epoch: %04d, cost = %.6f"%(epoch+1,cost))
                logger.info("Optimization Finished!")
        self.fitted = True
        
        return self
Example #6
def read_data(dataset):
    # Split the frame into features and the last-column target.
    x = dataset.iloc[:, :-1].values
    y = dataset.iloc[:, -1].values

    # `col_trans` is ColumnTransformer: one-hot encode column 3 and
    # pass the remaining columns through unchanged.
    ct = col_trans(transformers=[('encoder', OHE(), [3])],
                   remainder="passthrough")
    x = np.array(ct.fit_transform(x))
    x = check_dummy_variable(x)
    train_data(x, y)
Example #7
def one_encoder(X_train, X_valid, object_cols):
    # Fit on the training categoricals; ignore categories unseen at fit time.
    encoder = OHE(handle_unknown='ignore', sparse=False)
    OH_cols_train = pd.DataFrame(encoder.fit_transform(X_train[object_cols]))
    OH_cols_valid = pd.DataFrame(encoder.transform(X_valid[object_cols]))
    # Restore the original row indices lost by the ndarray round-trip.
    OH_cols_train.index = X_train.index
    OH_cols_valid.index = X_valid.index
    # Replace the object columns with their one-hot counterparts.
    num_X_train = X_train.drop(object_cols, axis=1)
    num_X_valid = X_valid.drop(object_cols, axis=1)
    OH_X_train = pd.concat([num_X_train, OH_cols_train], axis=1)
    OH_X_valid = pd.concat([num_X_valid, OH_cols_valid], axis=1)
    return OH_X_train, OH_X_valid
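
One refinement worth knowing: the concatenated one-hot columns above end up with integer names. get_feature_names_out (scikit-learn >= 1.0) restores readable names, and `sparse` became `sparse_output` in 1.2. A small sketch under those assumptions:

import pandas as pd
from sklearn.preprocessing import OneHotEncoder

X_train = pd.DataFrame({'color': ['red', 'blue', 'red'], 'size': [1, 2, 3]})
enc = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
oh = pd.DataFrame(enc.fit_transform(X_train[['color']]),
                  columns=enc.get_feature_names_out(['color']),
                  index=X_train.index)
print(oh)  # columns: color_blue, color_red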
Example #8
    def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEncoder:

        self.check_requirements(X, y)

        self.preprocessor['categorical'] = OHE(
            # Safer for the encoder to emit an all-zero row for an unseen
            # category (handle_unknown='ignore') than to crash an otherwise
            # good configuration.
            categories=X['dataset_properties']['categories']
            if len(X['dataset_properties']['categories']) > 0 else 'auto',
            sparse=False,
            handle_unknown='ignore')
        return self
Example #9
    def fit(self, X, y=None):
        if isinstance(X, tuple):
            X, y = X

        # Keep only low-cardinality object (string) columns.
        self.columns = X.columns[[
            (X[i].nunique() < self.max_values) & (X[i].dtype == 'object')
            for i in X.columns
        ]]

        # One encoder per selected column, fitted on that column alone.
        self.encoders = {
            column:
            OHE(handle_unknown='ignore').fit(X[column].values.reshape(-1, 1))
            for column in self.columns
        }

        return self
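
Only fit is shown above; a matching transform for the per-column encoder dictionary might look like the following hypothetical sketch (not part of the original class; it assumes the non-encoded columns are numeric):

import numpy as np

def transform(self, X):
    # One dense indicator block per fitted column, in fit order.
    blocks = [
        self.encoders[col].transform(X[col].values.reshape(-1, 1)).toarray()
        for col in self.columns
    ]
    # Assumes the remaining (non-object) columns are numeric.
    rest = X.drop(columns=self.columns).to_numpy(dtype=float)
    return np.hstack([rest] + blocks)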
Example #10
        # Map each residue to an integer code (letters via ord, digits as-is).
        for res in seq:
            if res.isalpha():
                test_vector.append(ord(res))
            else:
                test_vector.append(int(res))

        # Load the OneHotEncoder fitted at training time.
        vectenc = pickle.load(open("ohe.sav", "rb"))

        # Slide a fixed-size window over the encoded sequence.
        for i in range(len(test_vector) - window + 1):
            test_vector_frames.append(test_vector[i:i + window])
        test_vector_frames = np.array(test_vector_frames)
        test_v_enc = vectenc.transform(test_vector_frames)

        # Classify every window and decode predictions back to characters.
        linclf = pickle.load(open("LinearSVC_3SSTRIDE_w21.sav", "rb"))
        out_prediction = linclf.predict(test_v_enc)
        result = ''.join(chr(char) for char in out_prediction)

        with open("LinSVC21_predictions", "a+") as wh:
            wh.write(pid + "\n")
            wh.write(seq[window // 2:len(seq) - window // 2] + "\n")
            wh.write(result + "\n" + "\n")


if __name__ == "__main__":
    window = 21
    encoder = OHE()
    predict_fasta("testset.txt", window)
Example #11
 def encode(self, x):
     # Lazily create the encoder on first use. Note: fit_transform refits
     # on every call, so codes are only consistent within a single call.
     if not self._encoder:
         self._encoder = OHE(sparse=False, categories="auto")
     return self._encoder.fit_transform(x)
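
Because fit_transform refits on every call, two calls to encode can disagree on column layout; a fit-once variant might look like this hedged sketch (a hypothetical class, using scikit-learn >= 1.2 parameter names):

from sklearn.preprocessing import OneHotEncoder as OHE

class FitOnceEncoder:
    def __init__(self):
        self._encoder = None

    def encode(self, x):
        if self._encoder is None:
            # The first call fixes the code book ...
            self._encoder = OHE(sparse_output=False, handle_unknown='ignore')
            return self._encoder.fit_transform(x)
        # ... later calls reuse it, so columns stay consistent.
        return self._encoder.transform(x)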
Example #12
# y = class_le.fit_transform(df2['classlabel'].values)

# print(y)
# print()

df3['classlabel'] = class_le.fit_transform(df3['classlabel'])
print('DataFrame with classlabel mapped to an integer class using Scikit-Learn.')
print('LabelEncoder - note size is not mapped.')
print(df3)
print()

print('Scikit-Learn One-hot Encoder')
df4['size'] = df4['size'].map(size_map)
X = df4[['color', 'size', 'price']].values

color_OHE = OHE()
Y = color_OHE.fit_transform(X[:, 0].reshape(-1,1)).toarray()
print(X)
print()
print(Y)

print()
'''
[['green' 'M' 10.1]
 ['red' 'L' 13.5]
 ['blue' 'XL' 15.3]
 ['green' 'S' 9.1]]

[[0. 1. 0.]
 [0. 0. 1.]
 [1. 0. 0.]
 [0. 1. 0.]]
'''
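
The column order above comes from the encoder itself: categories_ stores each feature's categories in sorted order. A small illustrative sketch (assuming scikit-learn >= 1.2):

import numpy as np
from sklearn.preprocessing import OneHotEncoder

colors = np.array([['green'], ['red'], ['blue'], ['green']])
enc = OneHotEncoder(sparse_output=False)
print(enc.fit_transform(colors))   # rows match the output block above
print(enc.categories_)             # [array(['blue', 'green', 'red'], ...)]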
Example #13
features = df.iloc[:, :-1].values
labels = df.iloc[:, -1:].values

# NOTE: Imputer was removed in scikit-learn 0.22;
# sklearn.impute.SimpleImputer is its replacement.
from sklearn.preprocessing import Imputer as ip
imp = ip(missing_values='NaN', strategy="median", axis=0)
imp = imp.fit(features[:, 1:2])

features[:, 1:2] = imp.transform(features[:, 1:2])

from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
le = le.fit(features[:, 0])
features[:, 0] = le.transform(features[:, 0])

# NOTE: categorical_features was removed in scikit-learn 0.22;
# ColumnTransformer now handles column selection (see the sketch below).
from sklearn.preprocessing import OneHotEncoder as OHE
ohe = OHE(categorical_features=[0])
features = ohe.fit_transform(features).toarray()

labels = le.fit_transform(labels.ravel())  # ravel: LabelEncoder expects 1-D y

from sklearn.model_selection import train_test_split as TTS
x_train, x_test, y_train, y_test = TTS(features,
                                       labels,
                                       test_size=0.4,
                                       random_state=0)

from sklearn.preprocessing import StandardScaler
sc = StandardScaler()

x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)
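
Both deprecated pieces above have direct modern replacements; a sketch of the equivalent chain on scikit-learn >= 1.2, assuming the original column layout (column 0 categorical, the rest numeric with missing values):

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

preprocess = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore'), [0]),
    ('num', Pipeline([('impute', SimpleImputer(strategy='median')),
                      ('scale', StandardScaler())]), slice(1, None)),
])
# Usage: fit on the training split only, then reuse on the test split:
# x_train = preprocess.fit_transform(x_train); x_test = preprocess.transform(x_test)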
Example #14
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier as KNC

df = pd.read_csv("../dataset/train.csv")
y = df['ACTION']
del df['ACTION']

X_test = pd.read_csv("../dataset/test.csv")
employee_id = X_test['id']
del X_test['id']

# Keep training rows whose RESOURCE matches the test row at the same index.
X = df[(df['RESOURCE'] == X_test['RESOURCE'])].dropna(how='all')

n_list = []
for column in X.columns:
    n_list.append(X[column].value_counts().size)
n2_list = []
for column in X_test.columns:
    n2_list.append(X_test[column].value_counts().size)

# Fit the encoder once on the training data, then transform (not refit) the
# test data so both share one code book; ignore categories unseen at fit time.
enc = OHE(handle_unknown='ignore')
X = enc.fit_transform(X)
X_test = enc.transform(X_test)

# `cv` is the legacy sklearn.cross_validation module
# (sklearn.model_selection in current releases).
X_train, X_val, y_train, y_val = cv.train_test_split(X,
                                                     y,
                                                     test_size=0.4,
                                                     random_state=0)
linSVC = svm.LinearSVC()
prediction = linSVC.fit(X, y).predict(X_test)
Example #15
from sklearn.preprocessing import LabelEncoder as LE, OneHotEncoder as OHE
import numpy as np

a = np.array([[0, 1, 100], [1, 2, 200], [2, 3, 400]])

# Legacy API: categorical_features was removed in scikit-learn 0.22.
oh = OHE(categorical_features=[0, 1])
a = oh.fit_transform(a).toarray()

a[:, 1:]

# Drop the first dummy column of each encoded block to avoid collinearity:
# columns 0-2 encode feature 0, columns 3-5 encode feature 1.
idx_to_delete = [0, 3]
indices = [i for i in range(a.shape[-1]) if i not in idx_to_delete]
a[:, indices]
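
Current scikit-learn can drop the first dummy of each feature directly; a minimal sketch assuming >= 0.21 for drop='first', with ColumnTransformer handling the column selection:

import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

a = np.array([[0, 1, 100], [1, 2, 200], [2, 3, 400]])
ct = ColumnTransformer([('oh', OneHotEncoder(drop='first'), [0, 1])],
                       remainder='passthrough', sparse_threshold=0)
print(ct.fit_transform(a))  # 2 + 2 dummy columns, then the raw third column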
Example #16
# %%
x = Data.iloc[:, 0:10]
y = Data.iloc[:, -1]


# %%
x.head()


# %%
y.head()


# %%
# Legacy API: categorical_features was removed in scikit-learn 0.22;
# current releases select columns with ColumnTransformer instead.
ohe = OHE(sparse=False, categorical_features=[1])
x = ohe.fit_transform(x)


# %%
x


# %%
x_dummy = pd.DataFrame(x)
x_dummy.head()


# %%
x_dummy = x_dummy.drop(labels=[1],axis=1)
x_dummy.head()
Example #17
 def __init__(self, data, labels):
     self._data = data
     self._labels = labels
     # Dense one-hot matrix of the labels, built once at construction.
     self._label_encoder = OHE(sparse=False)
     self._one_hot = self._label_encoder.fit_transform(self._labels)
Example #18
 def __init__(self, **kwargs):
     r"""Initialize feature encoder.
     """
     self.__one_hot_encoder = OHE(handle_unknown='ignore')
Example #19
def evitram():
    # Restore pretrained model
    restorestr = pxfinestr.split('.meta')[0]

    # Save model str
    evitramstr = evitramfinestr.split('.meta')[0]

    # Load pretrained evidence representations for all sources
    K = []
    for e in sys.argv[3:]:
        cp2 = utils.load_config(e)
        K.append(cp2.get('Experiment', 'PX_Z_TRAIN'))

    sect = 'Experiment'
    ev_paths = [cp.get(sect, i) for i in cp.options(sect) if 'evidence' in i]

    if cp.get('Experiment', 'PREFIX') == 'MNIST' or \
        cp.get('Experiment', 'PREFIX') == 'AMNIST':
        evitram_dict = ConvAE.build_EviTRAM(cp, SEED)
    elif cp.get('Experiment', 'PREFIX') == 'WEATHER-TEMPO':
        evitram_dict = WConvAE.build_EviTRAM(cp, SEED)
    else:
        # Layerwise autoencoder count (xrange is Python 2; range in Python 3).
        ae_ids = [str(i) for i in range(cp.getint('Experiment', 'AENUM'))]
        evitram_dict = SAE.build_EviTRAM(cp, ae_ids, SEED)

    # Get variables to restore from pretrained model P(x) Encoder
    var_list = tf.trainable_variables()

    for ev_path_id, ev_path in enumerate(ev_paths):
        # The evidence-head variables must not be restored from the
        # pretrained P(x) checkpoint.
        skip = ['Pre_Q' + str(ev_path_id) + '/kernel:0',
                'Pre_Q' + str(ev_path_id) + '/bias:0']
        if cp.get('Experiment', 'PREFIX') not in ('MNIST', 'AMNIST',
                                                  'WEATHER-TEMPO'):
            skip += ['Pre_Comp_Q' + str(ev_path_id) + '/kernel:0',
                     'Pre_Comp_Q' + str(ev_path_id) + '/bias:0']
        # Filter rather than calling list.remove() inside the iteration,
        # which skips elements as the list shrinks.
        var_list = [v for v in var_list if v.name not in skip]

    ##########################################################
    # Tensorboard (comment / uncomment)
    ##########################################################

    from datetime import datetime

    now = datetime.utcnow().strftime("%m-%d_%H-%M:%S")
    root_logdir = cp.get('Experiment', 'ModelOutputPath')
    logdir = "{}/{}{}-{}/".format(
        root_logdir,
        cp.get('Experiment', 'PREFIX') + '_' +
        cp.get('Experiment', 'Enumber') + '_cond', sys.argv[2], now)
    tf.summary.scalar(name='cond loss', tensor=evitram_dict['evitram_loss'])
    tf.summary.scalar(name='recon loss', tensor=evitram_dict['px_mse'])
    summary = tf.summary.merge_all()
    file_writer = tf.summary.FileWriter(logdir, tf.get_default_graph())

    ##########################################################

    # Initialize & restore P(x) AE weights
    init = tf.global_variables_initializer()
    saver = tf.train.Saver(var_list)
    saverCOND = tf.train.Saver()

    # Task outcomes
    #  EV = [np.load(i) for i in K]

    K = utils.load_evidence(cp2.get('Experiment', 'EVIDENCEDATAPATH'))
    if cp.get('Experiment', 'PREFIX') == 'WEATHER-TEMPO':
        from sklearn.preprocessing import OneHotEncoder as OHE
        EV = K.train.labels
        EV = np.repeat(EV, cp.getint('Input', 'Frames'))
        EV = OHE(sparse=False).fit_transform(EV.reshape(len(EV), 1))
        EV = [EV]
    else:
        EV = K.train.one
        # EV = EV[p]
        EV = [EV]

    # Start Session
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        # Init values
        init.run()
        # Restore finetuned model
        saver.restore(sess, restorestr)

        train_dict = {
            'cp': cp,
            'sess': sess,
            'data': XX_full,
            'sumr': summary,
            'savestr': evitramstr,
            'saver': saverCOND,
            'fw': file_writer,
            'EV': EV,
            'ev_paths': ev_paths
        }

        if cp.get('Experiment', 'PREFIX') == 'MNIST' or \
                cp.get('Experiment', 'PREFIX') == 'AMNIST':
            ConvAE.evitram_train(train_dict, evitram_dict)
        elif cp.get('Experiment', 'PREFIX') == 'WEATHER-TEMPO':
            WConvAE.evitram_train(train_dict, evitram_dict)
        else:
            SAE.evitram_train(train_dict, evitram_dict)

        # Get batch size for batch output save
        batch_size = train_dict['cp'].getint('Hyperparameters', 'BatchSize')

        # The convolutional variants share the same latent extraction.
        if cp.get('Experiment', 'PREFIX') in ('MNIST', 'AMNIST',
                                              'WEATHER-TEMPO'):
            # Save hidden/output layer results for pipeline training
            px_Z_latent = utils.run_OOM(sess,
                                        evitram_dict['conv_in'],
                                        XX_full,
                                        evitram_dict['conv_z'],
                                        batch_size=batch_size)
        else:
            px_Z_latent = utils.run_OOM(sess,
                                        evitram_dict['sda_in'],
                                        XX_full,
                                        evitram_dict['sda_hidden'],
                                        batch_size=batch_size)
        else:
            px_Z_latent = utils.run_OOM(sess,
                                        evitram_dict['sda_in'],
                                        XX_full,
                                        evitram_dict['sda_hidden'],
                                        batch_size=batch_size)
        #  utils.save_OOM(sess, pae_dict['conv_in'], XX_full,
        #  pae_dict['conv_out'],
        #  path=cp.get('Experiment', 'PX_XREC_TRAIN'),
        #  batch_size=batch_size)

    # Print clustering ACC
    utils.log_accuracy(cp, YY_full, px_Z_latent, 'COND - ACC FULL', SEED)

    # Print clustering NMI
    utils.log_NMI(cp, YY_full, px_Z_latent, 'COND - NMI FULL', SEED)
