Example No. 1
def sample_dataset(file, amount):
    """
    Sample the given amount of data from the file.

    Args:
        file(str): File to be sampled.
        amount(int): Amount of data to be drawn from the file.
    """

    # Load and convert each title to lowercase.
    data = readlines(file, delimiter="\t", lower=True)
    # Sample
    sample_data(file, data, amount=amount)
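A minimal usage sketch (the file name and sample size are made up for illustration; readlines and sample_data are assumed to be the project's own helpers):

# Draw 1,000 lowercased titles from a hypothetical tab-separated file.
sample_dataset("titles.tsv", 1000)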
Example No. 2
def prepare_data(df, outliers, inliers, seed, fixed_cont,
                 labeled_data, n_oe, oe_path, doc2vec_model, **kwargs):
    print("Only use classes that are in inliers or outliers")
    df = df.where(df.target.isin(
        outliers+inliers)).dropna()
    # label data as inliers and outliers (for scoring) and whether
    # they have labels or not (semi-supervised)
    df = label_data(df, seed, labeled_data, outliers)

    if fixed_cont:
        df = sample_data(df, 1.0, fixed_cont, seed)
        print("Data after adjusting for fixed contamination:\n")
        print(df.groupby(['label', 'outlier_label']).size(
        ).reset_index().rename(columns={0: 'count'}), "\n")

    if n_oe:
        df_oe = get_outlier_data(oe_path, n_oe, seed=42)
        df_oe["vecs"] = doc2vec_model.vectorize(df_oe["text"])
        df = df.append(df_oe)

    if -1 in df.label.unique() and df.label.value_counts()[-1] != df.shape[0]:
        if df[(df.label == 0) & (df.outlier_label == -1)].shape[0] == 0:
            print("Adding missing sample for labeled outlier")
            df.loc[((df.label == -1) & (df.outlier_label == -1)
                    ).idxmax(), 'label'] = 0

    print("Training data:\n", df.groupby(['label', 'outlier_label']).size(
    ).reset_index().rename(columns={0: 'count'}), "\n\n")

    return df
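For reference, the groupby(...).size().reset_index().rename(...) idiom used above only builds a per-(label, outlier_label) count table; a minimal, self-contained pandas illustration with made-up values:

import pandas as pd

toy = pd.DataFrame({"label": [0, 0, -1, -1, -1],
                    "outlier_label": [-1, -1, 1, 1, -1]})
counts = toy.groupby(["label", "outlier_label"]).size(
).reset_index().rename(columns={0: "count"})
print(counts)  # three rows: (-1, -1) -> 1, (-1, 1) -> 2, (0, -1) -> 2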
Example No. 3
import numpy as np
import tensorflow as tf
import tensorflow_probability as tfp

from sklearn.metrics import r2_score

# Project-specific helpers (Data_Factory_Base, ContEncoder, sample_data,
# train_gp_loop_2d) are assumed to be imported from the surrounding project.

# Data Factory
DF = Data_Factory_Base()
dim = 3
batch_size = 40
train_num = 10000
train_data = DF.convex_1(dim=dim, num=3 * batch_size)
x_train = train_data[:, :-1].astype('float32')
y_train = train_data[:, -1].astype('float32')
test_data = DF.convex_1(dim=dim, num=1 * batch_size)
x_test = test_data[:, :-1].astype('float32')
y_test = test_data[:, -1].astype('float32')

# dataset 2d
sampled_data = sample_data(train_data, sample_num=train_num)
feed_data = tf.data.Dataset.from_tensor_slices(sampled_data).batch(batch_size)

# GP
kernel = tfp.math.psd_kernels.ExponentiatedQuadratic(
    amplitude=tf.Variable(1.0, dtype=np.float32, name="amplitude"),
    length_scale=tf.Variable(1.0, dtype=np.float32, name="length_scale"),
)  # k(x, y) = amplitude**2 * exp(-||x - y||**2 / (2 * length_scale**2)), i.e. the RBF kernel

gp = tfp.distributions.GaussianProcess(kernel)
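# Illustrative check, not part of the original script (assumes TF 2.x eager
# execution): with the initial parameters, k(x, x) = amplitude**2 * exp(0) = 1.0.
_k_xx = kernel.apply(tf.zeros([1, dim]), tf.zeros([1, dim]))  # ~= [1.0]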

# define training process
model_2d = ContEncoder(dest_dim=dim - 1, original_dim=dim)
opt_2d = tf.keras.optimizers.Adam(learning_rate=2e-1)

train_gp_loop_2d(
Example No. 4
g_optim = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(
    g_loss, var_list=g_vars)

# End: Build model.
################################################################################

# Start session.
sess = tf.Session()
sess.run(tf.global_variables_initializer())

if not os.path.exists(log_dir):
    os.makedirs(log_dir)

# train()
for it in range(max_iter):
    x_batch, w_batch = sample_data(data_normed, data_raw_weights, batch_size)
    z_batch = get_sample_z(batch_size, noise_dim)

    for _ in range(5):
        _, d_logit_real_, d_logit_fake_, d_loss_, g_loss_ = sess.run(
            [d_optim, d_logit_real, d_logit_fake, d_loss, g_loss],
            feed_dict={
                z: z_batch,
                x: x_batch
            })
    for _ in range(1):
        _, d_logit_real_, d_logit_fake_, d_loss_, g_loss_ = sess.run(
            [g_optim, d_logit_real, d_logit_fake, d_loss, g_loss],
            feed_dict={
                z: z_batch,
                x: x_batch
            })
Example No. 5
import config
import parse_movies
import utils
import numpy as np

import os

from sklearn.naive_bayes import MultinomialNB
#from sklearn.feature_extraction import CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer

all_movies = list(
    parse_movies.load_all_movies(os.path.join(config.baseDir,
                                              config.data_file)))
#sample the data to 6000 for each decade from 1930 to 2010
sampled_movies = utils.sample_data(all_movies, 6000)

#split the data to train and test datasets
train_data = []
test_data = []
flip = True
for m in sampled_movies:
    if (flip):
        train_data.append(m)
        flip = False
    else:
        test_data.append(m)
        flip = True
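# Note (aside, not in the original script): the alternating split above is
# equivalent to simple slicing, e.g.
#   train_data, test_data = sampled_movies[0::2], sampled_movies[1::2]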

#===============================================
#4a Use sklearn library to train the data
Example No. 6
# Start session.
sess = tf.Session()
sess.run(tf.global_variables_initializer())

if not os.path.exists(log_dir):
    os.makedirs(log_dir)

# train()
# Start clock to time execution in chunks of log_step steps.
t0 = time.time()
for step in range(max_step):
    #####
    # BEGINNING OF TIMED SEGMENT

    x_batch_preup, w_batch_preup = sample_data(
        data_normed, data_raw_weights, batch_size)
    z_batch = get_sample_z(batch_size, noise_dim)

    # UPSAMPLE WITHIN BATCH.
    x_batch = x_batch_preup[:]
    if sampling == 'random':
        # Upsample then randomly select batch_size to use.
        #for x_, w_ in zip(x_batch_preup, w_batch_preup):
        #    k = int(round(w_))
        #    for _ in range(k - 1):
        #        x_batch.append(x_)
        #x_batch = np.reshape(x_batch, [-1, data_dim])
        #x_batch = x_batch[np.random.choice(len(x_batch), batch_size)]
        for x_, w_ in zip(x_batch_preup, w_batch_preup):
            k = int(round(w_))
            for _ in range(k - 1):
Example No. 7
def train():
    for i in range(train_iter):
        ts = time.time()
        print "[{}%]".format(i / float(train_iter) * 100)
        mini_batch = []
        idxs = []
        is_weight = []
        old_q = []
        _mini_batch, _idxs, _is_weight = utils.sample_data(
            suction_1_memory, mini_batch_size)
        mini_batch += _mini_batch
        idxs += _idxs
        is_weight += list(_is_weight)
        tmp = [idx - memory_capacity[0] - 1 for idx in _idxs]
        suction_1_sampled[tmp] += 1
        _mini_batch, _idxs, _is_weight = utils.sample_data(
            suction_2_memory, mini_batch_size)
        mini_batch += _mini_batch
        idxs += _idxs
        is_weight += list(_is_weight)
        tmp = [idx - memory_capacity[1] - 1 for idx in _idxs]
        suction_2_sampled[tmp] += 1
        _mini_batch, _idxs, _is_weight = utils.sample_data(
            gripper_memory, mini_batch_size)
        mini_batch += _mini_batch
        idxs += _idxs
        is_weight += list(_is_weight)
        tmp = [idx - memory_capacity[2] - 1 for idx in _idxs]
        gripper_sampled[tmp] += 1
        for j in range(len(mini_batch)):
            color = cv2.imread(mini_batch[j].color)
            depth = np.load(mini_batch[j].depth)
            pixel_index = mini_batch[j].pixel_idx
            next_color = cv2.imread(mini_batch[j].next_color)
            next_depth = np.load(mini_batch[j].next_depth)
            action_str, rotate_idx = utils.get_action_info(pixel_index)
            old_q.append(
                trainer.forward(color,
                                depth,
                                action_str,
                                False,
                                rotate_idx,
                                clear_grad=True)[0, pixel_index[1],
                                                 pixel_index[2]])
            reward = mini_batch[j].reward
            td_target = trainer.get_label_value(reward, next_color, next_depth,
                                                mini_batch[j].is_empty,
                                                pixel_index[0])
            loss_ = trainer.backprop(color, depth, pixel_index, td_target,
                                     is_weight[j], mini_batch_size, j == 0,
                                     j == len(mini_batch) - 1)
        # Update priority
        for j in range(len(mini_batch)):
            color = cv2.imread(mini_batch[j].color)
            depth = np.load(mini_batch[j].depth)
            pixel_index = mini_batch[j].pixel_idx
            next_color = cv2.imread(mini_batch[j].next_color)
            next_depth = np.load(mini_batch[j].next_depth)
            reward = mini_batch[j].reward
            td_target = trainer.get_label_value(reward, next_color, next_depth,
                                                mini_batch[j].is_empty,
                                                pixel_index[0])
            action_str, rotate_idx = utils.get_action_info(pixel_index)
            new_value = trainer.forward(color,
                                        depth,
                                        action_str,
                                        False,
                                        rotate_idx,
                                        clear_grad=True)[0, pixel_index[1],
                                                         pixel_index[2]]
            if j / mini_batch_size == 0:
                suction_1_memory.update(idxs[j], td_target - new_value)
            elif j / mini_batch_size == 1:
                suction_2_memory.update(idxs[j], td_target - new_value)
            else:
                gripper_memory.update(idxs[j], td_target - new_value)
            #print "Q value: {} -> {}| TD target: {}".format(old_q[j], new_value, td_target)
        if (i + 1) % save_freq == 0:
            print "Save model"
            torch.save(trainer.behavior_net.state_dict(),
                       save_root + "/{}.pth".format(i + 1))
            color = cv2.imread(compare_color)
            depth = np.load(compare_depth)
            suck_1_prediction, suck_2_prediction, grasp_prediction = trainer.forward(
                color, depth, is_volatile=True)
            heatmaps, mixed_imgs = utils.save_heatmap_and_mixed(
                suck_1_prediction, suck_2_prediction, grasp_prediction,
                feat_path, mixed_path, color, i + 1)
            np.savetxt(save_root + "/suction_1_sampled.csv",
                       suction_1_sampled,
                       delimiter=",")
            np.savetxt(save_root + "/suction_2_sampled.csv",
                       suction_2_sampled,
                       delimiter=",")
            np.savetxt(save_root + "/gripper_sampled.csv",
                       gripper_sampled,
                       delimiter=",")
        if (i + 1) % copy_target_net == 0:
            trainer.target_net.load_state_dict(
                trainer.behavior_net.state_dict())
        print "Took {} seconds".format(time.time() - ts)
Example No. 8
    def train(self,
              X,
              n_iter=1000,
              w0=None,
              rate=0.01,
              alpha=0.5,
              mu=1e-6,
              sample=False,
              n_samples=100,
              evidence=None,
              warm_starts=False,
              tol=1e-6,
              verbose=True):
        """
        Perform SGD wrt the weights w
        * n_iter:      Number of steps of SGD
        * w0:          Initial value for weights w
        * rate:        I.e. the SGD step size
        * alpha:       Elastic net penalty mixing parameter (0=ridge, 1=lasso)
        * mu:          Elastic net penalty
        * sample:      Whether to sample or not
        * n_samples:   Number of samples per SGD step
        * evidence:    Ground truth to condition on
        * warm_starts:
        * tol:         For testing for SGD convergence, i.e. stopping threshold
        """
        self.X_train = X

        # Set up stuff
        N, M = X.shape
        print "=" * 80
        print "Training marginals (!= 0.5):\t%s" % N
        print "Features:\t\t\t%s" % M
        print "=" * 80
        Xt = X.transpose()
        Xt_abs = sparse_abs(Xt) if sparse.issparse(Xt) else np.abs(Xt)
        w0 = w0 if w0 is not None else np.ones(M)

        # Initialize training
        w = w0.copy()
        g = np.zeros(M)
        l = np.zeros(M)
        g_size = 0

        # Gradient descent
        if verbose:
            print "Begin training for rate={}, mu={}".format(rate, mu)
        for step in range(n_iter):

            # Get the expected LF accuracy
            t, f = sample_data(X, w,
                               n_samples=n_samples) if sample else exact_data(
                                   X, w, evidence)
            p_correct, n_pred = transform_sample_stats(Xt, t, f, Xt_abs)

            # Get the "empirical log odds"; NB: this assumes one is correct, clamp is for sampling...
            l = np.clip(log_odds(p_correct), -10, 10)

            # SGD step with normalization by the number of samples
            g0 = (n_pred * (w - l)) / np.sum(n_pred)

            # Momentum term for faster training
            g = 0.95 * g0 + 0.05 * g

            # Check for convergence
            wn = np.linalg.norm(w, ord=2)
            g_size = np.linalg.norm(g, ord=2)
            if step % 250 == 0 and verbose:
                print "\tLearning epoch = {}\tGradient mag. = {:.6f}".format(
                    step, g_size)
            if (wn < 1e-12 or g_size / wn < tol) and step >= 10:
                if verbose:
                    print "SGD converged for mu={} after {} steps".format(
                        mu, step)
                break

            # Update weights
            w -= rate * g

            # Apply elastic net penalty
            w_bias = w[-1]
            soft = np.abs(w) - mu
            ridge_pen = (1 + (1 - alpha) * mu)

            #          \ell_1 penalty by soft thresholding        |  \ell_2 penalty
            w = (np.sign(w) *
                 np.select([soft > 0], [soft], default=0)) / ridge_pen

            # Don't regularize the bias term
            if self.bias_term:
                w[-1] = w_bias

        # SGD did not converge
        else:
            if verbose:
                print "Final gradient magnitude for rate={}, mu={}: {:.3f}".format(
                    rate, mu, g_size)

        # Return learned weights
        self.w = w
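For intuition, the elastic-net step above is soft thresholding (the l1 part) followed by a ridge shrink (the l2 part); a self-contained NumPy illustration with made-up weights:

import numpy as np

w = np.array([0.8, -0.3, 0.05])
mu, alpha = 0.1, 0.5
soft = np.abs(w) - mu
w = (np.sign(w) * np.select([soft > 0], [soft], default=0)) / (1 + (1 - alpha) * mu)
# w is now approximately [0.667, -0.190, 0.0]: entries within mu of zero are
# zeroed out, and the remaining entries are shrunk toward zero.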
Example No. 9
#==============================================
# 2d. PMF of P(Y|X"the">0)
#==============================================
pmf, data_year = cal_pmf(all_movies, 'the')
n = len(data_year)
x = []
y = []
for year, amount in pmf.iteritems():
    x.append(year)
    y.append(float(amount) / float(n))
utils.histogram(x, y, 'Decade', 'PMF', 'Balanced PMF of P(Y|X"the">0)')
print 'Balanced PMF of P(Y|X"the">0) done'

#sample the data to 6000 for each decade from 1930 to 2010
sampled_movies = utils.sample_data(all_movies, 6000, (1930, 2010))

#==============================================
# 2e. PMF of P(Y|X"radio">0)
#==============================================
pmf, data_year = cal_pmf(sampled_movies, 'radio')
n = len(data_year)
x = []
y = []
for year, amount in pmf.iteritems():
    x.append(year)
    y.append(float(amount) / float(n))
utils.histogram(x, y, 'Decade', 'PMF', 'Balanced PMF of P(Y|X"radio">0)')
print 'Balanced PMF of P(Y|X"radio">0) done'

#==============================================
Example No. 10
import config
import parse_movies
import utils
import numpy as np

import os

from sklearn.naive_bayes import MultinomialNB
#from sklearn.feature_extraction import CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer

all_movies = list(
    parse_movies.load_all_movies(os.path.join(config.baseDir,
                                              config.data_file)))
#sample the data to 6000 for each decade from 1930 to 2010
sampled_movies = utils.sample_data(all_movies, 6000)

#split the data to train and test datasets
train_data = []
test_data = []
flip = True
for m in sampled_movies:
    if (flip):
        train_data.append(m)
        flip = False
    else:
        test_data.append(m)
        flip = True

#===============================================
#4a Use sklearn library to train the data
#for every item in the training data, build its bag of words, convert it to a feature vector, and pair it with its year
sumList = []
Example No. 11
            # Jump to the previous cycle for restart.
            if epoch == initial_epoch:
                if cycle < initial_cycle:
                    continue
            retval_list = mp.Manager().list()  # Is this needed?
            # List of multiprocessing.managers.ListProxy to collect losses
            retval_list = [mp.Manager().list() for i in range(args.ncpus)]
            st = time.time()
            processes = []
            for pid in range(args.ncpus):
                # Sample keys without considering activeness.
                if args.active_ratio is None:
                    keys = random.sample(id_to_smiles.keys(), args.item_per_cycle)
                # Sample active and inactive keys by the required ratio.
                else:
                    keys = utils.sample_data(id_to_whole_conditions, args.item_per_cycle, args.active_ratio)

                # Property (descriptor) values work as conditions.
                # We need both of whole and scaffold values.
                # whole_conditions := [
                #     [ value1, value2, ... ],  # condition values of whole 1
                #     [ value1, value2, ... ],  # condition values of whole 2
                #     ... ]
                whole_conditions = [id_to_whole_conditions[key] for key in keys]
                scaffold_conditions = [id_to_scaffold_conditions[key] for key in keys]

                # SMILESs of whole molecules and scaffolds.
                wholes = [id_to_smiles[key][0] for key in keys]
                scaffolds = [id_to_smiles[key][1] for key in keys]

                proc = mp.Process(target=train, args=(shared_model, shared_optimizer, wholes, scaffolds, whole_conditions, scaffold_conditions, pid, retval_list, args))
Example No. 12
                              oe_path=oe_path,
                              doc2vec_model=doc2vec_model,
                              **params)
            # combine
            if params["weakly_supervised"]:
                df = df.append(df_weakly).reset_index(drop=True)

            # label test set
            df_test["label"] = 0
            df_test.loc[~df_test.target.isin(params["test_outliers"]),
                        "label"] = 1
            df_test["outlier_label"] = -1
            df_test.loc[~df_test.target.isin(params["test_outliers"]),
                        "outlier_label"] = 1
            # sampling the df_test set
            df_test = sample_data(df_test, 1.0, 0.1, 42)
            df_test = df_test[df_test.target.isin(params["test_outliers"] +
                                                  params["test_inliers"])]
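            # Aside (illustrative, not in the original script): the block above
            # prepares the test set: label = 1 / outlier_label = 1 for inliers,
            # label = 0 / outlier_label = -1 for test outliers, then subsamples
            # it with sample_data and keeps only the chosen test classes.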

            print("df_train")
            print(df.label.value_counts())
            print(df.target.value_counts())

            print("df_test")
            print(df_test.label.value_counts())
            print(df_test.target.value_counts())

            #####
            # train
            #####
            # UMAP Train
Example No. 13
if suction_1_memory_buffer.length > mini_batch_size and \
   suction_2_memory_buffer.length > mini_batch_size and \
   gripper_memory_buffer.length   > mini_batch_size:
    sufficient_exp += 1
    if (sufficient_exp - 1) % learning_freq == 0:
        back_ts = time.time()
        if arduino: arduino.write("b 1000")
        learned_times += 1
        mini_batch = []
        idxs = []
        is_weight = []
        old_q = []
        td_target_list = []
        if specific_tool is not None:
            if specific_tool == 0:
                mini_batch, idxs, is_weight = utils.sample_data(
                    suction_1_memory_buffer, mini_batch_size)
            elif specific_tool == 1:
                mini_batch, idxs, is_weight = utils.sample_data(
                    suction_2_memory_buffer, mini_batch_size)
            elif specific_tool == 2:
                mini_batch, idxs, is_weight = utils.sample_data(
                    gripper_memory_buffer, mini_batch_size)
        else:
            _mini_batch, _idxs, _is_weight = utils.sample_data(
                suction_1_memory_buffer, mini_batch_size)
            mini_batch += _mini_batch
            idxs += _idxs
            is_weight += list(_is_weight)
            _mini_batch, _idxs, _is_weight = utils.sample_data(
                suction_2_memory_buffer, mini_batch_size)
            mini_batch += _mini_batch