Example 1
    def optimize(self, opt_data):
        #self.sigma_p = 1
        #self.sigma_y = 1
        #self.C = 1
        assert (opt_data.instances_to_keep is None or
                opt_data.instances_to_keep.sum() == 0), 'Not implemented yet!'
        W_p = density.compute_kernel(opt_data.X, None, self.sigma_p)
        W_y = array_functions.make_rbf(opt_data.X, self.sigma_y)
        n = W_p.shape[0]
        selected = array_functions.false(n)
        y_true = self.f_x
        p_true = self.p_x
        for i in range(opt_data.subset_size):
            new_scores = np.zeros(n)
            new_scores[:] = np.inf
            for j in range(n):
                if selected[j]:
                    continue
                b = array_functions.false(n)
                b[j] = True
                new_scores[j] = self.evaluate_selection(W_p, W_y, b | selected, y_true, p_true)
            best_idx = new_scores.argmin()
            selected[best_idx] = True

        self.selected = selected
        if selected.sum() < opt_data.subset_size:
            # print 'Empty clusters'
            pass
        # self.learned_distribution = compute_p(selected, opt_data)
        self.learned_distribution = selected
        self.optimization_value = 0
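
Note: every snippet on this page builds its selection mask with array_functions.false, a project helper that is not defined here. A minimal stand-in, assuming it simply returns an all-False boolean vector (with array_functions.true as the all-True counterpart used in Examples 3, 14, 15 and 21), would be:

import numpy as np

def false(n):
    # Assumed behavior of array_functions.false: an all-False boolean mask of length n.
    return np.zeros(n, dtype=bool)

def true(n):
    # Assumed behavior of array_functions.true: the all-True counterpart.
    return np.ones(n, dtype=bool)

# Usage: mark two of five items as selected, as the greedy loops above do one index at a time.
selected = false(5)
selected[[1, 3]] = True
print(selected)  # [False  True False  True False]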
Example 2
 def optimize_for_data(self, W, num_to_select):
     selected = array_functions.false(W.shape[0])
     for i in range(num_to_select):
         new_scores = np.zeros(W.shape[0])
         new_scores[:] = -np.inf
         for j in range(W.shape[0]):
             if selected[j]:
                 continue
             b = array_functions.false(W.shape[0])
             b[j] = True
             new_scores[j] = self.evaluate_selection(W, selected | b)
         best_idx = new_scores.argmax()
         selected[best_idx] = True
     return selected
Example 3
    def optimize(self, opt_data):
        assert opt_data.instances_to_keep is None, 'Not implemented yet!'
        W_x = array_functions.make_rbf(opt_data.X, self.sigma_x)
        W = W_x
        if not self.no_f_x:
            W_y = array_functions.make_rbf(opt_data.Y, self.sigma_y)
            W = W_x * W_y
        n = W.shape[0]
        selected = array_functions.false(W.shape[0])
        splits = [array_functions.true(n)]
        num_per_split = [opt_data.subset_size]
        if self.num_class_splits is not None:
            assert self.num_class_splits == 2
            I1 = opt_data.Y <= opt_data.Y.mean()
            splits = [I1, ~I1]
            num_per_split = [opt_data.subset_size // 2, opt_data.subset_size // 2]  # integer division keeps the counts usable as range() bounds
        for split, num in zip(splits, num_per_split):
            W_split = W[np.ix_(split, split)]
            split_selections = self.optimize_for_data(W_split, num)
            split_inds = split.nonzero()[0]
            selected[split_inds[split_selections]] = True

        #selected = self.compute_centroids_for_spectral_clustering(W, cluster_inds)
        self.W = W
        self.selected = selected
        if selected.sum() < opt_data.subset_size:
            #print 'Empty clusters'
            pass
        #self.learned_distribution = compute_p(selected, opt_data)
        self.learned_distribution = selected
        self.optimization_value = 0
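
Example 3 restricts the affinity matrix to each class split with np.ix_, runs the greedy routine from Example 2 on the submatrix, and then maps the split-local picks back to global indices. A small self-contained illustration of that masking pattern (the matrices and masks below are toy values, not the learner's):

import numpy as np

W = np.arange(36, dtype=float).reshape(6, 6)               # toy affinity matrix
split = np.array([True, False, True, True, False, True])   # instances belonging to this split

W_split = W[np.ix_(split, split)]                          # 4x4 submatrix for this split
split_selections = np.array([True, False, True, False])    # e.g. output of optimize_for_data

selected = np.zeros(6, dtype=bool)
split_inds = split.nonzero()[0]                            # global indices of the split
selected[split_inds[split_selections]] = True              # map local picks back to global positions
print(selected)                                            # [ True False False  True False False]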
Example 4
def create_synthetic_hypothesis_transfer(n=500, p=50, kt=1, ks=1, sigma=1.0, sigma_s=0.3):
    wt = np.random.normal(0, sigma, p)
    all_data, w_eff = create_synthetic_linear_classification(n=n, p=p, sigma=sigma, w=wt)
    x = all_data.x
    all_data.data_set_ids = np.zeros(n)
    wt = w_eff
    data_set_counter = 1
    diffs = []
    is_target = array_functions.false(kt + ks)
    is_target[:kt] = True
    all_data.true_w = np.zeros((ks + kt + 1, p))
    all_data.true_w[0, :] = wt
    for i, val in enumerate(is_target):
        data_set_id = data_set_counter
        data_set_counter += 1
        if val:
            ws = wt + np.random.normal(0, sigma_s, p)
            ws = wt  # NOTE: this overrides the noisy draw above, so target-related sources use wt exactly
        else:
            ws = np.random.normal(0, sigma, p)
        source_data, ws = create_synthetic_linear_classification(w=ws, x=x)
        source_data.data_set_ids = data_set_id * np.ones(n)
        # source_data.true_y *= (i+2)
        source_data.y = source_data.true_y
        all_data.combine(source_data)
        diff = norm(wt / norm(wt) - ws / norm(ws))
        diffs.append(diff)
        all_data.true_w[data_set_id, :] = ws
    all_data.true_w = all_data.true_w.T
    all_data.metadata = dict()
    all_data.metadata["true_w"] = all_data.true_w
    s = synthetic_hypothesis_transfer_class_file % (
        str(n) + "-" + str(p) + "-" + str(sigma) + "-" + str(sigma_s) + "-" + str(kt) + "-" + str(ks)
    )
    helper_functions.save_object(s, all_data)
Example 5
def split_data(file, configs):
    data = helper_functions.load_object(file)
    splitter = DataSplitter()
    splitData = data_lib.SplitData()
    splitData.data = data
    num_splits = 30
    perc_train = .8
    keep_for_splitting = None
    if configs.split_data_set_ids is not None:
        keep_for_splitting = array_functions.false(data.n)
        keep_for_splitting[data.data_set_ids == 0] = True
    #Pretend data_set_ids is a label vector to ensure each data set is split equally
    if data.is_regression and data.data_set_ids is not None:
        assert len(data.data_set_ids) == data.n
        is_regression = False
        splitData.splits = splitter.generate_splits(
            data.data_set_ids,
            num_splits,
            perc_train,
            is_regression,
            keep_for_splitting
        )
    else:
        splitData.splits = splitter.generate_splits(
            data.y,
            num_splits,
            perc_train,
            data.is_regression,
            keep_for_splitting
        )
    splitData.data_set_ids_to_keep = configs.data_set_ids_to_keep
    split_dir = os.path.dirname(file)
    save_file = split_dir + '/split_data.pkl'
    helper_functions.save_object(save_file,splitData)
    return splitData
Example 6
 def get_min_range(self):
     ranges = self.get_series_range()
     min_range = ranges[:,0].max()
     max_range = ranges[:,1].min()
     I = array_functions.false(self.n)
     I[min_range:max_range+1] = True
     return self.get_subset(I)
Example 7
 def get_min_range(self):
     ranges = self.get_series_range()
     min_range = ranges[:, 0].max()
     max_range = ranges[:, 1].min()
     I = array_functions.false(self.n)
     I[min_range:max_range + 1] = True
     return self.get_subset(I)
Example 8
def split_data(file, configs):
    data = helper_functions.load_object(file)
    data.is_regression = configs.is_regression
    splitter = DataSplitter()
    splitData = data_lib.SplitData()
    splitData.data = data
    num_splits = 30
    perc_train = .8
    keep_for_splitting = None
    if configs.split_data_set_ids is not None:
        keep_for_splitting = array_functions.false(data.n)
        keep_for_splitting[data.data_set_ids == 0] = True
    #Pretend data_set_ids is a label vector to ensure each data set is split equally
    if data.is_regression and data.data_set_ids is not None:
        assert len(data.data_set_ids) == data.n
        is_regression = False
        splitData.splits = splitter.generate_splits(data.data_set_ids,
                                                    num_splits, perc_train,
                                                    is_regression,
                                                    keep_for_splitting)
    else:
        splitData.splits = splitter.generate_splits(data.y, num_splits,
                                                    perc_train,
                                                    data.is_regression,
                                                    keep_for_splitting)
    splitData.data_set_ids_to_keep = configs.data_set_ids_to_keep
    split_dir = os.path.dirname(file)
    save_file = split_dir + '/split_data.pkl'
    helper_functions.save_object(save_file, splitData)
    return splitData
Example 9
def keep_subset(I, num_to_keep):
    inds = I.nonzero()[0]
    if num_to_keep > inds.size:
        return I
    inds_to_keep = np.random.choice(inds, num_to_keep, replace=False)
    v = array_functions.false(I.size)
    v[inds_to_keep] = True
    return v
Example 10
 def sample_from_clusters(self, W, cluster_inds, num_samples):
     v, counts = np.unique(cluster_inds, return_counts=True)
     counts = counts.astype(np.float)
     frequency = counts / counts.sum()
     is_representative = array_functions.false(cluster_inds.size)
     for idx, freq in zip(v, frequency):
         if freq > 1.5 / v.size:
             is_representative[cluster_inds == idx] = True
     if not is_representative.any():
         is_representative[:] = True
     cluster_samples = np.random.choice(np.nonzero(is_representative)[0], num_samples, replace=False)
     return array_functions.make_vec_binary(cluster_samples, cluster_inds.size)
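
Several snippets here (Examples 10, 22, 23, 24, 27 and 34) still use the np.float, np.int and np.str aliases, which NumPy removed in version 1.24; on a current NumPy they raise AttributeError. The Python builtins (or explicit dtypes such as np.float64) are drop-in replacements:

import numpy as np

counts = np.array([3, 1, 2])
counts = counts.astype(float)                  # was counts.astype(np.float) in Example 10
date_ids = np.array([1.0, 2.0]).astype(int)    # was .astype(np.int) in Examples 22 and 34
stations = np.array(['a', 'b']).astype(str)    # was .astype(np.str) in Examples 24 and 27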
Example 11
def subset_1_per_instance_id():
    data = helper_functions.load_object('data_sets/' + create_data_set.adience_aligned_cnn_file)
    to_keep = array_functions.false(data.n)
    all_ids = np.unique(data.instance_ids)
    for id in all_ids:
        has_id = (data.instance_ids == id).nonzero()[0]
        to_keep[has_id[0]] = True
        pass
    to_keep = to_keep & data.is_labeled
    data = data.get_subset(to_keep)
    helper_functions.save_object('data_sets/' + create_data_set.adience_aligned_cnn_1_per_instance_id_file,
                                 data)
    pass
Example 12
    def generate_splits(self,
                        y,
                        num_splits=30,
                        perc_train=.8,
                        is_regression=False,
                        keep_for_splitting=None):
        assert y.ndim == 1
        keep_in_train_set = array_functions.false(len(y))
        if keep_for_splitting is not None and len(keep_for_splitting) > 0:
            keep_in_train_set[~keep_for_splitting] = True
            #keep_in_train_set[~array_functions.to_boolean(keep_for_splitting)] = True
        is_labeled = ~np.isnan(y)
        keep_in_train_set[~is_labeled] = True

        n = len(y)
        #if keep_for_splitting is not None:
        #   y_for_split = y[keep_for_splitting]
        #  target_inds = keep_for_splitting.nonzero()[0]
        y_for_split = y[~keep_in_train_set]
        n_for_split = len(y_for_split)
        inds_for_splitting = (~keep_in_train_set).nonzero()[0]
        random_state = None
        if is_regression:
            split = cross_validation.ShuffleSplit(n_for_split,
                                                  num_splits,
                                                  1 - perc_train,
                                                  random_state=random_state)
        else:
            split = cross_validation.StratifiedShuffleSplit(
                y_for_split,
                num_splits,
                1 - perc_train,
                random_state=random_state)
        splits = []
        for train, test in split:
            s = data_lib.Split(n)
            s.is_train[:] = True
            s.is_train[inds_for_splitting[test]] = False
            '''
            if keep_for_splitting is not None:
                s.is_train[:] = True
                s.is_train[target_inds[test]] = False
            else:
                s.is_train[train] = True
                s.is_train[test] = False
            '''
            s.permutation = np.random.permutation(n)
            splits.append(s)

        return splits
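
generate_splits is written against the pre-0.18 sklearn.cross_validation interface, where ShuffleSplit and StratifiedShuffleSplit took the labels (or the sample count) and the number of iterations in the constructor and were iterated directly. Current scikit-learn moved these to sklearn.model_selection and passes the data to .split(); a rough modern equivalent of the stratified branch would look like this (variable names follow the example, the labels are a toy vector):

import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit

y_for_split = np.array([0, 0, 0, 1, 1, 1, 0, 1, 0, 1])  # labels of the instances eligible for splitting
num_splits, perc_train = 5, 0.8

sss = StratifiedShuffleSplit(n_splits=num_splits, test_size=1 - perc_train)
for train, test in sss.split(np.zeros((len(y_for_split), 1)), y_for_split):
    # train and test are index arrays into y_for_split, as in the old API's iteration.
    pass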
Example 13
 def create_sampling_distribution(self, base_learner, data, fold_results):
     is_train_unlabeled = data.is_train & (~data.is_labeled)
     is_train_labeled = data.is_train & data.is_labeled
     inds = np.nonzero(is_train_unlabeled)[0]
     inds = inds[:50]
     I = array_functions.false(data.n)
     I[inds] = True
     x = data.x[I, :]
     x_labeled = data.x[is_train_labeled, :]
     if self.use_labeled:
         x_all = np.vstack((x, x_labeled))
         self.transform.fit(x_all)
         x = self.transform.transform(x)
         x_labeled = self.transform.transform(x_labeled)
     else:
         x = self.transform.fit_transform(x)
     C = base_learner.params['alpha']
     n = I.sum()
     t0 = np.zeros((n,1))
     opt_data = OptimizationData(x, C)
     if self.use_labeled:
         opt_data.x_labeled = x_labeled
     constraints = [
         {
             'type': 'eq',
             'fun': lambda t: t.sum() - 1
         },
         {
             'type': 'ineq',
             'fun': lambda t: t
         }
     ]
     options = {}
     results = optimize.minimize(
         lambda t: eval_oed(t, opt_data),
         t0,
         method='SLSQP',
         jac=None,
         options=options,
         constraints=constraints
     )
     if results.success:
         t = results.x
     else:
         print 'OED Optimization failed'
         t = np.ones(n)
     t[t < 0] = 0
     t += 1e-4
     t /= t.sum()
     return t, inds
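
create_sampling_distribution optimizes a sampling weight vector t with SLSQP, constraining it to the probability simplex (sum to one, non-negative), then clips, smooths and renormalizes the result. A compact standalone version of that setup, with a toy quadratic objective standing in for eval_oed:

import numpy as np
from scipy import optimize

n = 5
target = np.linspace(0.1, 0.3, n)

def objective(t):
    # Toy stand-in for eval_oed(t, opt_data): any smooth scalar function of t works here.
    return np.sum((t - target) ** 2)

constraints = [
    {'type': 'eq',   'fun': lambda t: t.sum() - 1},  # weights sum to one
    {'type': 'ineq', 'fun': lambda t: t},            # elementwise t >= 0
]
t0 = np.full(n, 1.0 / n)
res = optimize.minimize(objective, t0, method='SLSQP', constraints=constraints)

t = res.x if res.success else np.ones(n)
t[t < 0] = 0   # clip tiny negative values from the solver
t += 1e-4      # avoid exactly-zero sampling probabilities
t /= t.sum()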
Example 14
    def rand_sample(self,perc=.1,to_sample=None):
        if to_sample is None:
            to_sample = array_functions.true(self.n)
        if to_sample.dtype != 'bool':
            I = array_functions.false(self.n)
            I[to_sample] = True
            to_sample = I

        to_keep = (~to_sample).nonzero()[0]
        to_sample = to_sample.nonzero()[0]
        p = np.random.permutation(to_sample.shape[0])
        m = int(np.ceil(perc*p.shape[0]))  # cast to int so m is a valid slice bound (cf. Example 15)
        to_use = to_sample[p[:m]]
        to_use = np.hstack((to_use,to_keep))
        return self.get_subset(to_use)
Example 15
    def rand_sample(self, perc=.1, to_sample=None):
        if to_sample is None:
            to_sample = array_functions.true(self.n)
        if to_sample.dtype != 'bool':
            I = array_functions.false(self.n)
            I[to_sample] = True
            to_sample = I

        to_keep = (~to_sample).nonzero()[0]
        to_sample = to_sample.nonzero()[0]
        p = np.random.permutation(to_sample.shape[0])
        m = int(np.ceil(perc * p.shape[0]))
        to_use = to_sample[p[:m]]
        to_use = np.hstack((to_use, to_keep))
        return self.get_subset(to_use)
Example 16
def subset_1_per_instance_id():
    data = helper_functions.load_object(
        'data_sets/' + create_data_set.adience_aligned_cnn_file)
    to_keep = array_functions.false(data.n)
    all_ids = np.unique(data.instance_ids)
    for id in all_ids:
        has_id = (data.instance_ids == id).nonzero()[0]
        to_keep[has_id[0]] = True
        pass
    to_keep = to_keep & data.is_labeled
    data = data.get_subset(to_keep)
    helper_functions.save_object(
        'data_sets/' +
        create_data_set.adience_aligned_cnn_1_per_instance_id_file, data)
    pass
Example 17
 def create_sampling_distribution(self, base_learner, data, fold_results):
     is_train_unlabeled = data.is_train & (~data.is_labeled)
     is_train_labeled = data.is_train & data.is_labeled
     inds = np.nonzero(is_train_unlabeled)[0]
     inds = inds[:50]
     I = array_functions.false(data.n)
     I[inds] = True
     x = data.x[I, :]
     x_labeled = data.x[is_train_labeled, :]
     if self.use_labeled:
         x_all = np.vstack((x, x_labeled))
         self.transform.fit(x_all)
         x = self.transform.transform(x)
         x_labeled = self.transform.transform(x_labeled)
     else:
         x = self.transform.fit_transform(x)
     C = base_learner.params['alpha']
     n = I.sum()
     t0 = np.zeros((n, 1))
     opt_data = OptimizationData(x, C)
     if self.use_labeled:
         opt_data.x_labeled = x_labeled
     constraints = [{
         'type': 'eq',
         'fun': lambda t: t.sum() - 1
     }, {
         'type': 'ineq',
         'fun': lambda t: t
     }]
     options = {}
     results = optimize.minimize(lambda t: eval_oed(t, opt_data),
                                 t0,
                                 method='SLSQP',
                                 jac=None,
                                 options=options,
                                 constraints=constraints)
     if results.success:
         t = results.x
     else:
         print 'OED Optimization failed'
         t = np.ones(n)
     t[t < 0] = 0
     t += 1e-4
     t /= t.sum()
     return t, inds
Example 18
 def transform(self, data):
     should_add_noise = array_functions.false(data.n)
     for i in range(self.num_clusters):
         idx = np.random.choice(data.n)
         cluster_inds = array_functions.find_knn(data.x,
                                                 data.x[idx],
                                                 k=self.n_per_cluster)
         should_add_noise[cluster_inds] = True
     if self.save_y_orig:
         data.y_orig = data.true_y.copy()
     if should_add_noise.any():
         if self.flip_labels:
             data.flip_label(should_add_noise)
         else:
             data.true_y[should_add_noise] += self.y_offset
             data.y[should_add_noise] += self.y_offset
     data.is_noisy = should_add_noise
     return data
Example 19
def create_synthetic_hypothesis_transfer(n=500,
                                         p=50,
                                         kt=1,
                                         ks=1,
                                         sigma=1.0,
                                         sigma_s=.3):
    wt = np.random.normal(0, sigma, p)
    all_data, w_eff = create_synthetic_linear_classification(n=n,
                                                             p=p,
                                                             sigma=sigma,
                                                             w=wt)
    x = all_data.x
    all_data.data_set_ids = np.zeros(n)
    wt = w_eff
    data_set_counter = 1
    diffs = []
    is_target = array_functions.false(kt + ks)
    is_target[:kt] = True
    all_data.true_w = np.zeros((ks + kt + 1, p))
    all_data.true_w[0, :] = wt
    for i, val in enumerate(is_target):
        data_set_id = data_set_counter
        data_set_counter += 1
        if val:
            ws = wt + np.random.normal(0, sigma_s, p)
            ws = wt  # NOTE: this overrides the noisy draw above, so target-related sources use wt exactly
        else:
            ws = np.random.normal(0, sigma, p)
        source_data, ws = create_synthetic_linear_classification(w=ws, x=x)
        source_data.data_set_ids = data_set_id * np.ones(n)
        #source_data.true_y *= (i+2)
        source_data.y = source_data.true_y
        all_data.combine(source_data)
        diff = norm(wt / norm(wt) - ws / norm(ws))
        diffs.append(diff)
        all_data.true_w[data_set_id, :] = ws
    all_data.true_w = all_data.true_w.T
    all_data.metadata = dict()
    all_data.metadata['true_w'] = all_data.true_w
    s = synthetic_hypothesis_transfer_class_file % \
        (str(n) + '-' + str(p) + '-' + str(sigma) + '-' + str(sigma_s) + '-' + str(kt) + '-' + str(ks))
    helper_functions.save_object(s, all_data)
Example 20
    def generate_splits(self,y,num_splits=30,perc_train=.8,is_regression=False,keep_for_splitting=None):
        assert y.ndim == 1
        keep_in_train_set = array_functions.false(len(y))
        if keep_for_splitting is not None and len(keep_for_splitting) > 0:
            keep_in_train_set[~keep_for_splitting] = True
            #keep_in_train_set[~array_functions.to_boolean(keep_for_splitting)] = True
        is_labeled = ~np.isnan(y)
        keep_in_train_set[~is_labeled] = True

        n = len(y)
        #if keep_for_splitting is not None:
         #   y_for_split = y[keep_for_splitting]
          #  target_inds = keep_for_splitting.nonzero()[0]
        y_for_split = y[~keep_in_train_set]
        n_for_split = len(y_for_split)
        inds_for_splitting = (~keep_in_train_set).nonzero()[0]
        random_state = None
        if is_regression:
            split = cross_validation.ShuffleSplit(n_for_split,num_splits,1-perc_train,random_state=random_state)
        else:
            split = cross_validation.StratifiedShuffleSplit(y_for_split,num_splits,1-perc_train,random_state=random_state)
        splits = []
        for train,test in split:
            s = data_lib.Split(n)
            s.is_train[:] = True
            s.is_train[inds_for_splitting[test]] = False
            '''
            if keep_for_splitting is not None:
                s.is_train[:] = True
                s.is_train[target_inds[test]] = False
            else:
                s.is_train[train] = True
                s.is_train[test] = False
            '''
            s.permutation = np.random.permutation(n)
            splits.append(s)

        return splits
Example 21
def create_20ng_data(file_dir=''):
    newsgroups_train = datasets.fetch_20newsgroups(subset='train',
                                                   remove=('headers',
                                                           'footers',
                                                           'quotes'))
    data = data_class.Data()
    short_names = [
        #0
        'A',
        #1-5
        'C1',
        'C2',
        'C3',
        'C4',
        'C5',
        #6
        'M',
        #7-10
        'R1',
        'R2',
        'R3',
        'R4',
        #11-14
        'S1',
        'S2',
        'S3',
        'S4',
        #15
        'O',
        #16-19
        'T1',
        'T2',
        'T3',
        'T4'
    ]
    y = newsgroups_train.target
    #l = [1,2,7,8,12,17]
    #l = [1,2,7,8,12,13]
    #l = [0,1,2,3,4,5,7,8,9,10,11,12,13,14,16,17,18,19]
    l = [0, 1, 2, 7, 8, 11, 12, 16, 17]
    #l = [0, 1, 2, 3, 4, 7, 8, 9, 10,11,12,13,14,16,17,18,19]
    data.label_names = [short_names[i] for i in l]
    I = array_functions.false(len(newsgroups_train.target))
    for i in l:
        I = I | (y == i)
    #I = y == 1 | y == 2 | y == 7 | y == 7 | y == 11 | y == 16
    I = I.nonzero()[0]
    max_df = .5
    min_df = .01
    #max_df = .95
    #min_df = .001
    #max_df = .1
    #min_df = .01
    newsgroups_train.data = [newsgroups_train.data[i] for i in I]
    newsgroups_train.target = newsgroups_train.target[I]
    tf_idf = TfidfVectorizer(stop_words='english',
                             max_df=max_df,
                             min_df=min_df,
                             max_features=max_features)
    vectors = tf_idf.fit_transform(newsgroups_train.data)
    feature_counts = (vectors > 0).sum(0)
    vocab = helper_functions.invert_dict(tf_idf.vocabulary_)
    num_feats = len(vocab)
    vocab = [vocab[i] for i in range(num_feats)]

    #pca = PCA(n_components=pca_feats)
    #v2 = pca.fit_transform(vectors.toarray())
    v2 = vectors.toarray()
    vectors = v2

    y = newsgroups_train.target.copy()
    '''
    y[y==7] = 1
    y[(y==2) | (y==8)] = 2
    y[(y==12) | (y==17)] = 3
    '''
    '''
    y[y == 2] = 1
    y[(y==7) | (y==8)] = 2
    y[(y==12) | (y==13)] = 3
    #I_f = (y==1) | (y==7) | (y==11) | (y==16)
    I_f = array_functions.true(vectors.shape[0])
    f = f_classif
    k_best = SelectKBest(score_func=f, k=pca_feats)
    v2 = k_best.fit_transform(vectors[I_f,:], y[I_f])
    k_best.transform(vectors)
    s = k_best.get_support()
    selected_vocab = [vocab[i] for i in s.nonzero()[0]]
    vocab = selected_vocab
    vectors = v2
    '''

    data.x = vectors
    data.y = newsgroups_train.target
    data.set_train()
    data.set_target()
    data.set_true_y()
    data.is_regression = False
    data.feature_names = vocab
    class_counts = array_functions.histogram_unique(data.y)
    s = ng_raw_data_file
    if file_dir != '':
        s = file_dir + '/' + s
    helper_functions.save_object(s, data)
Example 22
feat_names, data = create_data_set.load_csv(
    file_name,
    True,
    dtype='str',
    delim=',',
    #num_rows=40000
    num_rows=100000000000)
y_names = [s + ' Mean' for s in [
    'NO2',
    'O3',
    'SO2',
    'CO',
]]
y_inds = []
for name in y_names:
    y_inds.append(array_functions.find_first_element(feat_names, name))
to_keep = array_functions.false(data.shape[0])
date_strs = data[:, find_first_element(feat_names, 'Date Local')]
prev = ''
date_str_to_idx = dict()
date_ids = np.zeros(data.shape[0])
for i, date_str in enumerate(date_strs):
    date_obj = to_date(date_str)
    date_str_to_idx[date_str] = date_obj.toordinal()
    date_ids[i] = date_obj.toordinal()
    if prev != date_str:
        to_keep[i] = True
        prev = date_str
data = data[to_keep, :]
date_strs = date_strs[to_keep]
date_ids = date_ids.astype(np.int)
date_ids = date_ids[to_keep]
Example 23
    region_ids_centroids = np.asarray(centroids_data.RegionID)
    region_ids_centroids = region_ids_centroids.astype(np.int)
    pricing_data = string_data.values[:, [year1_idx, year2_idx]]
    pricing_data = vec_replace(pricing_data).astype(np.float)

    #I_data = np.argsort(region_ids_data)
    I_centroids = np.argsort(region_ids_centroids)
    #r_data_sorted = region_ids_data[I_data]
    r_centroids_sorted = region_ids_centroids[I_centroids]
    #assert (r_data_sorted == r_centroids_sorted).all()
    centroid_x = np.asarray(centroids_data.X).astype(np.float)
    centroid_y = np.asarray(centroids_data.Y).astype(np.float)
    locs = np.stack((centroid_x, centroid_y),1)
    locs = locs[I_centroids, :]
    ca_pricing_data = np.zeros((centroid_x.shape[0], 2))
    has_data = array_functions.false(ca_pricing_data.shape[0])
    for i, id in enumerate(r_centroids_sorted):
        if (id == region_ids_data).sum() == 1:
            ca_pricing_data[i, :] = pricing_data[id == region_ids_data, :]
            has_data[i] = True

    locs = locs[has_data, :]
    locations = locs
    pricing_data = ca_pricing_data[has_data, :]

    I = np.isfinite(pricing_data[:, 0]) & np.isfinite(pricing_data[:, 1])
    I &= array_functions.in_range(locations[:,0], day_locs[:,0].min(), day_locs[:,0].max())
    I &= array_functions.in_range(locations[:,1], day_locs[:,1].min(), day_locs[:,1].max())
    #I &= array_functions.in_range(locations[:,0], -123, -121)
    #I &= array_functions.in_range(locations[:,1], 37, 39)
    # I &= (state == 'OR')
Example 24
        dtype='str',
        delim=',',
        num_rows=1000000000
    )
    inds_to_use = np.asarray([j for j in range(feat_names_curr.size) if feat_names_curr[j] in feats_to_keep])
    assert inds_to_use.size == len(feats_to_keep)
    data_curr = data_curr[:, inds_to_use]
    feat_names_curr = feat_names_curr[inds_to_use]
    if i == 0:
        feat_names = feat_names_curr
        data = data_curr
        continue

    unique_stations = np.unique(data[:, find_first_element(feat_names, 'STATION')].astype(np.str))
    curr_stations = data_curr[:, find_first_element(feat_names, 'STATION')].astype(np.str)
    to_remove = array_functions.false(data_curr.shape[0])
    for s in np.unique(curr_stations):
        if s not in unique_stations:
            continue
        print 'Found repeated station, removing: ' + s
        to_remove = to_remove | (curr_stations == s)
    data = np.vstack((data, data_curr[~to_remove,:]))
y_names = ['TAVG', 'TMIN', 'TMAX', 'PRCP']
y_inds = []
for name in y_names:
    y_inds.append(array_functions.find_first_element(feat_names, name))
date_strs = data[:, find_first_element(feat_names, 'DATE')]
prev = ''
date_str_to_idx = dict()
date_ids = np.zeros(data.shape[0])
for i, date_str in enumerate(date_strs):
Example 25
 def get_range(self, y_range):
     I = array_functions.false(self.n)
     I[y_range[0]:y_range[1]] = True
     return self.get_subset(I)
Example 26
 def get_nth(self, n):
     I = array_functions.false(self.n)
     I[::n] = True
     return self.get_subset(I)
Example 27
        j for j in range(feat_names_curr.size)
        if feat_names_curr[j] in feats_to_keep
    ])
    assert inds_to_use.size == len(feats_to_keep)
    data_curr = data_curr[:, inds_to_use]
    feat_names_curr = feat_names_curr[inds_to_use]
    if i == 0:
        feat_names = feat_names_curr
        data = data_curr
        continue

    unique_stations = np.unique(
        data[:, find_first_element(feat_names, 'STATION')].astype(np.str))
    curr_stations = data_curr[:, find_first_element(feat_names, 'STATION'
                                                    )].astype(np.str)
    to_remove = array_functions.false(data_curr.shape[0])
    for s in np.unique(curr_stations):
        if s not in unique_stations:
            continue
        print 'Found repeated station, removing: ' + s
        to_remove = to_remove | (curr_stations == s)
    data = np.vstack((data, data_curr[~to_remove, :]))
y_inds = []
for name in y_names:
    y_inds.append(array_functions.find_first_element(feat_names, name))
date_strs = data[:, find_first_element(feat_names, 'DATE')]
prev = ''
date_str_to_idx = dict()
date_ids = np.zeros(data.shape[0])
for i, date_str in enumerate(date_strs):
    date_obj = to_date(date_str)
Example 30
    '''
    for d in unique_dates:
        times_series_vals[d,i] = y[I[dates_idx == d]].mean()
    pass
    '''
    '''
    for j in I:
        print date_strs[j]
    '''
    '''
    print 'num_items: ' + str(I.size)
    print 'start: ' + date_strs[I[0]]
    print 'end: ' + date_strs[I[-1]]
    '''

has_loc = array_functions.false(unique_series_ids.size)
for i, id in enumerate(unique_series_ids):
    has_loc[i] = id in station_names
times_series_vals = times_series_vals[:, has_loc]
unique_series_ids = unique_series_ids[has_loc]
date_idx = 0
for i in range(0,num_days, 5):
    x = station_locs
    y = times_series_vals[i:120:28,:]
    array_functions.plot_heatmap(x, y.T, title=None, sizes=30)


data = (times_series_vals,unique_series_ids)
helper_functions.save_object('processed_data.pkl', data)

pass
Example 31
def create_20ng_data(file_dir=""):
    newsgroups_train = datasets.fetch_20newsgroups(subset="train", remove=("headers", "footers", "quotes"))
    data = data_class.Data()
    short_names = [
        # 0
        "A",
        # 1-5
        "C1",
        "C2",
        "C3",
        "C4",
        "C5",
        # 6
        "M",
        # 7-10
        "R1",
        "R2",
        "R3",
        "R4",
        # 11-14
        "S1",
        "S2",
        "S3",
        "S4",
        # 15
        "O",
        # 16-19
        "T1",
        "T2",
        "T3",
        "T4",
    ]
    data.label_names = short_names
    y = newsgroups_train.target
    l = [1, 2, 7, 8, 12, 17]
    # l = [1,2,7,8,12,13]
    I = array_functions.false(len(newsgroups_train.target))
    for i in l:
        I = I | (y == i)
    # I = y == 1 | y == 2 | y == 7 | y == 7 | y == 11 | y == 16
    I = I.nonzero()[0]
    max_df = 0.95
    min_df = 0.001
    # max_df = .1
    # min_df = .01
    newsgroups_train.data = [newsgroups_train.data[i] for i in I]
    newsgroups_train.target = newsgroups_train.target[I]
    tf_idf = TfidfVectorizer(stop_words="english", max_df=max_df, min_df=min_df, max_features=max_features)
    vectors = tf_idf.fit_transform(newsgroups_train.data)
    feature_counts = (vectors > 0).sum(0)
    vocab = helper_functions.invert_dict(tf_idf.vocabulary_)
    num_feats = len(vocab)
    vocab = [vocab[i] for i in range(num_feats)]

    pca = PCA(n_components=pca_feats)
    v2 = pca.fit_transform(vectors.toarray())
    vectors = v2

    y = newsgroups_train.target.copy()
    """
    y[y==7] = 1
    y[(y==2) | (y==8)] = 2
    y[(y==12) | (y==17)] = 3
    """
    """
    y[y == 2] = 1
    y[(y==7) | (y==8)] = 2
    y[(y==12) | (y==13)] = 3
    #I_f = (y==1) | (y==7) | (y==11) | (y==16)
    I_f = array_functions.true(vectors.shape[0])
    f = f_classif
    k_best = SelectKBest(score_func=f, k=pca_feats)
    v2 = k_best.fit_transform(vectors[I_f,:], y[I_f])
    k_best.transform(vectors)
    s = k_best.get_support()
    selected_vocab = [vocab[i] for i in s.nonzero()[0]]
    vocab = selected_vocab
    vectors = v2
    """

    data.x = vectors
    data.y = newsgroups_train.target
    data.set_defaults()
    data.is_regression = False
    data.feature_names = vocab
    class_counts = array_functions.histogram_unique(data.y)
    s = ng_raw_data_file
    if file_dir != "":
        s = file_dir + "/" + s
    helper_functions.save_object(s, data)
Example 32
    def create_sampling_distribution(self, base_learner, data, fold_results):
        cluster_scale = self.cluster_scale
        source_learner = deepcopy(self.base_learner)
        source_data = data.get_transfer_subset(self.configs.source_labels)
        if source_data.n > 1000:
            source_data = source_data.rand_sample(.2)
            print 'subsampling source data: ' + str(source_data.n)
        if source_data.is_regression:
            source_data.data_set_ids[:] = self.configs.target_labels[0]
        else:
            source_data.change_labels(self.configs.source_labels,
                                      self.configs.target_labels)
        tic()
        source_learner.train_and_test(source_data)
        print 'train source time: ' + toc_str()
        target_data = data.get_transfer_subset(self.configs.target_labels,
                                               include_unlabeled=True)
        y_pred = source_learner.predict(data).y
        if self.use_oracle_target:
            target_learner = deepcopy(self.base_learner)
            oracle_target_data = deepcopy(target_data)
            oracle_target_data.y = oracle_target_data.true_y
            oracle_target_data.is_train[:] = True
            target_learner.train_and_test(oracle_target_data)
            y_pred_target = target_learner.predict(data).y
            y_pred = y_pred_target
        if self.use_oracle_labels:
            y_pred = data.true_y.copy()

        n_items = self.configs.active_items_per_iteration
        I = data.is_train
        if not self.use_warm_start:
            I &= ~data.is_labeled
        if self.configs.target_labels is not None:
            I &= data.get_transfer_inds(self.configs.target_labels)
        I = I.nonzero()[0]
        if self.max_items_for_instance_selection is not None and \
                        I.size > self.max_items_for_instance_selection:
            I = np.random.choice(I,
                                 self.max_items_for_instance_selection,
                                 replace=False)
            print 'subsampling target data: ' + str(I.size)

        labeled_target_data = deepcopy(data.get_subset(I))
        instances_to_keep = labeled_target_data.is_labeled
        labeled_target_data.set_train()
        labeled_target_data.is_noisy = array_functions.false(
            labeled_target_data.n)

        labeled_target_data.y = y_pred[I].copy()
        labeled_target_data.true_y = y_pred[I].copy()
        labeled_target_data.y_orig = y_pred[I].copy()
        labeled_target_data.instances_to_keep = instances_to_keep

        #labeled_target_data.y_orig = labeled_target_data.true_y.copy()
        if self.use_instance_selection:
            self.instance_selector.subset_size = n_items
            self.instance_selector.num_samples = n_items
            self.instance_selector.configs.use_validation = False
            self.instance_selector.configs.use_training = True
            self.instance_selector.train_and_test(labeled_target_data)
            is_selected = self.instance_selector.predict(
                labeled_target_data).is_selected
            scores = np.ones(is_selected.size)
            #Lower score is better
            scores[is_selected] = 0
            scores_sorted_inds = np.argsort(scores)
            print ''
        elif self.use_density:
            target_learner = deepcopy(self.base_learner)
            target_learner.train_and_test(labeled_target_data)
            vars = self.estimate_variance(
                target_learner,
                labeled_target_data,
            )
            densities = self.estimate_density(labeled_target_data)
        else:
            X_sub = data.x[I, :]
            tic()
            X_cluster_space, cluster_ids = self.create_clustering(
                X_sub,
                int(cluster_scale * self.configs.active_items_per_iteration))
            print 'cluster target time: ' + toc_str()
            vars, cluster_n = self.get_cluster_purity(
                cluster_ids, y_pred[I], not target_data.is_regression)
            true_vars, true_cluster_n = self.get_cluster_purity(
                cluster_ids, data.true_y[I], not target_data.is_regression)
            if self.use_target_variance:
                vars = true_vars
            centroid_idx = self.get_cluster_centroids(X_cluster_space)
            densities = cluster_n
        if self.use_instance_selection:
            pass
        else:
            scores = vars / densities
            scores_sorted_inds = np.argsort(scores)

        # Don't sample instances if cluster size is 1
        if not self.use_density and not self.use_instance_selection:
            scores[cluster_n <= .005 * I.size] = np.inf
            to_use = centroid_idx[scores_sorted_inds[:n_items]]
        else:
            to_use = scores_sorted_inds[:n_items]
        if self.transfer_hyperparameters:
            target_learner = deepcopy(self.base_learner)
            target_learner.configs.use_validation = True
            labeled_target_data.y[~is_selected] = np.nan

            target_learner.train_and_test(labeled_target_data)
            self.base_learner.base_learner.cv_params = {'unused': [0]}
            self.base_learner.base_learner.best_params = target_learner.base_learner.best_params
            self.base_learner.base_learner.set_params(
                **target_learner.base_learner.best_params)
        d = np.zeros(data.y.shape)
        d[I[to_use]] = 1
        d = d / d.sum()
        return d, d.size
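
In the clustering branch of Example 32, candidates are ranked by scores = vars / densities (lower is better: low label variance in a cluster that holds many points) and the centroids of the cheapest n_items clusters are queried. A minimal sketch of that ranking step, with illustrative numbers rather than the learner's real quantities:

import numpy as np

vars_ = np.array([0.9, 0.1, 0.4, 0.05])        # per-cluster label impurity / variance
cluster_n = np.array([30.0, 25.0, 2.0, 40.0])  # per-cluster sizes, used as densities
centroid_idx = np.array([17, 3, 52, 8])        # index of each cluster's centroid in the data
n_items = 2

scores = vars_ / cluster_n                     # lower score = purer and denser cluster
scores[cluster_n <= 1] = np.inf                # never sample from near-empty clusters
to_use = centroid_idx[np.argsort(scores)[:n_items]]
print(to_use)                                  # centroids of the two best clusters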
Example 33
    '''
    for d in unique_dates:
        times_series_vals[d,i] = y[I[dates_idx == d]].mean()
    pass
    '''
    '''
    for j in I:
        print date_strs[j]
    '''
    '''
    print 'num_items: ' + str(I.size)
    print 'start: ' + date_strs[I[0]]
    print 'end: ' + date_strs[I[-1]]
    '''

has_loc = array_functions.false(unique_series_ids.size)
for i, id in enumerate(unique_series_ids):
    has_loc[i] = id in station_names
times_series_vals = times_series_vals[:, has_loc]
unique_series_ids = unique_series_ids[has_loc]
date_idx = 0
for i in range(0, num_days, 5):
    x = station_locs
    y = times_series_vals[i:120:28, :]
    array_functions.plot_heatmap(x, y.T, title=None, sizes=30)

data = (times_series_vals, unique_series_ids)
helper_functions.save_object('processed_data.pkl', data)

pass
Example 34
    d = datetime.date(year, month, day)
    return d

feat_names, data = create_data_set.load_csv(
    file_name,
    True,
    dtype='str',
    delim=',',
    #num_rows=40000
    num_rows=100000000000
)
y_names = [s + ' Mean' for s in ['NO2', 'O3', 'SO2', 'CO', ]]
y_inds = []
for name in y_names:
    y_inds.append(array_functions.find_first_element(feat_names, name))
to_keep = array_functions.false(data.shape[0])
date_strs = data[:, find_first_element(feat_names, 'Date Local')]
prev = ''
date_str_to_idx = dict()
date_ids = np.zeros(data.shape[0])
for i, date_str in enumerate(date_strs):
    date_obj = to_date(date_str)
    date_str_to_idx[date_str] = date_obj.toordinal()
    date_ids[i] = date_obj.toordinal()
    if prev != date_str:
        to_keep[i] = True
        prev = date_str
data = data[to_keep, :]
date_strs = date_strs[to_keep]
date_ids = date_ids.astype(np.int)
date_ids = date_ids[to_keep]