Exemplo n.º 1
0
 def get_transfer_inds(self, labels_or_ids):
     """Return the selector of instances matching ``labels_or_ids``.

     ``None`` selects everything (``array_functions.true(self.n)``).
     Otherwise the values are looked up in ``data_set_ids`` when
     ``is_regression`` is set and in ``true_y`` when it is not.
     """
     if labels_or_ids is None:
         return array_functions.true(self.n)
     # Only the attribute that is actually needed gets evaluated.
     lookup = self.data_set_ids if self.is_regression else self.true_y
     return array_functions.find_set(lookup, labels_or_ids)
    def train_and_test(self, data):
        """Optionally train the source learner, then train/test on the target subset.

        The source learner is trained on all of ``data`` when ``use_stacking``
        is set, otherwise on the result of ``get_source_data``. The parent
        class is then run on the prepared data restricted to the configured
        target labels and to instances flagged ``is_target``.
        """
        source_data = self.get_source_data(data)

        # The source learner is probably fully labeled, so make sure it does
        # not tune its parameters on a validation set.
        self.source_learner.configs.use_validation = False

        # Debug switch: set to True to relabel the data and plot an MDS view.
        viz_mds = False
        if viz_mds:
            src_labels = self.configs.source_labels
            tgt_labels = self.configs.target_labels
            data.data_set_ids[:] = 0
            for row, set_id in ((0, 1), (1, 2)):
                in_row = array_functions.find_set(data.y, src_labels[row, :])
                data.data_set_ids[in_row] = set_id
            data.change_labels(src_labels, tgt_labels)
            array_functions.plot_MDS(data.x, data.true_y, data.data_set_ids)

        if self.train_source_learner:
            training_input = data if self.use_stacking else source_data
            self.source_learner.train_and_test(training_input)

        prepared = self._prepare_data(data, include_unlabeled=True)
        transfer_subset = prepared.get_transfer_subset(
            self.configs.target_labels, include_unlabeled=True)
        target_only = transfer_subset.get_subset(transfer_subset.is_target)
        return super(HypothesisTransfer, self).train_and_test(target_only)
Exemplo n.º 3
0
    def train_and_test(self, data):
        """Fit an offset on the source domains, apply it to the first target
        domain, then train/test the parent method on the second target domain.

        Steps, in order:
          1. Train an ``OffsetTransfer`` on the data whose ids are in
             ``configs.source_domain_order``, using their true labels.
          2. Reveal the labels of the first target domain, overwrite them with
             the offset model's predictions and train ``self.target_learner``
             (a ``NadarayaWatsonMethod``) on the result.
          3. Hand the data of the second target domain to the parent class.
        """
        cfg = self.configs
        src_order = cfg.source_domain_order
        tgt_order = cfg.target_domain_order

        # --- 1. offset model learned on the source domains -----------------
        in_source = array_functions.find_set(data.data_set_ids, src_order)
        source_data = data.get_subset(in_source)
        source_data.y = source_data.true_y

        source_configs = deepcopy(cfg)
        source_configs.labels_to_keep = src_order
        source_configs.labels_to_not_sample = np.asarray([src_order[0]])
        source_configs.source_labels = np.asarray([src_order[0]])
        source_configs.target_labels = np.asarray([src_order[1]])

        offset_model = local_transfer_methods.OffsetTransfer(source_configs)
        offset_model.use_validation = True
        offset_model.train_and_test(source_data)

        # --- 2. target learner trained on offset-predicted labels ----------
        first_target = tgt_order[0]
        in_target = array_functions.find_set(data.data_set_ids, [first_target])
        target_data = data.get_subset(in_target)
        target_data.reveal_labels(target_data.data_set_ids == first_target)

        target_configs = deepcopy(cfg)
        target_configs.labels_to_keep = np.asarray([first_target])
        target_configs.source_labels = np.asarray([])
        target_configs.target_labels = np.asarray([first_target])

        # The same prediction array becomes both the observed and true labels.
        shifted_y = offset_model.predict(target_data).y
        target_data.y = shifted_y
        target_data.true_y = shifted_y

        self.target_learner = method.NadarayaWatsonMethod(target_configs)
        self.target_learner.use_validation = True
        self.target_learner.train_and_test(target_data)

        # --- 3. evaluate on the second target domain -----------------------
        held_out = data.get_subset(data.data_set_ids == tgt_order[1])
        return super(DomainModelShiftMethod, self).train_and_test(held_out)
Exemplo n.º 4
0
 def get_transfer_subset(self, labels_or_ids, include_unlabeled=False):
     """Return the subset of this data selected by ``labels_or_ids``.

     When ``is_regression`` is set, instances are matched on
     ``data_set_ids`` and unlabeled ones are dropped unless
     ``include_unlabeled`` is true. Otherwise instances are matched on
     ``y`` and unlabeled ones are added when ``include_unlabeled`` is true.
     ``labels_or_ids`` must be non-empty.
     """
     assert len(labels_or_ids) > 0
     if self.is_regression:
         mask = array_functions.find_set(self.data_set_ids, labels_or_ids)
         if not include_unlabeled:
             mask = mask & self.is_labeled
         return self.get_subset(mask)
     mask = array_functions.find_set(self.y, labels_or_ids)
     if include_unlabeled:
         mask = mask | ~self.is_labeled
     return self.get_subset(mask)
Exemplo n.º 5
0
 def get_transfer_subset(self, labels_or_ids, include_unlabeled=False):
     """Return the subset of this data selected by ``labels_or_ids``.

     Args:
         labels_or_ids: A single label/id or an indexable collection of
             them. A value that cannot be indexed with ``[0]`` is wrapped
             into a one-element array.
         include_unlabeled: When ``is_regression`` is set, unlabeled
             instances are dropped unless this is true. Otherwise
             unlabeled instances are added when this is true.

     Returns:
         Whatever ``self.get_subset`` returns for the computed selector.
     """
     try:
         labels_or_ids[0]
     except (TypeError, IndexError):
         # Plain Python scalars raise TypeError when indexed; NumPy scalars
         # and 0-d arrays raise IndexError. Only those cases are wrapped, so
         # unrelated errors (and KeyboardInterrupt) are no longer swallowed
         # as they were by the former bare ``except``.
         labels_or_ids = np.asarray([labels_or_ids])
     assert len(labels_or_ids) > 0
     if self.is_regression:
         # Regression data is partitioned by data set id.
         inds = array_functions.find_set(self.data_set_ids, labels_or_ids)
         if not include_unlabeled:
             inds = inds & self.is_labeled
     else:
         # Otherwise instances are matched on the observed labels.
         inds = array_functions.find_set(self.y, labels_or_ids)
         if include_unlabeled:
             inds = inds | ~self.is_labeled
     return self.get_subset(inds)
Exemplo n.º 6
0
def test_mnist():
    """Run ``test_methods`` on a two-class problem built from four MNIST digits.

    ``num_per_class`` instances of each digit in ``classes_to_use`` are drawn
    without replacement for training; every remaining instance of those
    digits is used for testing. Digits are merged pairwise (0 with 4, 8 with
    7) and relabeled 0 and 1.
    """
    num_per_class = 50
    data = helper_functions.load_object('../data_sets/mnist/raw_data.pkl')
    classes_to_use = [0, 4, 8, 7]
    data = data.get_subset(array_functions.find_set(data.y, classes_to_use))

    # One draw per class, in class order, so the random stream is consumed
    # exactly as before.
    sampled = [
        np.random.choice((data.y == c).nonzero()[0],
                         size=num_per_class,
                         replace=False)
        for c in classes_to_use
    ]
    to_keep = np.concatenate(sampled)

    data.change_labels([classes_to_use[1], classes_to_use[3]],
                       [classes_to_use[0], classes_to_use[2]])
    data.change_labels([classes_to_use[0], classes_to_use[2]], [0, 1])

    # ``to_keep`` holds integer positions, so ``~to_keep`` would produce the
    # negative indices -(i + 1) instead of the complement. Build a boolean
    # mask and invert that to get the true held-out set.
    is_train = np.zeros(data.y.shape[0], dtype=bool)
    is_train[to_keep] = True
    data_test = data.get_subset(~is_train)
    data = data.get_subset(to_keep)

    label_names = [
        str(classes_to_use[0]) + '+' + str(classes_to_use[1]),
        str(classes_to_use[2]) + '+' + str(classes_to_use[3]),
    ]

    #data = add_label_noise_cluster(data, num_neighbors=20)
    #data = add_label_noise(data, 20)
    test_methods(data.x, data.y, data_test.x, data_test.y, label_names, mnist=True)
Exemplo n.º 7
0
def create_stations(file):
    """Load station names and coordinates from a CSV file.

    Args:
        file: Path passed on to ``create_data_set.load_csv``.

    Returns:
        ``(names, locs)``: the ``station_id`` column as loaded (string
        dtype) and the ``long``/``lat`` columns converted to float.
    """
    feat_names, data = create_data_set.load_csv(file,
                                                True,
                                                dtype='str',
                                                delim=',',
                                                num_rows=1000000000)
    names = data[:,
                 array_functions.find_first_element(feat_names, 'station_id')]
    locs = data[:, array_functions.find_set(feat_names, ['long', 'lat'])]
    # ``np.float`` was a plain alias of the builtin ``float`` and was removed
    # in NumPy 1.24; the builtin works on every NumPy version.
    return names, locs.astype(float)
Exemplo n.º 8
0
def create_stations(file):
    """Load station names and coordinates from a CSV file.

    Args:
        file: Path passed on to ``create_data_set.load_csv``.

    Returns:
        ``(names, locs)``: the ``station_id`` column as loaded (string
        dtype) and the ``long``/``lat`` columns converted to float.
    """
    feat_names, data = create_data_set.load_csv(
        file,
        True,
        dtype='str',
        delim=',',
        num_rows=1000000000
    )
    names = data[:, array_functions.find_first_element(feat_names, 'station_id')]
    locs = data[:, array_functions.find_set(feat_names, ['long', 'lat'])]
    # ``np.float`` was a plain alias of the builtin ``float`` and was removed
    # in NumPy 1.24; the builtin works on every NumPy version.
    return names, locs.astype(float)
Exemplo n.º 9
0
 def get_split(self, i, num_labeled=None):
     """Return a deep copy of ``self.data`` with split ``i`` applied and
     labels hidden so that only ``num_labeled`` training labels remain.

     Args:
         i: Index into ``self.splits``.
         num_labeled: ``None`` leaves the labels untouched. Otherwise the
             number of labeled training instances to keep (per target
             label for regression data, per class otherwise). A non-finite
             value skips the clearing step.

     Returns:
         The modified copy; ``self.data`` itself is not changed.
     """
     # Default the flag when the attribute is missing from the instance
     # (presumably for objects created before it existed -- TODO confirm).
     if 'use_data_set_ids' not in self.__dict__:
         self.use_data_set_ids = True
     d = copy.deepcopy(self.data)
     split = self.splits[i]
     d.apply_split(split)
     if self.labels_to_keep is not None:
         #d = d.get_with_labels(self.labels_to_keep)
         d = d.get_transfer_subset(self.labels_to_keep)
     to_keep = None
     # Instances of the protected data sets are hidden (NaN) while sampling
     # and get their true labels back just before returning.
     if hasattr(self, 'data_set_ids_to_keep'
                ) and self.data_set_ids_to_keep is not None:
         to_keep = array_functions.find_set(d.data_set_ids,
                                            self.data_set_ids_to_keep)
         d.y[to_keep] = np.nan
     if num_labeled is not None:
         if d.is_regression:
             target_labels = self.target_labels
             if target_labels is None:
                 target_labels = [0]
             for label in target_labels:
                 # Default candidates: every labeled training instance.
                 labeled_inds = np.nonzero(d.is_train & d.is_labeled)[0]
                 if self.use_data_set_ids and \
                                 d.data_set_ids is not None and \
                                 self.target_labels is not None:
                     #labeled_inds = np.nonzero(d.is_train & (d.data_set_ids == self.target_labels))[0]
                     labeled_inds = np.nonzero(d.is_train &
                                               (d.data_set_ids == label))[0]
                 if np.isfinite(num_labeled):
                     # Keep the first num_labeled candidates; clear the rest
                     # and every test label.
                     to_clear = labeled_inds[num_labeled:]
                     d.y[to_clear] = np.nan
                     d.y[d.is_test] = np.nan
         else:
             # Cast to float so that NaN can mark a hidden label.
             d.y = d.y.astype('float32')
             d.true_y = d.true_y.astype('float32')
             classes = d.classes
             for c in classes:
                 if c in self.labels_to_not_sample:
                     continue
                 class_inds_train = (d.y == c) & d.is_train
                 if to_keep is not None:
                     class_inds_train[to_keep] = False
                 class_inds_train = np.nonzero(class_inds_train)[0]
                 if np.isfinite(num_labeled):
                     assert len(class_inds_train) >= num_labeled
                     d.y[class_inds_train[num_labeled:]] = np.nan
                 # Non-training instances of this class are always hidden.
                 class_inds_test = (d.y == c) & ~d.is_train
                 if to_keep is not None:
                     class_inds_test[to_keep] = False
                 class_inds_test = np.nonzero(class_inds_test)[0]
                 d.y[class_inds_test] = np.nan
     if to_keep is not None:
         d.y[to_keep] = d.true_y[to_keep]
     return d
Exemplo n.º 10
0
def create_forest_fires():
    """Load the forest-fires CSV, keep months 6-8 and visualize the features.

    Month and day names are converted to integers while loading. The last
    column is used as the target, the ``month`` column as the domain id, and
    only rows with a target strictly between 0 and 700 are kept before
    calling ``viz_features``.
    """
    month_names = ['jan', 'feb', 'mar', 'apr', 'may', 'jun',
                   'jul', 'aug', 'sep', 'oct', 'nov', 'dec']
    day_names = ['sun', 'mon', 'tue', 'wed', 'thu', 'fri', 'sat']
    months = {name: number for number, name in enumerate(month_names, 1)}
    days = {name: number for number, name in enumerate(day_names, 1)}

    def month_to_season(x):
        # Despite its name this returns the month number, not a season.
        #month_to_season = lambda x : (months[x]-1)/3
        return months[x]

    def day_to_int(x):
        return days[x]

    csv_path = 'forest_fires/forestfires.csv'
    field_names, forest_data = load_csv(
        csv_path,
        dtype='float',
        converters={2: month_to_season, 3: day_to_int})

    is_month_col = field_names == 'month'
    domain_ids = forest_data[:, is_month_col]
    months_to_use = np.asarray([6, 7, 8])
    #months_to_use = np.asarray([1,2,3,4,5,6,7,8,9,10,11,12])
    to_use = array_functions.find_set(domain_ids, months_to_use)

    # Restrict to the selected months, then drop the first four columns.
    x = forest_data[to_use, :][:, 4:]
    y = forest_data[:, -1][to_use]
    domain_ids = domain_ids[to_use]
    field_names = field_names[4:]

    in_range = (y > 0) & (y < 700)
    x = x[in_range, :]
    y = y[in_range]
    domain_ids = domain_ids[in_range]

    from methods import method
    learner = method.NadarayaWatsonMethod()
    viz_features(x, y, domain_ids, field_names, learner=learner)
Exemplo n.º 11
0
    def get_split(self, i, num_labeled=None):
        """Return a deep copy of ``self.data`` with split ``i`` applied and
        labels hidden so that only ``num_labeled`` training labels remain.

        Args:
            i: Index into ``self.splits``.
            num_labeled: ``None`` leaves the labels untouched. Otherwise the
                number of labeled training instances to keep (in total for
                regression data, per class otherwise). It is used directly
                as a slice bound.

        Returns:
            The modified copy; ``self.data`` itself is not changed.
        """
        # Default the flag when the attribute is missing from the instance
        # (presumably for objects created before it existed -- TODO confirm).
        if 'use_data_set_ids' not in self.__dict__:
            self.use_data_set_ids = True
        d = copy.deepcopy(self.data)
        split = self.splits[i]
        d.apply_split(split)
        if self.labels_to_keep is not None:
            #d = d.get_with_labels(self.labels_to_keep)
            d = d.get_transfer_subset(self.labels_to_keep)
        to_keep = None
        # Instances of the protected data sets are hidden (NaN) while sampling
        # and get their true labels back just before returning.
        if hasattr(self, 'data_set_ids_to_keep') and self.data_set_ids_to_keep is not None:
            to_keep = array_functions.find_set(d.data_set_ids, self.data_set_ids_to_keep)
            d.y[to_keep] = np.nan
        if num_labeled is not None:
            if d.is_regression:
                # Default candidates: every labeled training instance.
                labeled_inds = np.nonzero(d.is_train & d.is_labeled)[0]
                if self.use_data_set_ids and \
                                d.data_set_ids is not None and \
                                self.target_labels is not None:
                    labeled_inds = np.nonzero(d.is_train & (d.data_set_ids == self.target_labels))[0]
                # Keep the first num_labeled candidates; clear the rest and
                # every test label.
                to_clear = labeled_inds[num_labeled:]
                d.y[to_clear] = np.nan
                d.y[d.is_test] = np.nan
            else:
                # Cast to float so that NaN can mark a hidden label.
                d.y = d.y.astype('float32')
                d.true_y = d.true_y.astype('float32')
                classes = d.classes
                for c in classes:
                    if c in self.labels_to_not_sample:
                        continue
                    class_inds_train = (d.y==c) & d.is_train
                    if to_keep is not None:
                        class_inds_train[to_keep] = False
                    class_inds_train = np.nonzero(class_inds_train)[0]
                    assert len(class_inds_train) >= num_labeled

                    d.y[class_inds_train[num_labeled:]] = np.nan
                    # Non-training instances of this class are always hidden.
                    class_inds_test = (d.y==c) & ~d.is_train
                    if to_keep is not None:
                        class_inds_test[to_keep] = False
                    class_inds_test = np.nonzero(class_inds_test)[0]
                    d.y[class_inds_test] = np.nan
        if to_keep is not None:
            d.y[to_keep] = d.true_y[to_keep]
        return d
Exemplo n.º 12
0
def create_forest_fires():
    """Load the forest-fires CSV, keep months 6-8 and visualize the features.

    Month and day names are converted to integers while loading. The last
    column is used as the target, the ``month`` column as the domain id, and
    only rows with a target strictly between 0 and 700 are kept before
    calling ``viz_features``.
    """
    month_names = ["jan", "feb", "mar", "apr", "may", "jun",
                   "jul", "aug", "sep", "oct", "nov", "dec"]
    day_names = ["sun", "mon", "tue", "wed", "thu", "fri", "sat"]
    months = {name: number for number, name in enumerate(month_names, 1)}
    days = {name: number for number, name in enumerate(day_names, 1)}

    def month_to_season(x):
        # Despite its name this returns the month number, not a season.
        # month_to_season = lambda x : (months[x]-1)/3
        return months[x]

    def day_to_int(x):
        return days[x]

    csv_path = "forest_fires/forestfires.csv"
    field_names, forest_data = load_csv(
        csv_path,
        dtype="float",
        converters={2: month_to_season, 3: day_to_int},
    )

    is_month_col = field_names == "month"
    domain_ids = forest_data[:, is_month_col]
    months_to_use = np.asarray([6, 7, 8])
    # months_to_use = np.asarray([1,2,3,4,5,6,7,8,9,10,11,12])
    to_use = array_functions.find_set(domain_ids, months_to_use)

    # Restrict to the selected months, then drop the first four columns.
    x = forest_data[to_use, :][:, 4:]
    y = forest_data[:, -1][to_use]
    domain_ids = domain_ids[to_use]
    field_names = field_names[4:]

    in_range = (y > 0) & (y < 700)
    x = x[in_range, :]
    y = y[in_range]
    domain_ids = domain_ids[in_range]

    from methods import method

    learner = method.NadarayaWatsonMethod()
    viz_features(x, y, domain_ids, field_names, learner=learner)
    def train_and_test(self, data):
        """Train the source learner on the source subset, then run the parent
        method on the data restricted to ``configs.labels_to_keep``.

        Only regression data is supported (asserted on entry). The source
        subset is marked as target/train, fully revealed, and its data set
        ids are overwritten with the first target label before training.
        """
        assert data.is_regression
        # Both branches of the former is_regression test made this same call.
        source_data = data.get_transfer_subset(
            self.configs.source_labels.ravel(), include_unlabeled=False)
        source_data.set_target()
        source_data.set_train()
        source_data.reveal_labels(~source_data.is_labeled)
        if source_data.is_regression:
            source_data.data_set_ids[:] = self.configs.target_labels[0]
        if not data.is_regression:
            source_data.change_labels(self.configs.source_labels,
                                      self.configs.target_labels)
            source_data = source_data.rand_sample(.1)

        self.source_learner.train_and_test(source_data)

        data_copy = data.get_transfer_subset(self.configs.labels_to_keep,
                                             include_unlabeled=True)
        # Flag the source-domain instances so the parent class can tell them
        # apart from the target ones.
        source_mask = array_functions.find_set(data_copy.data_set_ids,
                                               self.configs.source_labels)
        data_copy.type[source_mask] = data_lib.TYPE_SOURCE
        assert data_copy.is_source.any()
        return super(SMSTransfer, self).train_and_test(data_copy)
Exemplo n.º 14
0
     continue
 source_label = data.classes[source_idx]
 I = (data_copy.true_y == source_label) | (data_copy.true_y
                                           == base_label)
 data_source = data_copy.get_subset(I)
 source_results = learner.train_and_test(data_source)
 transfer_error[source_idx,
                source_idx] += source_results.error_on_test_data
 for target_idx in range(num_classes):
     if target_idx == source_idx or target_idx == base_class_idx:
         continue
     target_label = data.classes[target_idx]
     if use_transfer:
         all_labels = np.asarray(
             [base_label, target_label, source_label])
         I = array_functions.find_set(data_copy.true_y,
                                      np.asarray(all_labels))
         data_target = data_copy.get_subset(I)
         data_base = data_copy.get_subset(
             data_copy.true_y == base_label)
         # Create a new label to duplicate base data
         new_label = data.classes.max() + 1
         data_base.change_labels([base_label], [new_label])
         data_target.combine(data_base)
         data_target.data_set_ids = None
         transfer_learner.configs.source_labels = np.expand_dims(
             np.asarray([source_label, base_label]), 0)
         transfer_learner.configs.target_labels = np.asarray(
             [target_label, new_label])
         transfer_results = transfer_learner.train_and_test(
             data_target).prediction
     else:
Exemplo n.º 15
0
def load_trip_data(file_names,
                   y_names,
                   time_name,
                   loc_names,
                   resolution=np.asarray([20, 20]),
                   plot_data=True):
    """Bin trips from one or more CSV files onto a grid and return log mean
    daily counts per grid cell.

    Args:
        file_names: CSV files to load; all must share the same header.
        y_names: Optional column names converted to float. NOTE: the result
            is overwritten further down, so this only checks that the
            columns exist and are numeric.
        time_name: Name of the column holding the date string of each trip.
        loc_names: Names of the two location columns.
        resolution: Number of grid cells along each location axis.
        plot_data: When true, show a heat map of the result.

    Returns:
        ``(xy_bins, y, names)``: the grid coordinates as a float array, a
        two-column array of ``np.log`` of the mean counts on every seventh
        day of the first 30 days (starting at day offsets 3 and 4), and the
        string form of each grid coordinate. Cells whose mean is 0 yield
        ``-inf``.
    """
    resolution = np.asarray(resolution)
    feat_names = None
    data = None
    for file_name in file_names:
        curr_feat_names, curr_data = load_csv(file_name,
                                              True,
                                              dtype='str',
                                              delim=',',
                                              num_rows=1000000000)
        if feat_names is None:
            feat_names = curr_feat_names
            data = curr_data
            continue
        assert (feat_names == curr_feat_names).all()
        data = np.vstack((data, curr_data))
    locs = data[:, array_functions.find_set(feat_names, loc_names)]
    if y_names is not None:
        y_inds = array_functions.find_set(feat_names, y_names).nonzero()[0]
        # ``np.float``/``np.int`` were aliases of the builtins and were
        # removed in NumPy 1.24, so the builtins are used throughout.
        y = data[:, y_inds].astype(float)
    else:
        y = np.ones(data.shape[0])

    # Day number (proleptic Gregorian ordinal) of every trip.
    date_strs = data[:, find_first_element(feat_names, time_name)]
    date_ids = np.array([to_date(date_str).toordinal() for date_str in date_strs],
                        dtype=int)

    min_date_id = date_ids.min()
    max_date_id = date_ids.max()
    num_days = max_date_id - min_date_id + 1
    dates_idx = date_ids - min_date_id
    num_locations = np.prod(resolution)
    trip_counts = np.zeros((num_days, num_locations))
    locs = locs.astype(float)

    # Keep only trips whose two coordinates both pass the percentile filter.
    p_min = .3
    p_max = .7
    is_in_range = array_functions.is_in_percentile(
        locs[:, 0], p_min, p_max) & array_functions.is_in_percentile(
            locs[:, 1], p_min, p_max)
    locs = locs[is_in_range, :]
    dates_idx = dates_idx[is_in_range]
    x_bins = quantize_loc(locs[:, 0], resolution[0])
    y_bins = quantize_loc(locs[:, 1], resolution[1])
    #array_functions.plot_2d(locs[I,0],locs[I,1])
    xy_bins = list(
        itertools.product(range(resolution[0]), range(resolution[1])))
    for x_idx, y_idx in xy_bins:
        is_in_cell = (x_bins == x_idx) & (y_bins == y_idx)
        trips_in_cell = dates_idx[is_in_cell]
        trip_dates, trips_per_date = np.unique(trips_in_cell,
                                               return_counts=True)
        bin_idx = bin_to_idx([x_idx, y_idx], resolution)
        trip_counts[trip_dates, bin_idx] = trips_per_date

    # Average over every seventh day within the first 30 days.
    y1 = trip_counts[3:30:7, :].mean(0)
    y2 = trip_counts[4:30:7, :].mean(0)
    y = np.stack((y1, y2), 1)

    y = np.log(y)
    if plot_data:
        array_functions.plot_heatmap(np.asarray(xy_bins), y, sizes=50)
    return np.asarray(xy_bins, dtype=float), y, np.asarray(
        [str(xy) for xy in xy_bins])
Exemplo n.º 16
0
 def get_with_labels_and_unlabeled(self, labels):
     """Return the subset whose ``true_y`` is in ``labels`` plus every
     instance that is not labeled."""
     has_wanted_label = array_functions.find_set(self.true_y, labels)
     is_unlabeled = ~self.is_labeled
     return self.get_subset(has_wanted_label | is_unlabeled)
Exemplo n.º 17
0
 def get_data_set_ids(self, data_set_ids):
     """Return the subset of instances whose data set id is in ``data_set_ids``."""
     return self.get_subset(
         array_functions.find_set(self.data_set_ids, data_set_ids))
Exemplo n.º 18
0
    )
    names = data[:, array_functions.find_first_element(feat_names, 'station_id')]
    locs = data[:, array_functions.find_set(feat_names, ['long', 'lat'])]
    return names, locs.astype(np.float)

# Station metadata: names and float coordinates.
station_names, station_locs = create_stations(station_file_name)

# Trip records, loaded as strings and converted column by column below.
feat_names, data = create_data_set.load_csv(
    file_name,
    True,
    dtype='str',
    delim=',',
    num_rows=1000000000
)
y_names = ['tripduration']
y_inds = array_functions.find_set(feat_names, y_names).nonzero()[0]
date_strs = data[:, find_first_element(feat_names, 'starttime')]
# Map every start-time string to the ordinal day number of its date.
date_str_to_idx = dict()
date_ids = np.zeros(data.shape[0])
for i, date_str in enumerate(date_strs):
    date_obj = to_date(date_str)
    date_str_to_idx[date_str] = date_obj.toordinal()
    date_ids[i] = date_obj.toordinal()
# ``np.int``/``np.float``/``np.str`` were aliases of the builtins and were
# removed in NumPy 1.24; the builtins behave identically.
date_ids = date_ids.astype(int)
y = data[:, y_inds].astype(float)

#y_sub = y[I, :]

#series_id = data[:, find_first_element(feat_names, 'Site Num')].astype(int)
a1 = data[:, find_first_element(feat_names, 'from_station_id')].astype(str)
a2 = data[:, find_first_element(feat_names, 'to_station_id')].astype(str)
Exemplo n.º 19
0
 def get_data_set_ids(self, data_set_ids):
     """Return the subset of instances whose data set id is in ``data_set_ids``."""
     return self.get_subset(
         array_functions.find_set(self.data_set_ids, data_set_ids))
Exemplo n.º 20
0
                                                num_rows=1000000000)
    names = data[:,
                 array_functions.find_first_element(feat_names, 'station_id')]
    locs = data[:, array_functions.find_set(feat_names, ['long', 'lat'])]
    return names, locs.astype(np.float)


# Station metadata: names and float coordinates.
station_names, station_locs = create_stations(station_file_name)

# Trip records, loaded as strings and converted column by column below.
feat_names, data = create_data_set.load_csv(file_name,
                                            True,
                                            dtype='str',
                                            delim=',',
                                            num_rows=1000000000)
y_names = ['tripduration']
y_inds = array_functions.find_set(feat_names, y_names).nonzero()[0]
date_strs = data[:, find_first_element(feat_names, 'starttime')]
# Map every start-time string to the ordinal day number of its date.
date_str_to_idx = dict()
date_ids = np.zeros(data.shape[0])
for i, date_str in enumerate(date_strs):
    date_obj = to_date(date_str)
    date_str_to_idx[date_str] = date_obj.toordinal()
    date_ids[i] = date_obj.toordinal()
# ``np.int``/``np.float``/``np.str`` were aliases of the builtins and were
# removed in NumPy 1.24; the builtins behave identically.
date_ids = date_ids.astype(int)
y = data[:, y_inds].astype(float)

#y_sub = y[I, :]

#series_id = data[:, find_first_element(feat_names, 'Site Num')].astype(int)
a1 = data[:, find_first_element(feat_names, 'from_station_id')].astype(str)
a2 = data[:, find_first_element(feat_names, 'to_station_id')].astype(str)
Exemplo n.º 21
0
def selection_subset_ids(data, ids):
    """Select the instances of ``data`` whose data set id is one of ``ids``."""
    selector = array_functions.find_set(data.data_set_ids, ids)
    return selector
Exemplo n.º 22
0
def select_classes(data, classes):
    """Select the instances of ``data`` whose true label is one of ``classes``."""
    selector = array_functions.find_set(data.true_y, classes)
    return selector
Exemplo n.º 23
0
 def has_label(self, labels):
     """Return the ``find_set`` selector of instances whose observed label
     ``y`` is one of ``labels``."""
     observed = self.y
     return array_functions.find_set(observed, labels)
Exemplo n.º 24
0
 def get_with_labels(self, labels):
     """Return the subset of instances whose ``true_y`` is one of ``labels``."""
     return self.get_subset(array_functions.find_set(self.true_y, labels))
Exemplo n.º 25
0
 def has_label(self, labels):
     """Return the ``find_set`` selector of instances whose observed label
     ``y`` is one of ``labels``."""
     observed = self.y
     return array_functions.find_set(observed, labels)
Exemplo n.º 26
0
 def get_with_labels_and_unlabeled(self, labels):
     """Return the subset whose ``true_y`` is in ``labels`` plus every
     instance that is not labeled."""
     has_wanted_label = array_functions.find_set(self.true_y, labels)
     is_unlabeled = ~self.is_labeled
     return self.get_subset(has_wanted_label | is_unlabeled)
Exemplo n.º 27
0
 def get_with_labels(self, labels):
     """Return the subset of instances whose ``true_y`` is one of ``labels``."""
     return self.get_subset(array_functions.find_set(self.true_y, labels))
Exemplo n.º 28
0
def load_trip_data(file_names, y_names, time_name, loc_names, resolution=np.asarray([20, 20]), plot_data=True):
    """Bin trips from one or more CSV files onto a grid and return log mean
    daily counts per grid cell.

    Args:
        file_names: CSV files to load; all must share the same header.
        y_names: Optional column names converted to float. NOTE: the result
            is overwritten further down, so this only checks that the
            columns exist and are numeric.
        time_name: Name of the column holding the date string of each trip.
        loc_names: Names of the two location columns.
        resolution: Number of grid cells along each location axis.
        plot_data: When true, show a heat map of the result.

    Returns:
        ``(xy_bins, y, names)``: the grid coordinates as a float array, a
        two-column array of ``np.log`` of the mean counts on every seventh
        day of the first 30 days (starting at day offsets 0 and 4), and the
        string form of each grid coordinate. Cells whose mean is 0 yield
        ``-inf``.
    """
    resolution = np.asarray(resolution)
    feat_names = None
    data = None
    for file_name in file_names:
        curr_feat_names, curr_data = load_csv(file_name, True, dtype="str", delim=",", num_rows=1000000000)
        if feat_names is None:
            feat_names = curr_feat_names
            data = curr_data
            continue
        assert (feat_names == curr_feat_names).all()
        data = np.vstack((data, curr_data))
    locs = data[:, array_functions.find_set(feat_names, loc_names)]
    if y_names is not None:
        y_inds = array_functions.find_set(feat_names, y_names).nonzero()[0]
        # ``np.float``/``np.int`` were aliases of the builtins and were
        # removed in NumPy 1.24, so the builtins are used throughout.
        y = data[:, y_inds].astype(float)
    else:
        y = np.ones(data.shape[0])

    # Day number (proleptic Gregorian ordinal) of every trip.
    date_strs = data[:, find_first_element(feat_names, time_name)]
    date_ids = np.array([to_date(date_str).toordinal() for date_str in date_strs], dtype=int)

    min_date_id = date_ids.min()
    max_date_id = date_ids.max()
    num_days = max_date_id - min_date_id + 1
    dates_idx = date_ids - min_date_id
    num_locations = np.prod(resolution)
    trip_counts = np.zeros((num_days, num_locations))
    locs = locs.astype(float)

    # Keep only trips whose two coordinates both pass the percentile filter.
    p_min = 0.3
    p_max = 0.7
    is_in_range = array_functions.is_in_percentile(locs[:, 0], p_min, p_max) & array_functions.is_in_percentile(
        locs[:, 1], p_min, p_max
    )
    locs = locs[is_in_range, :]
    dates_idx = dates_idx[is_in_range]
    x_bins = quantize_loc(locs[:, 0], resolution[0])
    y_bins = quantize_loc(locs[:, 1], resolution[1])
    # array_functions.plot_2d(locs[I,0],locs[I,1])
    xy_bins = list(itertools.product(range(resolution[0]), range(resolution[1])))
    for x_idx, y_idx in xy_bins:
        is_in_cell = (x_bins == x_idx) & (y_bins == y_idx)
        trips_in_cell = dates_idx[is_in_cell]
        trip_dates, trips_per_date = np.unique(trips_in_cell, return_counts=True)
        bin_idx = bin_to_idx([x_idx, y_idx], resolution)
        trip_counts[trip_dates, bin_idx] = trips_per_date

    # Average over every seventh day within the first 30 days.
    y1 = trip_counts[:30:7, :].mean(0)
    y2 = trip_counts[4:30:7, :].mean(0)
    y = np.stack((y1, y2), 1)

    y = np.log(y)
    if plot_data:
        array_functions.plot_heatmap(np.asarray(xy_bins), y, sizes=50)
    return np.asarray(xy_bins, dtype=float), y, np.asarray([str(xy) for xy in xy_bins])
Exemplo n.º 29
0
 def get_transfer_inds(self, labels_or_ids):
     """Return the selector of instances matching ``labels_or_ids``.

     The values are looked up in ``data_set_ids`` when ``is_regression`` is
     set and in ``true_y`` when it is not.
     """
     # Only the attribute that is actually needed gets evaluated.
     lookup = self.data_set_ids if self.is_regression else self.true_y
     return array_functions.find_set(lookup, labels_or_ids)
Exemplo n.º 30
0
file_name = 'kc_house_data.csv'
save_data = True
sampled_size = 1000

# Everything is loaded as strings and converted per column below.
feat_names, data = create_data_set.load_csv(file_name,
                                            True,
                                            dtype='str',
                                            delim=',')
y_name = 'price'
y_ind = array_functions.find_first_element(feat_names, y_name)
# ``np.float`` was an alias of the builtin ``float`` and was removed in
# NumPy 1.24; the builtin behaves identically.
y = data[:, y_ind].astype(float)
# Rescale the price by a factor of 100000.
y /= 100000
suffix = ''
if create_geospatial_data:
    x_feats = ['long', 'lat']
    x_feat_inds = array_functions.find_set(feat_names, x_feats)
    x = data[:, x_feat_inds]
    x = array_functions.remove_quotes(x)
    x = x.astype(float)

    x[:, 0] = array_functions.normalize(x[:, 0])
    x[:, 1] = array_functions.normalize(x[:, 1])
    # Keep only rows whose two coordinates both pass the percentile filter.
    I = array_functions.is_in_percentile(x[:, 0], .01, .99)
    I &= array_functions.is_in_percentile(x[:, 1], .01, .99)
    x = x[I, :]
    y = y[I]
    data = data[I, :]

    if split_date:
        dates = array_functions.remove_quotes(data[:, feat_names == 'date'])
        date_objs = []
Exemplo n.º 31
0
import numpy as np
from data_sets import create_data_set
from utility import array_functions
from utility import helper_functions

file_name = 'kc_house_data.csv'

# Everything is loaded as strings and converted per column below.
feat_names, data = create_data_set.load_csv(file_name, True, dtype='str', delim=',')
feats_to_clear = ['id', 'date', 'yr_renovated', 'zipcode', 'lat', 'long']
y_name = 'price'
y_ind = array_functions.find_first_element(feat_names, y_name)
# ``np.float`` was an alias of the builtin ``float`` and was removed in
# NumPy 1.24; the builtin behaves identically.
y = data[:, y_ind].astype(float)
# Rescale the price by a factor of 100000.
y /= 100000
# Drop the listed columns and the target itself from the feature matrix.
clear_idx = array_functions.find_set(feat_names, feats_to_clear + [y_name])
x = data[:, ~clear_idx]
x = array_functions.remove_quotes(x)
x = x.astype(float)

data = (x,y)
helper_functions.save_object('processed_data.pkl', data)