Example #1
    def __init__(self, config):
        super(ProstateDistDvhDataLoader, self).__init__(config)

        # (self.X_train, self.y_train), (self.X_test, self.y_test) = mnist.load_data()
        #print "datapack_" + config.data_loader.organ + "_" + config.data_loader.x_name + "_2018.h5" 
        filename = get_datapack_filename(str(config.data_loader.organ), str(config.data_loader.x_name))
        self.X = load_dataset(config.data_loader.h5py_dir, filename, config.data_loader.x_groupname)
        self.y = load_dataset(config.data_loader.h5py_dir, filename, config.data_loader.y_groupname)
        config.data_loader.input_shape = self.X.shape[1:]
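
The helpers called above are not shown on this page. Below is a minimal sketch of what the HDF5-backed get_datapack_filename and load_dataset might look like, inferred from the call sites and the commented-out filename pattern; it is an illustrative assumption, not the project's actual code.

import os
import h5py
import numpy as np

def get_datapack_filename(organ, x_name):
    # Mirrors the commented-out pattern above: "datapack_<organ>_<x_name>_2018.h5"
    return "datapack_{}_{}_2018.h5".format(organ, x_name)

def load_dataset(h5py_dir, filename, groupname):
    # Read one group from the HDF5 datapack into memory as a NumPy array.
    with h5py.File(os.path.join(h5py_dir, filename), "r") as f:
        return np.asarray(f[groupname])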
Example #2
    def _init_dataloaders(self):
        self.train_ds = load_dataset(self.config['data_dir'], self.config.get('dataset_name', 'MNIST'), train=True)
        self.test_ds = load_dataset(self.config['data_dir'], self.config.get('dataset_name', 'MNIST'), train=False)
        
        num_good_points = self.config.get('num_good_points', len(self.train_ds))
        num_bad_points = self.config.get('num_bad_points', 0)

        self.train_ds = Subset(self.train_ds, range(num_good_points + num_bad_points))
        self.train_ds = spoil_dataset(self.train_ds, num_good_points, num_bad_points)
        
        self.train_dataloader = build_dataloader(self.train_ds, self.config['batch_size'], sequential=True)
        self.test_dataloader = build_dataloader(self.test_ds, self.config['batch_size'], sequential=True)
Example #3
def load_data_h5(train_file_paths=None, test_file_paths=None, remap_config='Neo', orientation=preprocessor.ORIENTATION['coronal']):
    # Data splitting
    print("START")
    #if train_volumes and test_volumes:
        #train_file_paths = du.load_file_paths(data_dir, label_dir, train_volumes)
        #test_file_paths = du.load_file_paths(data_dir, label_dir, test_volumes)
    #else:
        #raise ValueError('You must provide a train, train dataset list')

    if train_file_paths:
        print("Train dataset size: %d" % (len(train_file_paths)))
        # loading and pre-processing train data
        print("===Train data===")
        data_train, label_train, class_weights_train, weights_train, _ = du.load_dataset(train_file_paths,
                                                                                        orientation,
                                                                                        remap_config=remap_config,
                                                                                        return_weights=True,
                                                                                        reduce_slices=True,
                                                                                        remove_black=True)
        no_slices, H, W = data_train[0].shape
        data_train = np.concatenate(data_train).reshape((-1, H, W))
        label_train = np.concatenate(label_train).reshape((-1, H, W))
        class_weights_train = np.concatenate(class_weights_train).reshape((-1, H, W))
        
        print("END")  
  
        return (ImdbData(data_train, label_train, class_weights_train, transforms=transform_train))

    if test_file_paths:
        #_write_h5(data_train, label_train, class_weights_train, weights_train, f, mode='train')
        print("Test dataset size: %d" % (len(test_file_paths)))
        # loading and pre-processing test data
        print("===Test data===")
        data_test, label_test, class_weights_test, weights_test, _ = du.load_dataset(test_file_paths,
                                                                                    orientation,
                                                                                    remap_config=remap_config,
                                                                                    return_weights=True,
                                                                                    reduce_slices=True,
                                                                                    remove_black=True)
        
        no_slices, H, W = data_test[0].shape
        data_test = np.concatenate(data_test).reshape((-1, H, W))
        label_test = np.concatenate(label_test).reshape((-1, H, W))
        class_weights_test = np.concatenate(class_weights_test).reshape((-1, H, W))
        
        print("END")  
  
        return (ImdbData(data_test, label_test, class_weights_test))
    else:
        raise ValueError('You must provide a train or test dataset list')
Example #4
def main():
    if os.path.exists(Configure.processed_train_path.format('1')):
        return

    train, test = data_utils.load_dataset(op_scope='0')
    print 'train: {}, test: {}'.format(train.shape, test.shape)
    trip_durations = train['trip_duration']
    del train['trip_duration']
    conbined_data = pd.concat([train, test])

    print 'generate geography pca features...'
    generate_pca_features(conbined_data)

    print 'generate datetime features...'
    generate_date_features(conbined_data)

    train = conbined_data.iloc[:train.shape[0], :]
    test = conbined_data.iloc[train.shape[0]:, :]

    print 'generate distance features...'
    generate_distance_features(train, test, loc1='latitude', loc2='longitude', fea_name='lat_long_')

    print 'generate pca distance features...'
    generate_distance_features(train, test, loc1='pca0', loc2='pca1', fea_name='pca_')

    print 'generate location bin features...'
    generate_location_bin_features(train, test, loc1='latitude', loc2='longitude',
                                   fea_name='lat_long_', round_num=2)

    train['trip_duration'] = trip_durations
    print 'train: {}, test: {}'.format(train.shape, test.shape)
    print 'save dataset...'
    data_utils.save_dataset(train, test, op_scope='1')
Example #5
def main():
    train, test = data_utils.load_dataset(op_scope='4')
    print 'train: {}, test: {}'.format(train.shape, test.shape)
    trip_durations = train['trip_duration']
    del train['trip_duration']
    conbined_data = pd.concat([train, test])

    generate_binary_features(conbined_data)

    # for n_clusters in [6**2]:
    #     print 'location clustering n_clusters = {}...'.format(n_clusters)
    #     location_clustering(conbined_data, n_clusters=n_clusters, batch_size=64 ** 3, random_state=1000)
    #
    #     train = conbined_data.iloc[:train.shape[0], :]
    #     test = conbined_data.iloc[train.shape[0]:, :]
    #     train['trip_duration'] = trip_durations
    #
    #     print 'generate lat_long groupby speed features...'
    #     train, test = generate_groupby_speed_features(train, test, n_clusters, loc1='latitude', loc2='longitude',
    #                                                   fea_name='lat_long_')
    #     del train['trip_duration']
    #     print 'train: {}, test: {}'.format(train.shape, test.shape)
    #     conbined_data = pd.concat([train, test])

    train = conbined_data.iloc[:train.shape[0], :]
    test = conbined_data.iloc[train.shape[0]:, :]

    train['trip_duration'] = trip_durations
    print 'train: {}, test: {}'.format(train.shape, test.shape)
    print 'save dataset...'
    data_utils.save_dataset(train, test, op_scope='5')
Example #6
def main():
    if os.path.exists(Configure.processed_train_path.format('2')):
        return

    train, test = data_utils.load_dataset(op_scope='1')
    print 'train: {}, test: {}'.format(train.shape, test.shape)
    print 'data clean according to lat_long_distance_haversine & trip_duration...'
    # train = train[train['lat_long_distance_haversine'] < 300]
    # train = train[train['trip_duration'] <= 1800000].reset_index(drop=True)  # causes overfitting

    print 'train: {}, test: {}'.format(train.shape, test.shape)

    # optimize dtypes
    print('Memory usage, Mb: {:.2f}'.format(train.memory_usage().sum() /
                                            2**20))
    print 'optimize dtypes...'
    train['is_store_and_fwd_flag'] = train['is_store_and_fwd_flag'].astype(
        np.uint8)
    train['passenger_count'] = train['passenger_count'].astype(np.uint8)
    train['vendor_id'] = train['vendor_id'].astype(np.uint8)
    train['pickup_month'] = train['pickup_month'].astype(np.uint8)
    train['pickup_day'] = train['pickup_day'].astype(np.uint8)
    train['pickup_hour'] = train['pickup_hour'].astype(np.uint8)
    train['pickup_weekofyear'] = train['pickup_weekofyear'].astype(np.uint8)
    train['pickup_weekday'] = train['pickup_weekday'].astype(np.uint8)
    train['is_weekend'] = train['is_weekend'].astype(np.uint8)
    train['trip_duration'] = train['trip_duration'].astype(np.uint32)
    print('After optimized memory usage, Mb: {:.2f}'.format(
        train.memory_usage().sum() / 2**20))

    print 'save dataset...'
    data_utils.save_dataset(train, test, op_scope='2')
Example #7
def main():
    if os.path.exists(Configure.processed_train_path.format('3')):
        return

    train, test = data_utils.load_dataset(op_scope='2')
    print 'train: {}, test: {}'.format(train.shape, test.shape)
    trip_durations = train['trip_duration']
    del train['trip_duration']
    conbined_data = pd.concat([train, test])

    n_clusters = 10**2
    print 'location clustering n_clusters = {}...'.format(n_clusters)
    location_clustering(conbined_data,
                        n_clusters=n_clusters,
                        batch_size=64**3,
                        random_state=1000)

    train = conbined_data.iloc[:train.shape[0], :]
    test = conbined_data.iloc[train.shape[0]:, :]
    train['trip_duration'] = trip_durations

    print 'generate lat_long groupby speed features...'
    train, test = generate_groupby_speed_features(train,
                                                  test,
                                                  n_clusters,
                                                  loc1='latitude',
                                                  loc2='longitude',
                                                  fea_name='lat_long_')
    # print 'generate pca groupby speed features...'
    # train, test = generate_groupby_speed_features(train, test, n_clusters, loc1='pca0', loc2='pca1', fea_name='pca_')

    print 'train: {}, test: {}'.format(train.shape, test.shape)
    print 'save dataset...'
    data_utils.save_dataset(train, test, op_scope='3')
Example #8
def solve_opga(directory,
               name,
               depot,
               loc,
               prize,
               max_length,
               disable_cache=False):
    problem_filename = os.path.join(directory, "{}.opga.pkl".format(name))
    if os.path.isfile(problem_filename) and not disable_cache:
        (prize, tour, duration) = load_dataset(problem_filename)
    else:
        # 0 = start, 1 = end so add depot twice
        start = time.time()
        prize, tour, duration = run_opga_alg(
            [(*pos, p)
             for p, pos in zip([0, 0] + prize, [depot, depot] + loc)],
            max_length,
            return_sol=True,
            verbose=False)
        duration = time.time() - start  # Measure clock time
        save_dataset((prize, tour, duration), problem_filename)

    # The first and last tour entries are the start/end depots (indices 0 and 1), so client indices start at 2; subtract 1 so clients are numbered from 1 (depot = 0)
    assert tour[0][3] == 0
    assert tour[-1][3] == 1
    return -prize, [i - 1 for x, y, p, i, t in tour[1:-1]], duration
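
Most of the solver wrappers in these examples (OR-Tools, LKH, Gurobi, Compass, Concorde, PCTSP) repeat the same caching pattern as solve_opga above: reuse a result pickle if it exists, otherwise run the solver, time it, and persist the result with save_dataset. A standalone sketch of that pattern follows, assuming pickle-backed load_dataset/save_dataset helpers; the helper bodies and compute_fn are illustrative assumptions, not the project's code.

import os
import pickle
import time

def save_dataset(obj, path):
    # Assumed pickle-backed helper, matching how save_dataset is used in these examples.
    with open(path, "wb") as f:
        pickle.dump(obj, f)

def load_dataset(path):
    # Assumed counterpart of save_dataset.
    with open(path, "rb") as f:
        return pickle.load(f)

def solve_cached(cache_path, compute_fn, disable_cache=False):
    # Reuse a cached (result, duration) tuple if present; otherwise compute, time, and cache it.
    if os.path.isfile(cache_path) and not disable_cache:
        return load_dataset(cache_path)
    start = time.time()
    result = compute_fn()
    duration = time.time() - start
    save_dataset((result, duration), cache_path)
    return result, duration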
Example #9
def solve_ortools(directory, name, depot, loc, penalty, deterministic_prize, stochastic_prize,
                  sec_local_search=0, disable_cache=False):
    # Lazy import so we do not require ortools by default
    from .pctsp_ortools import solve_pctsp_ortools

    try:
        problem_filename = os.path.join(directory, "{}.ortools{}.pkl".format(name, sec_local_search))
        if os.path.isfile(problem_filename) and not disable_cache:
            objval, tour, duration = load_dataset(problem_filename)
        else:
            # 0 = start, 1 = end so add depot twice
            start = time.time()
            objval, tour = solve_pctsp_ortools(depot, loc, deterministic_prize, penalty,
                                               min(sum(deterministic_prize), 1.), sec_local_search=sec_local_search)
            duration = time.time() - start
            save_dataset((objval, tour, duration), problem_filename)
        assert tour[0] == 0, "Tour must start with depot"
        tour = tour[1:]
        total_cost = calc_pctsp_cost(depot, loc, penalty, deterministic_prize, tour)
        assert abs(total_cost - objval) <= 1e-5, "Cost is incorrect"
        return total_cost, tour, duration
    except Exception as e:
        # For some reason, OR-Tools sometimes cannot find a feasible solution.
        # By letting it fail we do not get complete results, but we can retry via the caching mechanism.
        print("Exception occurred")
        print(e)
        return None
Example #10
def main():
    if os.path.exists(Configure.processed_train_path.format('8')):
        return

    train, test = data_utils.load_dataset(op_scope='7')
    print 'train: {}, test: {}'.format(train.shape, test.shape)
    trip_durations = train['trip_duration']
    del train['trip_duration']
    conbined_data = pd.concat([train, test])

    def driving_distance(raw):
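        # Great-circle (straight-line) distance in miles between pickup and dropoff via geopy;
        # despite the function name and the 'osmnx_distance' column, this is not a road-network distance.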
        startpoint = (raw['pickup_latitude'], raw['pickup_longitude'])
        endpoint = (raw['dropoff_latitude'], raw['dropoff_longitude'])
        distance = great_circle(startpoint, endpoint).miles
        return distance

    print 'calc geopy distance features...'
    conbined_data['osmnx_distance'] = conbined_data[[
        'pickup_latitude', 'pickup_longitude', 'dropoff_latitude',
        'dropoff_longitude'
    ]].apply(driving_distance, axis=1)

    train = conbined_data.iloc[:train.shape[0], :]
    test = conbined_data.iloc[train.shape[0]:, :]
    train['trip_duration'] = trip_durations

    print 'train: {}, test: {}'.format(train.shape, test.shape)
    print 'save dataset...'
    data_utils.save_dataset(train, test, op_scope='8')
Example #11
def main():
    train, test = data_utils.load_dataset(op_scope='5')
    print 'train: {}, test: {}'.format(train.shape, test.shape)
    print 'add fastest routes features...'
    train_fr_1 = pd.read_csv('../input/fastest_routes_train_part_1.csv')
    train_fr_2 = pd.read_csv('../input/fastest_routes_train_part_2.csv')
    test_fr = pd.read_csv('../input/fastest_routes_test.csv')

    train_fr = pd.concat((train_fr_1, train_fr_2))

    train = train.merge(train_fr, how='left', on='id')
    test = test.merge(test_fr, how='left', on='id')

    generate_street_heavy(train, test)

    train.drop([
        'starting_street', 'end_street', 'street_for_each_step',
        'distance_per_step', 'travel_time_per_step', 'step_maneuvers',
        'step_direction', 'step_location_list'
    ],
               axis=1,
               inplace=True)
    test.drop([
        'starting_street', 'end_street', 'street_for_each_step',
        'distance_per_step', 'travel_time_per_step', 'step_maneuvers',
        'step_direction', 'step_location_list'
    ],
              axis=1,
              inplace=True)

    print 'add weather features...'
    train, test = add_weather_features(train, test)
    print 'train: {}, test: {}'.format(train.shape, test.shape)
    print 'save dataset...'
    data_utils.save_dataset(train, test, op_scope='6')
Example #12
def main():
    if os.path.exists(Configure.processed_train_path.format('7')):
        return

    train, test = data_utils.load_dataset(op_scope='6')
    print 'train: {}, test: {}'.format(train.shape, test.shape)
    trip_durations = train['trip_duration']
    del train['trip_duration']
    conbined_data = pd.concat([train, test])

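    # Flag pickups on major US holidays: fixed dates (New Year, July 4, Veterans Day, Christmas)
    # plus weekday-window holidays (MLK Day, Presidents Day, Memorial Day, Labor Day, Columbus Day,
    # Thanksgiving); the second column flags the day before each holiday.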
    conbined_data['is_holyday'] = conbined_data.apply(
        lambda row: 1
        if (row['pickup_month'] == 1 and row['pickup_day'] == 1) or
        (row['pickup_month'] == 7 and row['pickup_day'] == 4) or
        (row['pickup_month'] == 11 and row['pickup_day'] == 11) or
        (row['pickup_month'] == 12 and row['pickup_day'] == 25) or
        (row['pickup_month'] == 1 and row['pickup_day'] >= 15 and row[
            'pickup_day'] <= 21 and row['pickup_weekday'] == 0) or
        (row['pickup_month'] == 2 and row['pickup_day'] >= 15 and row[
            'pickup_day'] <= 21 and row['pickup_weekday'] == 0) or
        (row['pickup_month'] == 5 and row['pickup_day'] >= 25 and row[
            'pickup_day'] <= 31 and row['pickup_weekday'] == 0) or
        (row['pickup_month'] == 9 and row['pickup_day'] >= 1 and row[
            'pickup_day'] <= 7 and row['pickup_weekday'] == 0) or
        (row['pickup_month'] == 10 and row['pickup_day'] >= 8 and row[
            'pickup_day'] <= 14 and row['pickup_weekday'] == 0) or
        (row['pickup_month'] == 11 and row['pickup_day'] >= 22 and row[
            'pickup_day'] <= 28 and row['pickup_weekday'] == 3) else 0,
        axis=1)
    conbined_data['is_day_before_holyday'] = conbined_data.apply(
        lambda row: 1
        if (row['pickup_month'] == 12 and row['pickup_day'] == 31) or
        (row['pickup_month'] == 7 and row['pickup_day'] == 3) or
        (row['pickup_month'] == 11 and row['pickup_day'] == 10) or
        (row['pickup_month'] == 12 and row['pickup_day'] == 24) or
        (row['pickup_month'] == 1 and row['pickup_day'] >= 14 and row[
            'pickup_day'] <= 20 and row['pickup_weekday'] == 6) or
        (row['pickup_month'] == 2 and row['pickup_day'] >= 14 and row[
            'pickup_day'] <= 20 and row['pickup_weekday'] == 6) or
        (row['pickup_month'] == 5 and row['pickup_day'] >= 24 and row[
            'pickup_day'] <= 30 and row['pickup_weekday'] == 6) or (
                (row['pickup_month'] == 9 and row['pickup_day'] >= 1 and row[
                    'pickup_day'] <= 6) or (row['pickup_month'] == 8 and row[
                        'pickup_day'] == 31) and row['pickup_weekday'] == 6) or
        (row['pickup_month'] == 10 and row['pickup_day'] >= 7 and row[
            'pickup_day'] <= 13 and row['pickup_weekday'] == 6) or
        (row['pickup_month'] == 11 and row['pickup_day'] >= 21 and row[
            'pickup_day'] <= 27 and row['pickup_weekday'] == 2) else 0,
        axis=1)

    train = conbined_data.iloc[:train.shape[0], :]
    test = conbined_data.iloc[train.shape[0]:, :]
    train['trip_duration'] = trip_durations

    print 'train: {}, test: {}'.format(train.shape, test.shape)
    print 'save dataset...'
    data_utils.save_dataset(train, test, op_scope='7')
Example #13
def solve_lkh_log(executable,
                  directory,
                  name,
                  depot,
                  loc,
                  demand,
                  capacity,
                  grid_size=1,
                  runs=1,
                  disable_cache=False):

    problem_filename = os.path.join(directory,
                                    "{}.lkh{}.vrp".format(name, runs))
    tour_filename = os.path.join(directory, "{}.lkh{}.tour".format(name, runs))
    output_filename = os.path.join(directory,
                                   "{}.lkh{}.pkl".format(name, runs))
    param_filename = os.path.join(directory, "{}.lkh{}.par".format(name, runs))
    log_filename = os.path.join(directory, "{}.lkh{}.log".format(name, runs))

    try:
        # May have already been run
        if os.path.isfile(output_filename) and not disable_cache:
            tour, duration = load_dataset(output_filename)
        else:
            write_vrplib(problem_filename,
                         depot,
                         loc,
                         demand,
                         capacity,
                         grid_size,
                         name=name)

            params = {
                "PROBLEM_FILE": problem_filename,
                "OUTPUT_TOUR_FILE": tour_filename,
                "RUNS": runs,
                "SEED": 1234
            }
            write_lkh_par(param_filename, params)

            with open(log_filename, 'w') as f:
                start = time.time()
                check_call([executable, param_filename], stdout=f, stderr=f)
                duration = time.time() - start

            tour = read_vrplib(tour_filename, n=len(demand))

            save_dataset((tour, duration), output_filename)

        return calc_vrp_cost(depot, loc, tour), tour, duration

    except Exception as e:
        # raise  # uncomment to propagate failures instead of returning None
        print("Exception occurred")
        print(e)
        return None
Example #14
def solve_salesman(directory,
                   name,
                   depot,
                   loc,
                   penalty,
                   deterministic_prize,
                   stochastic_prize,
                   runs=10):

    problem_filename = os.path.join(directory,
                                    "{}.salesman{}.pctsp".format(name, runs))
    output_filename = os.path.join(directory,
                                   "{}.salesman{}.pkl".format(name, runs))

    try:
        # May have already been run
        if not os.path.isfile(output_filename):
            write_pctsp(problem_filename,
                        depot,
                        loc,
                        penalty,
                        deterministic_prize,
                        name=name)

            start = time.time()

            random.seed(1234)
            pctsp = Pctsp()
            pctsp.load(problem_filename, float_to_scaled_int(1.))
            s = solution.random(pctsp, start_size=int(len(pctsp.prize) * 0.7))
            s = ilocal_search(s, n_runs=runs)

            output = (s.route[:s.size], s.quality)

            duration = time.time() - start

            save_dataset((output, duration), output_filename)
        else:
            output, duration = load_dataset(output_filename)

        # Now parse output
        tour = output[0][:]
        assert tour[0] == 0, "Tour should start with depot"
        assert tour[-1] != 0, "Tour should not end with depot"
        tour = tour[1:]  # Strip off depot

        total_cost = calc_pctsp_cost(depot, loc, penalty, deterministic_prize,
                                     tour)
        assert (float_to_scaled_int(total_cost) - output[1]) / float(
            output[1]) < 1e-5
        return total_cost, tour, duration
    except Exception as e:
        print("Exception occured")
        print(e)
        return None
Example #15
def solve_gurobi(directory,
                 name,
                 depot,
                 loc,
                 penalty,
                 deterministic_prize,
                 stochastic_prize,
                 disable_cache=False,
                 timeout=None,
                 gap=None):
    # Lazy import so we do not need to have gurobi installed to run this script
    from .pctsp_gurobi import \
        solve_euclidian_pctsp as solve_euclidian_pctsp_gurobi

    try:
        problem_filename = os.path.join(
            directory, "{}.gurobi{}{}.pkl".format(
                name, "" if timeout is None else "t{}".format(timeout),
                "" if gap is None else "gap{}".format(gap)))

        if os.path.isfile(problem_filename) and not disable_cache:
            (cost, tour, duration) = load_dataset(problem_filename)
        else:
            # 0 = start, 1 = end so add depot twice
            start = time.time()

            # Must collect a total prize of 1, or the sum of all prizes if that is less than 1.
            cost, tour = solve_euclidian_pctsp_gurobi(
                depot,
                loc,
                penalty,
                deterministic_prize,
                min(sum(deterministic_prize), 1.),
                threads=1,
                timeout=timeout,
                gap=gap)
            duration = time.time() - start  # Measure clock time
            save_dataset((cost, tour, duration), problem_filename)

        # The tour starts at the depot (node 0); strip it off.
        assert tour[0] == 0
        tour = tour[1:]

        total_cost = calc_pctsp_cost(depot, loc, penalty, deterministic_prize,
                                     tour)
        assert abs(total_cost - cost) <= 1e-5, "Cost is incorrect"
        return total_cost, tour, duration

    except Exception as e:
        # For some reason, the solver sometimes cannot find a feasible solution.
        # By letting it fail we do not get complete results, but we can retry via the caching mechanism.
        print("Exception occurred")
        print(e)
        return None
Example #16
    def __init__(self,
                 num_nodes,
                 num_neighbors,
                 batch_size,
                 filepath,
                 target_filepath=None,
                 do_shuffle=False,
                 do_prep=True):
        """
        Args:
            num_nodes: Number of nodes in TSP tours
            num_neighbors: Number of neighbors to consider for each node in graph
            batch_size: Batch size
            filepath: Path to dataset file (.txt file)
        """
        self.num_nodes = num_nodes
        self.num_neighbors = num_neighbors
        self.batch_size = batch_size
        self.filepath = filepath
        filedata = load_dataset(filepath)  # open(filepath, "r").readlines()

        self.target_filepath = target_filepath
        if target_filepath is not None:
            self.has_target = True
            target_filedata, parallelism = load_dataset(target_filepath)
            self.filedata = list([
                (inst, sol) for inst, sol in zip(filedata, target_filedata)
                if sol is not None
            ])
        else:
            self.has_target = False
            self.filedata = list([(inst, None) for inst in filedata])

        self.do_prep = do_prep

        if do_shuffle:
            self.shuffle()

        self.max_iter = (len(self.filedata) // batch_size)
        assert self.max_iter > 0, "Not enough instances ({}) for batch size ({})".format(
            len(self.filedata), batch_size)
Example #17
def pre_train():
    train_all, test = load_dataset(0)
    train_all.fillna(-1, inplace=True)
    test.fillna(-1, inplace=True)

    y_train_all = train_all['orderType']
    id_train = train_all['userid']
    train_all.drop(['orderType'], axis=1, inplace=True)

    id_test = test['userid']

    print("train_all: ({}), test: ({})".format(train_all.shape, test.shape))
    return train_all, y_train_all, id_train, test, id_test
Example #18
    def __init__(self,
                 num_nodes,
                 num_neighbors,
                 batch_size,
                 filepath,
                 target_filepath=None,
                 do_shuffle=False):
        """
        Args:
            num_nodes: Number of nodes in VRP tours (excl depot)
            num_neighbors: Number of neighbors to consider for each node in graph
            batch_size: Batch size
            filepath: Path to dataset file (.txt file)
            variant: 'routes' to predict all edges for routes, 'clusters' to predict which nodes go together in clusters
        """
        self.num_nodes = num_nodes
        self.num_neighbors = num_neighbors
        self.batch_size = batch_size
        self.filepath = filepath
        filedata = load_dataset(filepath)

        self.target_filepath = target_filepath
        if target_filepath is not None:
            self.has_target = True
            target_filedata, parallelism = load_dataset(target_filepath)
            self.filedata = list([
                (inst, sol) for inst, sol in zip(filedata, target_filedata)
                if sol is not None
            ])
        else:
            self.has_target = False
            self.filedata = list([(inst, None) for inst in filedata])

        if do_shuffle:
            self.shuffle()

        self.max_iter = (len(self.filedata) // batch_size)
        assert self.max_iter > 0, "Not enough instances ({}) for batch size ({})".format(
            len(self.filedata), batch_size)
Example #19
def load_data(args):
    df = load_dataset()
    unique_img_ids = get_unique_img_ids(df)
    balanced_dataset = get_balanced_dataset(unique_img_ids)

    if args['drop_empty_images']:
        balanced_dataset = drop_empty_images(balanced_dataset)

    train_df, valid_df = get_train_val_datasets(df, balanced_dataset)
    valid_x, valid_y = split_validation_dataset(valid_df,
                                                classify=args['classify_mode'])

    return train_df, valid_x, valid_y
Example #20
def solve_compass_log(executable,
                      directory,
                      name,
                      depot,
                      loc,
                      prize,
                      max_length,
                      disable_cache=False):

    problem_filename = os.path.join(directory, "{}.oplib".format(name))
    tour_filename = os.path.join(directory, "{}.tour".format(name))
    output_filename = os.path.join(directory, "{}.compass.pkl".format(name))
    log_filename = os.path.join(directory, "{}.log".format(name))

    try:
        # May have already been run
        if os.path.isfile(output_filename) and not disable_cache:
            tour, duration = load_dataset(output_filename)
        else:
            write_oplib(problem_filename,
                        depot,
                        loc,
                        prize,
                        max_length,
                        name=name)

            with open(log_filename, 'w') as f:
                start = time.time()
                check_call([
                    executable, '--op', '--op-ea4op', problem_filename, '-o',
                    tour_filename
                ],
                           stdout=f,
                           stderr=f)
                duration = time.time() - start

            tour = read_oplib(tour_filename, n=len(prize))
            if not calc_op_length(depot, loc, tour) <= max_length:
                print("Warning: length exceeds max length:",
                      calc_op_length(depot, loc, tour), max_length)
            assert calc_op_length(
                depot, loc, tour
            ) <= max_length + MAX_LENGTH_TOL, "Tour exceeds max_length!"
            save_dataset((tour, duration), output_filename)

        return -calc_op_total(prize, tour), tour, duration

    except Exception as e:
        print("Exception occured")
        print(e)
        return None
Example #21
def solve_gurobi(directory,
                 name,
                 depot,
                 loc,
                 prize,
                 max_length,
                 disable_cache=False,
                 timeout=None,
                 gap=None):
    # Lazy import so we do not need to have gurobi installed to run this script
    from problems.op.op_gurobi import \
        solve_euclidian_op as solve_euclidian_op_gurobi

    try:
        problem_filename = os.path.join(
            directory, "{}.gurobi{}{}.pkl".format(
                name, "" if timeout is None else "t{}".format(timeout),
                "" if gap is None else "gap{}".format(gap)))

        if os.path.isfile(problem_filename) and not disable_cache:
            (cost, tour, duration) = load_dataset(problem_filename)
        else:
            # 0 = start, 1 = end so add depot twice
            start = time.time()

            cost, tour = solve_euclidian_op_gurobi(depot,
                                                   loc,
                                                   prize,
                                                   max_length,
                                                   threads=1,
                                                   timeout=timeout,
                                                   gap=gap)
            duration = time.time() - start  # Measure clock time
            save_dataset((cost, tour, duration), problem_filename)

        # The tour starts at the depot (node 0); strip it off.
        assert tour[0] == 0
        tour = tour[1:]
        assert calc_op_length(
            depot, loc,
            tour) <= max_length + MAX_LENGTH_TOL, "Tour exceeds max_length!"
        total_cost = -calc_op_total(prize, tour)
        assert abs(total_cost - cost) <= 1e-4, "Cost is incorrect"
        return total_cost, tour, duration

    except Exception as e:
        # For some reason, the solver sometimes cannot find a feasible solution.
        # By letting it fail we do not get complete results, but we can retry via the caching mechanism.
        print("Exception occurred")
        print(e)
        return None
Example #22
def main(base_data_dir):
    op_scope = 0
    if os.path.exists(Configure.processed_train_path.format(base_data_dir, op_scope+1)):
        return

    print("---> load datasets from scope {}".format(op_scope))
    train, test = data_utils.load_dataset(base_data_dir, op_scope)
    print("train: {}, test: {}".format(train.shape, test.shape))
    print("---> generate basic statistic features")
    train['num_of_chars_q1'] = train['question1'].apply(lambda x: len(str(x)))
    train['num_of_chars_q2'] = train['question2'].apply(lambda x: len(str(x)))
    test['num_of_chars_q1'] = test['question1'].apply(lambda x: len(str(x)))
    test['num_of_chars_q2'] = test['question2'].apply(lambda x: len(str(x)))

    train['num_of_words_q1'] = train['question1'].apply(lambda x: len(str(x).split()))
    train['num_of_words_q2'] = train['question2'].apply(lambda x: len(str(x).split()))
    test['num_of_words_q1'] = test['question1'].apply(lambda x: len(str(x).split()))
    test['num_of_words_q2'] = test['question2'].apply(lambda x: len(str(x).split()))

    print('---> generate unigram_words features before cleaned')
    train = jobs.parallelize_dataframe(train, generate_unigram_words_features)
    test = jobs.parallelize_dataframe(test, generate_unigram_words_features)

    print('---> clean text')
    start = time.time()
    if 'no_stem_words' in base_data_dir:
        print('clean train question')
        train = jobs.parallelize_dataframe(train, clean_text_func_no_stem_words)
        print('clean test question')
        test = jobs.parallelize_dataframe(test, clean_text_func_no_stem_words)
    else:
        print('clean train question')
        train = jobs.parallelize_dataframe(train, clean_text_func_stem_words)
        print('clean test question')
        test = jobs.parallelize_dataframe(test, clean_text_func_stem_words)

    stop = time.time()
    print("text cleaned, cost {:.2f}s".format(stop - start))

    print('---> generate unigram_words features after cleaned')
    train = jobs.parallelize_dataframe(train, generate_cleaned_unigram_words_features)
    test = jobs.parallelize_dataframe(test, generate_cleaned_unigram_words_features)

    print("train: {}, test: {}".format(train.shape, test.shape))
    print("---> save datasets")
    data_utils.save_dataset(base_data_dir, train, test, op_scope + 1)
Example #23
def main(base_data_dir):
    op_scope = 4
    if os.path.exists(
            Configure.processed_train_path.format(base_data_dir,
                                                  op_scope + 1)):
        return

    print("---> load datasets from scope {}".format(op_scope))
    train, test = data_utils.load_dataset(base_data_dir, op_scope)
    print("train: {}, test: {}".format(train.shape, test.shape))

    print('---> generate common word count')
    train = jobs.parallelize_dataframe(train, generate_common_word_count)
    test = jobs.parallelize_dataframe(test, generate_common_word_count)

    print("train: {}, test: {}".format(train.shape, test.shape))
    print("---> save datasets")
    data_utils.save_dataset(base_data_dir, train, test, op_scope + 1)
Example #24
def main():
    if os.path.exists(Configure.processed_train_path.format('0')):
        return

    train, test = data_utils.load_dataset(op_scope='0')
    print 'train: {}, test: {}'.format(train.shape, test.shape)

    # store_and_fwd_flag
    train['is_store_and_fwd_flag'] = train['store_and_fwd_flag'].map(
        lambda s: 1 if s == 'Y' else 0)
    test['is_store_and_fwd_flag'] = test['store_and_fwd_flag'].map(
        lambda s: 1 if s == 'Y' else 0)
    del train['store_and_fwd_flag']
    del test['store_and_fwd_flag']

    print 'train: {}, test: {}'.format(train.shape, test.shape)
    print 'save dataset...'
    data_utils.save_dataset(train, test, op_scope='0')
Example #25
def run(traces, outfname):
    X, Y, W, _, _ = load_dataset(traces)

    sizes = encode_sizes(X)

    log('Computing pairwise distances')
    D = pairwise_levenshtein_distances(sizes)
    log('Computing subtractions')

    log('Storing distances into {}'.format(outfname))

    data = {
        'webpage-id': W,
        'label': np.array(Y),
        'pairdist': D,
    }

    with open(outfname, 'wb') as f:
        dill.dump(data, f)
Example #26
def solve_pctsp_log(executable, directory, name, depot, loc, penalty, deterministic_prize, stochastic_prize, runs=10):

    problem_filename = os.path.join(directory, "{}.pctsp{}.pctsp".format(name, runs))
    output_filename = os.path.join(directory, "{}.pctsp{}.pkl".format(name, runs))
    log_filename = os.path.join(directory, "{}.pctsp{}.log".format(name, runs))

    try:
        # May have already been run
        if not os.path.isfile(output_filename):
            write_pctsp(problem_filename, depot, loc, penalty, deterministic_prize, name=name)
            with open(log_filename, 'w') as f:
                start = time.time()
                output = check_output(
                    # exe, filename, min_total_prize (=1), num_runs
                    [executable, problem_filename, float_to_scaled_int_str(1.), str(runs)],
                    stderr=f
                ).decode('utf-8')
                duration = time.time() - start
                f.write(output)

            save_dataset((output, duration), output_filename)
        else:
            output, duration = load_dataset(output_filename)

        # Now parse output
        tour = None
        for line in output.splitlines():
            heading = "Best Result Route: "
            if line[:len(heading)] == heading:
                tour = np.array(line[len(heading):].split(" ")).astype(int)
                break
        assert tour is not None, "Could not find tour in output!"

        assert tour[0] == 0, "Tour should start with depot"
        assert tour[-1] == 0, "Tour should end with depot"
        tour = tour[1:-1]  # Strip off depot

        return calc_pctsp_cost(depot, loc, penalty, deterministic_prize, tour), tour.tolist(), duration
    except Exception as e:
        print("Exception occured")
        print(e)
        return None
Example #27
def main():
    train, test = data_utils.load_dataset(op_scope='4')
    print 'train: {}, test: {}'.format(train.shape, test.shape)
    trip_durations = train['trip_duration']
    del train['trip_duration']
    conbined_data = pd.concat([train, test])
    conbined_data.columns = test.columns.values
    conbined_data.index = range(conbined_data.shape[0])

    # timewindow size in minutes
    timewindow_days = [10, 15]
    conbined_data = perform_time_window(conbined_data, timewindow_days)

    train = conbined_data.iloc[:train.shape[0], :]
    test = conbined_data.iloc[train.shape[0]:, :]

    train['trip_duration'] = trip_durations
    print 'train: {}, test: {}'.format(train.shape, test.shape)
    print 'save dataset...'
    data_utils.save_dataset(train, test, op_scope='5')
Example #28
def solve_concorde_log(executable, directory, name, loc, disable_cache=False):

    problem_filename = os.path.join(directory, "{}.tsp".format(name))
    tour_filename = os.path.join(directory, "{}.tour".format(name))
    output_filename = os.path.join(directory, "{}.concorde.pkl".format(name))
    log_filename = os.path.join(directory, "{}.log".format(name))

    # if True:
    try:
        # May have already been run
        if os.path.isfile(output_filename) and not disable_cache:
            tour, duration = load_dataset(output_filename)
        else:
            write_tsplib(problem_filename, loc, name=name)

            with open(log_filename, 'w') as f:
                start = time.time()
                try:
                    # Concorde is weird, will leave traces of solution in current directory so call from target dir
                    check_call([
                        executable, '-s', '1234', '-x', '-o',
                        os.path.abspath(tour_filename),
                        os.path.abspath(problem_filename)
                    ],
                               stdout=f,
                               stderr=f,
                               cwd=directory)
                except CalledProcessError as e:
                    # Somehow Concorde returns 255
                    assert e.returncode == 255
                duration = time.time() - start

            tour = read_concorde_tour(tour_filename)
            save_dataset((tour, duration), output_filename)

        return calc_tsp_length(loc, tour), tour, duration

    except Exception as e:
        print("Exception occured")
        print(e)
        return None
Example #29
def main():
    if os.path.exists(Configure.processed_train_path.format('4')):
        return

    train, test = data_utils.load_dataset(op_scope='3')
    print 'train: {}, test: {}'.format(train.shape, test.shape)
    trip_durations = train['trip_duration']
    del train['trip_duration']
    conbined_data = pd.concat([train, test])

    drop_missing_rate = 1
    print 'drop some features, missing_rate > {}'.format(drop_missing_rate)
    conbined_data = drop_some_features(conbined_data, drop_missing_rate=drop_missing_rate)

    train = conbined_data.iloc[:train.shape[0], :]
    test = conbined_data.iloc[train.shape[0]:, :]
    train['trip_duration'] = trip_durations

    print 'train: {}, test: {}'.format(train.shape, test.shape)
    print 'save dataset...'
    data_utils.save_dataset(train, test, op_scope='4')
Example #30
def solve_ortools(directory,
                  name,
                  depot,
                  loc,
                  prize,
                  max_length,
                  sec_local_search=0,
                  disable_cache=False):
    # Lazy import so we do not require ortools by default
    from problems.op.op_ortools import solve_op_ortools

    try:
        problem_filename = os.path.join(
            directory, "{}.ortools{}.pkl".format(name, sec_local_search))
        if os.path.isfile(problem_filename) and not disable_cache:
            objval, tour, duration = load_dataset(problem_filename)
        else:
            # 0 = start, 1 = end so add depot twice
            start = time.time()
            objval, tour = solve_op_ortools(depot,
                                            loc,
                                            prize,
                                            max_length,
                                            sec_local_search=sec_local_search)
            duration = time.time() - start
            save_dataset((objval, tour, duration), problem_filename)
        assert tour[0] == 0, "Tour must start with depot"
        tour = tour[1:]
        assert calc_op_length(
            depot, loc,
            tour) <= max_length + MAX_LENGTH_TOL, "Tour exceeds max_length!"
        assert abs(-calc_op_total(prize, tour) -
                   objval) <= 1e-5, "Cost is incorrect"
        return -calc_op_total(prize, tour), tour, duration
    except Exception as e:
        # For some reason, OR-Tools sometimes cannot find a feasible solution.
        # By letting it fail we do not get complete results, but we can retry via the caching mechanism.
        print("Exception occurred")
        print(e)
        return None