def __init__(self, config):
    """
    Constructor
    :param config:
    :return:
    """
    self.config = config
    self.logger = logging.getLogger("so_logger")
    self.data_provider = DataProvider(self.config)
    self.data_exporter = DataExporter(self.config)
def run(self):
    """
    Execute the job
    :param:
    :return:
    """
    self.logger.info("Starting job: GuruDataProcessor\n")
    data_provider = DataProvider(self.config)
    data_exporter = DataExporter(self.config)

    # Read guru data
    df = data_provider.read_guru_user_data()
    df = df[[3, 4]]  # Salary and skills columns
    df.columns = ['cost', 'skills']
    df = df[(df.cost != "$0") & (df.skills != "UNKNOWN")]
    df = df.reset_index(drop=True)
    df = df.assign(user_id=df.index.values)
    df = df.assign(skills=df.apply(lambda x: x['skills'][:-1].split(','), axis=1))

    # Convert cost to integers
    user_df = df.assign(cost=df.apply(lambda x: int(x['cost'][1:]), axis=1))

    # Read skills data
    df = data_provider.read_guru_skill_data()
    df = df[[1]]
    df.columns = ['skill']
    skill_df = df.assign(skill_id=df.index.values)

    # Create multilabel binarizer
    mlb = MultiLabelBinarizer(classes=skill_df.skill.values)

    # One hot encoding of user skills
    skills = mlb.fit_transform(user_df['skills'])

    # Create dataset
    users = user_df.to_dict('records')
    for i in range(len(users)):
        users[i]['skills_array'] = skills[i]

    # Export csv files
    data_exporter.export_csv_file(user_df, "guru/guru_user_df.csv")
    data_exporter.export_csv_file(skill_df, "guru/guru_skill_df.csv")

    # Scaling factor for submodular function
    scaling_factor = 1

    # Create and export data object to be used in experiments
    # containing all methods related to guru data
    guru = GuruData(self.config, user_df, skill_df, users, scaling_factor)
    data_exporter.export_dill_file(guru, "guru/guru_data.dill")
    self.logger.info("Finished job: GuruDataProcessor")
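A minimal sketch of the one-hot encoding step above, assuming scikit-learn's MultiLabelBinarizer and made-up skill names; it shows how each user's skill list becomes a binary vector aligned with the skill universe:

# Hypothetical illustration of MultiLabelBinarizer (not part of the job itself)
from sklearn.preprocessing import MultiLabelBinarizer

all_skills = ['python', 'sql', 'design']          # hypothetical skill universe (skill_df.skill.values)
user_skills = [['python', 'sql'], ['design']]     # hypothetical per-user skill lists

mlb = MultiLabelBinarizer(classes=all_skills)
encoded = mlb.fit_transform(user_skills)
# encoded[i][j] == 1 iff user i has skill j, ordered as in all_skills:
# [[1 1 0]
#  [0 0 1]]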
def __init__(self, config_):
    """
    Constructor
    :param config_:
    :return:
    """
    self.config = config_
    self.logger = logging.getLogger("cuda_logger")
    data_provider = DataProvider(self.config)
    filename = self.config['city_state_creator'].get('filename', 'city_states.dill')
    self.city_states = data_provider.read_city_states(filename)
    self.reg_models = data_provider.read_regression_models()
def __init__(self, config_, year, month, weekday):
    """
    Constructor
    :param config_:
    :param year:
    :param month:
    :param weekday:
    :return:
    """
    self.config = config_
    self.year = year
    self.month = month
    self.weekday = weekday
    self.logger = logging.getLogger("cuda_logger")
    data_provider = DataProvider(self.config)
    hex_attr_df = data_provider.read_hex_bin_attributes()
    hex_attr_df['center'] = hex_attr_df.apply(self.calculate_bin_center, axis=1)
    self.hex_attr_df = hex_attr_df
def get_trips_data(self):
    """
    Gets data for appropriate weekday and month
    :param:
    :return df:
    """
    # Create SQL query
    query = """
        SELECT
            *, HOUR(tpep_pickup_datetime) as pickup_hour
        FROM
            `yt-{0}-{1}`
        WHERE
            WEEKDAY(tpep_pickup_datetime) like {2}
            AND pickup_bin IS NOT NULL
            AND dropoff_bin IS NOT NULL
            AND pickup_bin != dropoff_bin;
        """

    # Read query output and select required columns
    df = DataProvider.read_sql_query(
        query.format(self.month, self.year, self.weekday))
    cols = [
        'tpep_pickup_datetime', 'tpep_dropoff_datetime', 'trip_distance',
        'fare_amount', 'duration_seconds', 'pickup_bin', 'dropoff_bin',
        'pickup_hour'
    ]
    df = df[cols]
    return df
def __init__(self, config_):
    """
    Constructor
    :param config_:
    :returns:
    """
    self.config = config_
    self.logger = logging.getLogger("cuda_logger")
    self.start_time = self.config["city_state_creator"]["start_time"]
    self.end_time = self.config["city_state_creator"]["end_time"]
    self.time_slice_duration = self.config["city_state_creator"]["time_slice_duration"]
    self.time_unit_duration = self.config["city_state_creator"]["time_unit_duration"]
    data_provider = DataProvider(self.config)
    hex_attr_df = data_provider.read_hex_bin_attributes()
    hex_dist_df = data_provider.read_hex_bin_distances()
    self.hex_bins = hex_attr_df['hex_id'].values
    self.hex_dist = hex_dist_df[['pickup_bin', 'dropoff_bin', 'straight_line_distance']]
def __init__(self, config_):
    """
    Constructor
    :param config_:
    :return:
    """
    self.config = config_
    self.logger = logging.getLogger("gym_logger")
    data_provider = DataProvider(self.config)

    # City state parameters
    self.city_states = data_provider.read_city_states()
    self.hex_attr_df = data_provider.read_hex_bin_attributes()
    self.hex_bins = self.hex_attr_df['hex_id']
    self.T = len(self.city_states)  # Number of time steps
    self.S = len(self.hex_bins)  # Number of hex bins

    # Environment parameters
    self.num_drivers = self.config['env_parameters']['num_drivers']
    self.distribution = self.config['env_parameters']['driver_distribution']
    self.next_free_timestep = np.zeros(self.num_drivers)  # Next free timestep for each driver
    self.total_driver_earnings = np.zeros(self.num_drivers)  # Total earnings for each driver

    # Environment action and observation space
    actions = [7 for i in range(self.S)]
    drivers = [self.num_drivers for i in range(self.S)]
    self.action_space = spaces.MultiDiscrete(actions)
    # self.observation_space = spaces.Tuple((
    #     spaces.Discrete(self.T),  # Time step
    #     spaces.MultiDiscrete(drivers)  # Driver distribution
    # ))
    self.observation_space = spaces.MultiDiscrete(drivers)
    self.reset()
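A minimal sketch of what the MultiDiscrete spaces above encode, assuming the standard OpenAI Gym spaces API and hypothetical sizes; each action sample is one choice out of 7 per hex bin, and each observation is a driver count per hex bin:

# Hypothetical illustration of gym.spaces.MultiDiscrete (sizes are made up)
from gym import spaces

num_bins = 4        # stand-in for self.S
num_drivers = 100   # stand-in for self.num_drivers

action_space = spaces.MultiDiscrete([7] * num_bins)
observation_space = spaces.MultiDiscrete([num_drivers] * num_bins)

# Each sample is an integer per hex bin, e.g. array([3, 0, 6, 2])
print(action_space.sample())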
def run(self):
    """
    This method executes the job
    :param:
    :return:
    """
    self.logger.info("Starting job: NeighborhoodDataExportJob\n")
    data_provider = DataProvider(self.config)
    data_exporter = DataExporter(self.config)
    hex_attr_df = data_provider.read_hex_bin_attributes()
    hex_bins = hex_attr_df['hex_id'].values
    data = {}
    for r in xrange(self.radius + 1):
        data[r] = {}
        for hex_bin in hex_bins:
            neighbors = hex_neighborhood(hex_bin, hex_attr_df, r)
            zero_vector = np.zeros(len(hex_bins))
            np.put(zero_vector, neighbors, 1)
            one_hot_encoding_vector = zero_vector
            data[r][hex_bin] = one_hot_encoding_vector
    data_exporter.export_neighborhood_data(data)
    self.logger.info("Finished job: NeighborhoodDataExportJob")
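A minimal sketch of the np.put-based neighborhood encoding used above, with a made-up bin count and neighbor indices; positions of neighboring bins are set to 1 in place, everything else stays 0:

# Hypothetical illustration of the one-hot neighborhood vector
import numpy as np

num_bins = 10
neighbors = [2, 3, 7]          # hypothetical neighbor indices for one hex bin

one_hot = np.zeros(num_bins)
np.put(one_hot, neighbors, 1)  # set neighbor positions to 1 in place
# one_hot -> [0. 0. 1. 1. 0. 0. 0. 1. 0. 0.]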
class BaselineDriver(object):
    def __init__(self, config):
        self.config = config
        self.logger = logging.getLogger("baseline_logger")
        self.data_provider = DataProvider(self.config)
        self.data_exporter = DataExporter(self.config)

    @staticmethod
    def run_baseline(baseline_config):
        baseline_name = baseline_config['name']
        baseline_count = baseline_config['count']
        config = baseline_config['config']
        city_states = baseline_config['city_states']
        episode_rewards = []
        if baseline_name == "cDQN":
            baseline = cDQN(config)
            rewards = baseline.run(city_states)
            for _ in range(len(rewards)):
                episode_rewards.append({
                    'agent': 'cDQN',
                    'episode': _,
                    'run': baseline_count,
                    'earnings': rewards[_]
                })
        if baseline_name == "cA2C":
            baseline = cA2C(config)
            rewards = baseline.run(city_states)
            for _ in range(len(rewards)):
                episode_rewards.append({
                    'agent': 'cA2C',
                    'episode': _,
                    'run': baseline_count,
                    'earnings': rewards[_]
                })
        if baseline_name == "A2C":
            baseline = A2C(config)
            rewards = baseline.run(city_states)
            for _ in range(len(rewards)):
                episode_rewards.append({
                    'agent': 'A2C',
                    'episode': _,
                    'run': baseline_count,
                    'earnings': rewards[_]
                })
        return episode_rewards

    def run(self):
        self.logger.info("Starting baselines")
        city_states = self.data_provider.read_city_states()
        baseline_list = self.config['baselines']['baseline_list']

        # Create a pool of processes
        num_processes = mp.cpu_count()
        self.logger.info("Processes: {}".format(num_processes))
        pool = ProcessPool(nodes=num_processes)

        configs = []
        for count in range(10):
            for name in baseline_list:
                configs.append({
                    'name': name,
                    'count': count,
                    'config': self.config,
                    'city_states': city_states
                })

        results = pool.amap(self.run_baseline, configs).get()
        pool.close()
        pool.join()
        pool.clear()

        episode_rewards = []
        for result in results:
            episode_rewards += result
        self.data_exporter.export_baseline_data(episode_rewards)
        self.logger.info("Finished baselines")
def run(self):
    """
    Creates and runs training episode
    :param:
    :return:
    """
    data_provider = DataProvider(self.config)
    hex_attr_df = data_provider.read_hex_bin_attributes()
    hex_distance_df = data_provider.read_hex_bin_distances()
    city_states = data_provider.read_city_states(self.test_parameters['city_states_filename'])
    model = data_provider.read_model(self.test_parameters['model_filename'])
    neighborhood = data_provider.read_neighborhood_data()
    popular_bins = data_provider.read_popular_hex_bins()
    q_ind = model['q_ind']
    r_table = model['r_table']
    xi_matrix = model['xi_matrix']
    episode_id = 0

    # Create episode
    ind_exploration_factor = 0.0
    episode = Episode(self.config, episode_id, ind_exploration_factor,
                      hex_attr_df, hex_distance_df, city_states,
                      neighborhood, popular_bins, q_ind, r_table,
                      xi_matrix, True)

    # Run episode
    tables = episode.run()
    q_ind = tables['q_ind']
    r_table = tables['r_table']
    xi_matrix = tables['xi_matrix']
    episode_tracker = tables['episode_tracker']

    self.testing_tracker.update_RL_tracker(
        0, episode_tracker.gross_earnings,
        episode_tracker.successful_waits, episode_tracker.unsuccessful_waits,
        episode_tracker.unmet_demand, episode_tracker.relocation_rides,
        episode_tracker.DET, episode_tracker.DPRT, episode_tracker.DWT,
        episode_tracker.DRT, episode_tracker.DCT)

    self.logger.info("""
        Expt: {} Earnings: {} Model: {} Test day: {}
        Num drivers: {} Pax rides: {} Relocation rides: {} Unmet demand: {}
        """.format(self.expt_name,
                   episode_tracker.gross_earnings,
                   self.test_parameters['model_filename'],
                   self.test_parameters['city_states_filename'],
                   self.config['RL_parameters']['num_drivers'],
                   episode_tracker.successful_waits,
                   episode_tracker.relocation_rides,
                   episode_tracker.unmet_demand))
    self.logger.info("----------------------------------")
    return self.testing_tracker
class Experiment02(object):
    """
    Experiment02 class
    """

    def __init__(self, config):
        """
        Constructor
        :param config:
        :return:
        """
        self.config = config
        self.logger = logging.getLogger("so_logger")
        self.data_provider = DataProvider(self.config)
        self.data_exporter = DataExporter(self.config)

    @staticmethod
    def run_algorithm(args):
        # Run algorithm
        alg = AlgorithmDriver()
        data = alg.run(*args)
        return data

    def set_scaling_factor(self, data):
        # Find appropriate scaling factor
        alg = SetCoverGreedy(self.config, data.submodular_func, data.E)
        sol = alg.run()
        submodular_val = data.submodular_func(sol)
        cost = data.cost_func(sol)
        scaling_factor = data.scaling_func(submodular_val, cost)

        # Update scaling factor
        data.scaling_factor = scaling_factor

    def run(self):
        """
        Run experiment
        :param:
        :return:
        """
        self.logger.info("Starting experiment 02")
        self.expt_config = self.config['experiment_configs']['experiment_02']
        popular_threshold = self.expt_config['popular_threshold']
        rare_threshold = self.expt_config['rare_threshold']

        user_sample_ratios = [0.4, 1]
        seeds = [i for i in range(6, 10)]
        ks = [1, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50]
        sampling_epsilon_values_stochastic = [0.1, 0.05, 0.01, 0.005]
        error_epsilon_values_scaled_threshold = [0.2, 0.15, 0.1, 0.05]

        num_sampled_skills = 50
        rare_sample_fraction = 0.1
        popular_sample_fraction = 0.1
        scaling_factor = 800

        alg = AlgorithmDriver()
        results = []
        for seed in seeds:
            for user_sample_ratio in user_sample_ratios:
                self.logger.info("Experiment for user sample ratio: {} and scaling factor: {} and seed: {}".format(
                    user_sample_ratio, scaling_factor, seed))

                # Load dataset
                data = self.data_provider.read_freelancer_data_obj()
                config = self.config.copy()
                alg.create_sample(config, data, num_sampled_skills,
                                  rare_sample_fraction, popular_sample_fraction,
                                  rare_threshold, popular_threshold,
                                  user_sample_ratio, seed)
                self.logger.info("Scaling factor for submodular function is: {}".format(scaling_factor))

                # Distorted Greedy
                for k in ks:
                    # Run algorithm
                    start = timer()
                    result = alg.run(self.config, data, "distorted_greedy",
                                     None, None, scaling_factor, num_sampled_skills,
                                     rare_sample_fraction, popular_sample_fraction,
                                     rare_threshold, popular_threshold,
                                     user_sample_ratio, seed, k)
                    end = timer()
                    result['runtime'] = end - start
                    results.append(result)
                    self.logger.info("Algorithm: {} and k: {} and runtime: {}".format(
                        "distorted_greedy", k, end - start))
                self.logger.info("\n")

                # Stochastic Distorted Greedy
                for k in ks:
                    for sample_epsilon in sampling_epsilon_values_stochastic:
                        # Run algorithm
                        start = timer()
                        result = alg.run(config, data, "stochastic_distorted_greedy",
                                         sample_epsilon, None, scaling_factor, num_sampled_skills,
                                         rare_sample_fraction, popular_sample_fraction,
                                         rare_threshold, popular_threshold,
                                         user_sample_ratio, seed, k)
                        end = timer()
                        result['runtime'] = end - start
                        results.append(result)
                        self.logger.info("Algorithm: {} and epsilon: {} and k: {} and runtime: {}".format(
                            "stochastic_distorted_greedy", sample_epsilon, k, end - start))
                self.logger.info("\n")

                # Cost Scaled Greedy
                for k in ks:
                    # Run algorithm
                    start = timer()
                    result = alg.run(self.config, data, "cost_scaled_greedy",
                                     None, None, scaling_factor, num_sampled_skills,
                                     rare_sample_fraction, popular_sample_fraction,
                                     rare_threshold, popular_threshold,
                                     user_sample_ratio, seed, k)
                    end = timer()
                    result['runtime'] = end - start
                    results.append(result)
                    self.logger.info("Algorithm: {} and k: {} and runtime: {}".format(
                        "cost_scaled_greedy", k, end - start))
                self.logger.info("\n")

                # Cost scaled lazy exact greedy
                for k in ks:
                    # Run algorithm
                    start = timer()
                    result = alg.run(self.config, data, "cost_scaled_lazy_greedy",
                                     None, None, scaling_factor, num_sampled_skills,
                                     rare_sample_fraction, popular_sample_fraction,
                                     rare_threshold, popular_threshold,
                                     user_sample_ratio, seed, k)
                    end = timer()
                    result['runtime'] = end - start
                    results.append(result)
                    self.logger.info("Algorithm: {} and k: {} and runtime: {}".format(
                        "cost_scaled_lazy_greedy", k, end - start))
                self.logger.info("\n")

                # Greedy
                for k in ks:
                    # Run algorithm
                    start = timer()
                    result = alg.run(self.config, data, "greedy",
                                     None, None, scaling_factor, num_sampled_skills,
                                     rare_sample_fraction, popular_sample_fraction,
                                     rare_threshold, popular_threshold,
                                     user_sample_ratio, seed, k)
                    end = timer()
                    result['runtime'] = end - start
                    results.append(result)
                    self.logger.info("Algorithm: {} and k: {} and runtime: {}".format(
                        "greedy", k, end - start))
                self.logger.info("\n")

                # Scaled Single Threshold Greedy
                for k in ks:
                    for error_epsilon in error_epsilon_values_scaled_threshold:
                        # Run algorithm
                        start = timer()
                        result = alg.run(self.config, data, "scaled_single_threshold_greedy",
                                         None, error_epsilon, scaling_factor, num_sampled_skills,
                                         rare_sample_fraction, popular_sample_fraction,
                                         rare_threshold, popular_threshold,
                                         user_sample_ratio, seed, k)
                        end = timer()
                        result['runtime'] = end - start
                        results.append(result)
                        self.logger.info("Algorithm: {} and epsilon: {} and k: {} and runtime: {}".format(
                            "scaled_single_threshold_greedy", error_epsilon, k, end - start))
                self.logger.info("\n")

                # Baseline Top k
                for k in ks:
                    # Run algorithm
                    start = timer()
                    result = alg.run(self.config, data, "baseline_topk",
                                     None, None, scaling_factor, num_sampled_skills,
                                     rare_sample_fraction, popular_sample_fraction,
                                     rare_threshold, popular_threshold,
                                     user_sample_ratio, seed, k)
                    end = timer()
                    result['runtime'] = end - start
                    results.append(result)
                    self.logger.info("Algorithm: {} and k: {} and runtime: {}".format(
                        "baseline_topk", k, end - start))
                self.logger.info("\n")

        self.logger.info("Finished experiment 02")

        # Export results
        df = pd.DataFrame(results)
        self.data_exporter.export_csv_file(df, "experiment_02_freelancer_pop01_rare01_greedy.csv")
        self.logger.info("Exported experiment_02 results")
def run(self):
    """
    Creates and runs training episode
    :param:
    :return:
    """
    data_provider = DataProvider(self.config)
    hex_attr_df = data_provider.read_hex_bin_attributes()
    hex_distance_df = data_provider.read_hex_bin_distances()
    city_states = data_provider.read_city_states(self.city_states_filename)
    neighborhood = data_provider.read_neighborhood_data()
    popular_bins = data_provider.read_popular_hex_bins()
    num_episodes = self.config['RL_parameters']['num_episodes']
    ind_episodes = self.config['RL_parameters']['ind_episodes']
    exp_decay_multiplier = self.config['RL_parameters']['exp_decay_multiplier']

    q_ind = None
    r_table = None
    xi_matrix = None

    best_episode = None
    best_model = {}
    # Initialize once, before the loop, so the best episode persists across iterations
    current_best = -1000000

    progress_bar = tqdm(xrange(num_episodes))
    for episode_id in progress_bar:
        progress_bar.set_description("Episode: {}".format(episode_id))

        # Create episode
        ind_exploration_factor = np.e ** (-1 * episode_id * exp_decay_multiplier / ind_episodes)
        episode = Episode(self.config, episode_id, ind_exploration_factor,
                          hex_attr_df, hex_distance_df, city_states,
                          neighborhood, popular_bins, q_ind, r_table, xi_matrix)

        # Run episode
        tables = episode.run()
        q_ind = tables['q_ind']
        r_table = tables['r_table']
        xi_matrix = tables['xi_matrix']
        episode_tracker = tables['episode_tracker']

        # Uncomment for logging if running a job, comment during experiments
        # otherwise it leads to insanely huge logging output which is useless
        # self.logger.info("""
        #     Expt: {} Episode: {} Earnings: {}
        #     Pax rides: {} Relocation rides: {} Unmet demand: {}
        #     """.format(self.expt_name, episode_id,
        #                episode_tracker.gross_earnings,
        #                episode_tracker.successful_waits,
        #                episode_tracker.relocation_rides,
        #                episode_tracker.unmet_demand))
        # self.logger.info("----------------------------------")

        self.training_tracker.update_RL_tracker(
            episode_id, episode_tracker.gross_earnings,
            episode_tracker.successful_waits, episode_tracker.unsuccessful_waits,
            episode_tracker.unmet_demand, episode_tracker.relocation_rides,
            episode_tracker.DET, episode_tracker.DPRT, episode_tracker.DWT,
            episode_tracker.DRT, episode_tracker.DCT)

        # Keep track of the best episode
        if self.objective == 'revenue':
            if episode_tracker.gross_earnings >= current_best:
                best_episode = episode_tracker
                current_best = best_episode.gross_earnings
        else:  # self.objective == 'pickups'
            if episode_tracker.successful_waits >= current_best:
                best_episode = episode_tracker
                current_best = episode_tracker.successful_waits

        # Keep track of the best model
        best_model['ind_exploration_factor'] = ind_exploration_factor
        best_model['config'] = self.config
        best_model['q_ind'] = q_ind
        best_model['r_table'] = r_table
        best_model['xi_matrix'] = xi_matrix
        best_model['training_tracker'] = self.training_tracker

    # After finishing training
    self.logger.info("Expt: {} Earnings: {} Met Demand: {} Unmet Demand: {}".format(
        self.expt_name, best_episode.gross_earnings,
        best_episode.successful_waits, best_episode.unmet_demand))
    return best_episode, best_model, self.training_tracker
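A minimal sketch (with made-up hyperparameters) of the exponential decay used for ind_exploration_factor above; it starts at 1 and falls toward 0 as episode_id grows relative to ind_episodes:

# Hypothetical illustration of the exploration-factor decay schedule
import numpy as np

exp_decay_multiplier = 5.0   # hypothetical RL_parameters['exp_decay_multiplier']
ind_episodes = 100           # hypothetical RL_parameters['ind_episodes']

for episode_id in [0, 25, 50, 100]:
    factor = np.e ** (-1 * episode_id * exp_decay_multiplier / ind_episodes)
    print(episode_id, round(factor, 4))
# 0 -> 1.0, 25 -> ~0.2865, 50 -> ~0.0821, 100 -> ~0.0067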
def run(self):
    """
    Execute the job
    :param:
    :return:
    """
    self.logger.info("Starting job: FreelancerDataProcessor\n")
    data_provider = DataProvider(self.config)
    data_exporter = DataExporter(self.config)

    # Read freelancer data
    df = data_provider.read_freelancer_user_data()
    df_cost = df[[1]]  # Salary/Hour
    df_skills = df[df.columns[4::2]]
    df_skills.replace(to_replace=["Other Skills"], value="", inplace=True)
    # Skills
    df_skills = (df_skills.iloc[:, 0].map(str) + ',' + df_skills.iloc[:, 1].map(str) + ',' +
                 df_skills.iloc[:, 2].map(str) + ',' + df_skills.iloc[:, 3].map(str) + ',' +
                 df_skills.iloc[:, 4].map(str) + ',' + df_skills.iloc[:, 5].map(str))

    user_df = pd.DataFrame()
    user_df['cost'] = df_cost.iloc[:, 0].tolist()
    # Converting all strings to lower case
    user_df['skills'] = df_skills.str.lower().tolist()
    user_df = user_df.reset_index(drop=True)
    user_df = user_df.assign(user_id=user_df.index.values)
    user_df = user_df.assign(skills=user_df.apply(
        lambda x: x['skills'][:-1].split(','), axis=1))

    # Convert cost to integers
    user_df.cost = user_df.cost.astype(int)

    # Read skills data
    df = data_provider.read_freelancer_skill_data()
    df = df[[1]]
    df.columns = ['skill']
    skill_df = df.assign(skill_id=df.index.values)

    # Create multilabel binarizer
    mlb = MultiLabelBinarizer(classes=skill_df.skill.values)

    # One hot encoding of user skills
    skills = mlb.fit_transform(user_df['skills'])

    # Create dataset
    users = user_df.to_dict('records')
    for i in range(len(users)):
        users[i]['skills_array'] = skills[i]

    # Export csv files
    data_exporter.export_csv_file(user_df, "freelancer/freelancer_user_df.csv")
    data_exporter.export_csv_file(skill_df, "freelancer/freelancer_skill_df.csv")

    # Scaling factor for submodular function
    scaling_factor = 1

    # Create and export data object to be used in experiments
    # containing all methods related to freelancer data
    freelancer = FreelancerData(self.config, user_df, skill_df, users, scaling_factor)
    data_exporter.export_dill_file(freelancer, "freelancer/freelancer_data.dill")
    self.logger.info("Finished job: FreelancerDataProcessor")
class Experiment03(object):
    """
    Experiment03 class
    """

    def __init__(self, config):
        """
        Constructor
        :param config:
        :return:
        """
        self.config = config
        self.logger = logging.getLogger("so_logger")
        self.data_provider = DataProvider(self.config)
        self.data_exporter = DataExporter(self.config)

    @staticmethod
    def run_algorithm(args):
        # Run algorithm
        alg = AlgorithmDriver()
        data = alg.run(*args)
        return data

    def set_scaling_factor(self, data):
        # Find appropriate scaling factor
        alg = SetCoverGreedy(self.config, data.submodular_func, data.E)
        sol = alg.run()
        submodular_val = data.submodular_func(sol)
        cost = data.cost_func(sol)
        scaling_factor = data.scaling_func(submodular_val, cost)

        # Update scaling factor
        data.scaling_factor = scaling_factor

    def run(self):
        """
        Run experiment
        :param:
        :return:
        """
        self.logger.info("Starting experiment 03")
        self.expt_config = self.config['experiment_configs']['experiment_03']
        popular_threshold = self.expt_config['popular_threshold']
        rare_threshold = self.expt_config['rare_threshold']

        user_sample_ratios = [0.05, 0.1]
        seeds = [i for i in range(6, 10)]
        sampling_epsilon_values_stochastic = [0.1, 0.05, 0.01, 0.005]
        error_epsilon_values_scaled_threshold = [0.2, 0.15, 0.1, 0.05]

        num_sampled_skills = 50
        rare_sample_fraction = 0.1
        popular_sample_fraction = 0.1
        scaling_factor = 800

        alg = AlgorithmDriver()
        results = []
        for seed in seeds:
            for user_sample_ratio in user_sample_ratios:
                # Load dataset
                data = self.data_provider.read_freelancer_data_obj()
                config = self.config.copy()
                alg.create_sample(config, data, num_sampled_skills,
                                  rare_sample_fraction, popular_sample_fraction,
                                  rare_threshold, popular_threshold,
                                  user_sample_ratio, seed)
                self.logger.info(
                    "Experiment for user sample ratio: {} and scaling factor: {} and seed: {} and number of elements: {}"
                    .format(user_sample_ratio, scaling_factor, seed, len(data.E)))
                self.logger.info("Scaling factor for submodular function is: {}".format(scaling_factor))

                # Total number of elements
                n = len(data.E)

                # Distorted Greedy
                total_runtime = 0
                for k in range(1, n + 1):
                    # Run algorithm
                    start = timer()
                    result = alg.run(self.config, data, "distorted_greedy",
                                     None, None, scaling_factor, num_sampled_skills,
                                     rare_sample_fraction, popular_sample_fraction,
                                     rare_threshold, popular_threshold,
                                     user_sample_ratio, seed, k)
                    end = timer()
                    print('Previous runtime:', total_runtime, 'new runtime:', end - start)
                    total_runtime += end - start
                    result['runtime'] = total_runtime
                    results.append(result)
                    self.logger.info("Algorithm: {} and k: {} and runtime: {}".format(
                        "distorted_greedy", k, total_runtime))
                self.logger.info("\n")

                # Stochastic Distorted Greedy
                total_runtime = 0
                for k in range(1, n + 1):
                    for sample_epsilon in sampling_epsilon_values_stochastic:
                        # Run algorithm
                        start = timer()
                        result = alg.run(config, data, "stochastic_distorted_greedy",
                                         sample_epsilon, None, scaling_factor, num_sampled_skills,
                                         rare_sample_fraction, popular_sample_fraction,
                                         rare_threshold, popular_threshold,
                                         user_sample_ratio, seed, k)
                        end = timer()
                        total_runtime += end - start
                        result['runtime'] = total_runtime
                        results.append(result)
                        self.logger.info("Algorithm: {} and epsilon: {} and k: {} and runtime: {}".format(
                            "stochastic_distorted_greedy", sample_epsilon, k, total_runtime))
                self.logger.info("\n")

                # Cost Scaled Greedy
                # Run algorithm that creates greedy ordering
                start = timer()
                result = alg.run(self.config, data, "cost_scaled_greedy",
                                 None, None, scaling_factor, num_sampled_skills,
                                 rare_sample_fraction, popular_sample_fraction,
                                 rare_threshold, popular_threshold,
                                 user_sample_ratio, seed, n)
                end = timer()
                result['runtime'] = end - start

                # For each individual k we find the prefix of size k and find the corresponding solution
                for k in range(1, n + 1):
                    result_k = result.copy()
                    if k < len(result['sol']):
                        sol_k = set(list(result['sol'])[:k])
                        submodular_val_k = data.submodular_func(sol_k)
                        cost_k = data.cost_func(sol_k)
                        val_k = submodular_val_k - cost_k
                        result_k['sol'] = sol_k
                        result_k['val'] = val_k
                        result_k['submodular_val'] = submodular_val_k
                        result_k['cost'] = cost_k
                    else:
                        sol_k = result['sol']
                        val_k = result['val']
                    result_k['k'] = k
                    results.append(result_k)
                    self.logger.info("Best solution: {}\nBest value: {}".format(sol_k, val_k))
                    self.logger.info("Algorithm: {} and k: {} and runtime: {}".format(
                        "cost_scaled_greedy", k, end - start))
                self.logger.info("\n")

                # Cost scaled lazy exact greedy
                # Run algorithm that creates greedy ordering
                start = timer()
                result = alg.run(self.config, data, "cost_scaled_lazy_greedy",
                                 None, None, scaling_factor, num_sampled_skills,
                                 rare_sample_fraction, popular_sample_fraction,
                                 rare_threshold, popular_threshold,
                                 user_sample_ratio, seed, n)
                end = timer()
                result['runtime'] = end - start

                # For each individual k we find the prefix of size k and find the corresponding solution
                for k in range(1, n + 1):
                    result_k = result.copy()
                    if k < len(result['sol']):
                        sol_k = set(list(result['sol'])[:k])
                        submodular_val_k = data.submodular_func(sol_k)
                        cost_k = data.cost_func(sol_k)
                        val_k = submodular_val_k - cost_k
                        result_k['sol'] = sol_k
                        result_k['val'] = val_k
                        result_k['submodular_val'] = submodular_val_k
                        result_k['cost'] = cost_k
                    else:
                        sol_k = result['sol']
                        val_k = result['val']
                    result_k['k'] = k
                    results.append(result_k)
                    self.logger.info("Best solution: {}\nBest value: {}".format(sol_k, val_k))
                    self.logger.info("Algorithm: {} and k: {} and runtime: {}".format(
                        "cost_scaled_lazy_greedy", k, end - start))
                self.logger.info("\n")

                # Scaled Single Threshold Greedy
                total_runtime = 0
                for k in range(1, n + 1):
                    for error_epsilon in error_epsilon_values_scaled_threshold:
                        # Run algorithm
                        start = timer()
                        result = alg.run(self.config, data, "scaled_single_threshold_greedy",
                                         None, error_epsilon, scaling_factor, num_sampled_skills,
                                         rare_sample_fraction, popular_sample_fraction,
                                         rare_threshold, popular_threshold,
                                         user_sample_ratio, seed, k)
                        end = timer()
                        total_runtime += end - start
                        result['runtime'] = total_runtime
                        results.append(result)
                        self.logger.info("Algorithm: {} and epsilon: {} and k: {} and runtime: {}".format(
                            "scaled_single_threshold_greedy", error_epsilon, k, total_runtime))
                self.logger.info("\n")

                # Baseline Top k
                total_runtime = 0
                for k in range(1, n + 1):
                    # Run algorithm
                    start = timer()
                    result = alg.run(self.config, data, "baseline_topk",
                                     None, None, scaling_factor, num_sampled_skills,
                                     rare_sample_fraction, popular_sample_fraction,
                                     rare_threshold, popular_threshold,
                                     user_sample_ratio, seed, k)
                    end = timer()
                    total_runtime += end - start
                    result['runtime'] = total_runtime
                    results.append(result)
                    self.logger.info("Algorithm: {} and k: {} and runtime: {}".format(
                        "baseline_topk", k, total_runtime))
                self.logger.info("\n")

        self.logger.info("Finished experiment 03")

        # Export results
        df = pd.DataFrame(results)
        # self.data_exporter.export_csv_file(df, "experiment_03_freelancer_pop01_rare01_cost_scaled.csv")
        self.logger.info("Exported experiment_03 results")
class Experiment00(object):
    """
    Experiment00 class
    """

    def __init__(self, config):
        """
        Constructor
        :param config:
        :return:
        """
        self.config = config
        self.logger = logging.getLogger("so_logger")
        self.data_provider = DataProvider(self.config)
        self.data_exporter = DataExporter(self.config)

    @staticmethod
    def run_algorithm(args):
        # Run algorithm
        alg = AlgorithmDriver()
        data = alg.run(*args)
        return data

    def set_scaling_factor(self, data):
        # Find appropriate scaling factor
        alg = SetCoverGreedy(self.config, data.submodular_func, data.E)
        sol = alg.run()
        submodular_val = data.submodular_func(sol)
        cost = data.cost_func(sol)
        scaling_factor = data.scaling_func(submodular_val, cost)

        # Update scaling factor
        data.scaling_factor = scaling_factor

    def run(self):
        """
        Run experiment
        :param:
        :return:
        """
        self.logger.info("Starting experiment 00")
        self.expt_config = self.config['experiment_configs']['experiment_00']
        popular_threshold = self.expt_config['popular_threshold']
        rare_threshold = self.expt_config['rare_threshold']

        user_sample_ratios = [0.001, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]
        seeds = [i for i in range(6, 11)]
        sampling_epsilon_values = [0.1, 0.05, 0.01, 0.005]

        num_sampled_skills = 50
        rare_sample_fraction = 0.1
        popular_sample_fraction = 0.1
        scaling_factor = 800

        alg = AlgorithmDriver()
        results = []
        for seed in seeds:
            for user_sample_ratio in user_sample_ratios:
                self.logger.info(
                    "Experiment for user sample ratio: {} and scaling factor: {} and seed: {}"
                    .format(user_sample_ratio, scaling_factor, seed))

                # Load dataset
                data = self.data_provider.read_freelancer_data_obj()
                config = self.config.copy()
                alg.create_sample(config, data, num_sampled_skills,
                                  rare_sample_fraction, popular_sample_fraction,
                                  rare_threshold, popular_threshold,
                                  user_sample_ratio, seed)

                # # Create controlled samples dataset
                # data.sample_skills_to_be_covered_controlled(num_sampled_skills, rare_sample_fraction,
                #                                             popular_sample_fraction, rare_threshold,
                #                                             popular_threshold, user_sample_ratio)

                # # Setting scaling factor of coverage as coverage(S)/cost(S) for set cover solution S
                # self.set_scaling_factor(data)

                self.logger.info("Scaling factor for submodular function is: {}".format(scaling_factor))

                # Distorted greedy - ICML
                start = timer()
                result = alg.run(config, data, "distorted_greedy",
                                 None, None, scaling_factor, num_sampled_skills,
                                 rare_sample_fraction, popular_sample_fraction,
                                 rare_threshold, popular_threshold,
                                 user_sample_ratio, seed, None)
                end = timer()
                result['runtime'] = end - start
                self.logger.info("Algorithm: {} and k: {} and runtime: {}".format(
                    "distorted_greedy", None, end - start))
                results.append(result)
                self.logger.info("\n")

                # Cost scaled greedy
                start = timer()
                result = alg.run(config, data, "cost_scaled_greedy",
                                 None, None, scaling_factor, num_sampled_skills,
                                 rare_sample_fraction, popular_sample_fraction,
                                 rare_threshold, popular_threshold,
                                 user_sample_ratio, seed, None)
                end = timer()
                result['runtime'] = end - start
                self.logger.info("Algorithm: {} and k: {} and runtime: {}".format(
                    "cost_scaled_greedy", None, end - start))
                results.append(result)
                self.logger.info("\n")

                # Cost scaled lazy exact greedy
                start = timer()
                result = alg.run(config, data, "cost_scaled_lazy_greedy",
                                 None, None, scaling_factor, num_sampled_skills,
                                 rare_sample_fraction, popular_sample_fraction,
                                 rare_threshold, popular_threshold,
                                 user_sample_ratio, seed, None)
                end = timer()
                result['runtime'] = end - start
                self.logger.info("Algorithm: {} and k: {} and runtime: {}".format(
                    "cost_scaled_lazy_greedy", None, end - start))
                results.append(result)
                self.logger.info("\n")

                # Greedy
                start = timer()
                result = alg.run(config, data, "greedy",
                                 None, None, scaling_factor, num_sampled_skills,
                                 rare_sample_fraction, popular_sample_fraction,
                                 rare_threshold, popular_threshold,
                                 user_sample_ratio, seed, None)
                end = timer()
                result['runtime'] = end - start
                self.logger.info("Algorithm: {} and k: {} and runtime: {}".format(
                    "greedy", None, end - start))
                results.append(result)
                self.logger.info("\n")

                # Unconstrained Linear
                start = timer()
                result = alg.run(config, data, "unconstrained_linear",
                                 None, None, scaling_factor, num_sampled_skills,
                                 rare_sample_fraction, popular_sample_fraction,
                                 rare_threshold, popular_threshold,
                                 user_sample_ratio, seed, None)
                end = timer()
                result['runtime'] = end - start
                self.logger.info("Algorithm: {} and k: {} and runtime: {}".format(
                    "unconstrained_linear", None, end - start))
                results.append(result)
                self.logger.info("\n")

                # Unconstrained distorted greedy
                start = timer()
                result = alg.run(config, data, "unconstrained_distorted_greedy",
                                 None, None, scaling_factor, num_sampled_skills,
                                 rare_sample_fraction, popular_sample_fraction,
                                 rare_threshold, popular_threshold,
                                 user_sample_ratio, seed, None)
                end = timer()
                result['runtime'] = end - start
                self.logger.info("Algorithm: {} and k: {} and runtime: {}".format(
                    "unconstrained_distorted_greedy", None, end - start))
                results.append(result)
                self.logger.info("\n")

                # Stochastic distorted greedy
                for sample_epsilon in sampling_epsilon_values:
                    start = timer()
                    config['algorithms']['stochastic_distorted_greedy_config']['epsilon'] = sample_epsilon
                    result = alg.run(config, data, "stochastic_distorted_greedy",
                                     sample_epsilon, None, scaling_factor, num_sampled_skills,
                                     rare_sample_fraction, popular_sample_fraction,
                                     rare_threshold, popular_threshold,
                                     user_sample_ratio, seed, None)
                    end = timer()
                    result['runtime'] = end - start
                    self.logger.info("Algorithm: {} and epsilon: {} and k: {} and runtime: {}".format(
                        "stochastic_distorted_greedy", sample_epsilon, None, end - start))
                    results.append(result)
                    self.logger.info("\n")

                # Baseline top k
                start = timer()
                result = alg.run(config, data, "baseline_topk",
                                 None, None, scaling_factor, num_sampled_skills,
                                 rare_sample_fraction, popular_sample_fraction,
                                 rare_threshold, popular_threshold,
                                 user_sample_ratio, seed, None)
                end = timer()
                result['runtime'] = end - start
                self.logger.info("Algorithm: {} and k: {} and runtime: {}".format(
                    "baseline_topk", None, end - start))
                results.append(result)

        self.logger.info("Finished experiment 00")

        # Export results
        df = pd.DataFrame(results)
        self.data_exporter.export_csv_file(df, "experiment_00_freelancer_pop01_rare01_greedy.csv")
        self.logger.info("Exported experiment_00 results")
class Experiment04(object):
    """
    Experiment04 class
    """

    def __init__(self, config):
        """
        Constructor
        :param config:
        :return:
        """
        self.config = config
        self.logger = logging.getLogger("so_logger")
        self.data_provider = DataProvider(self.config)
        self.data_exporter = DataExporter(self.config)

    @staticmethod
    def run_algorithm(args):
        # Run algorithm
        alg = AlgorithmDriver()
        data = alg.run(*args)
        return data

    def set_scaling_factor(self, data):
        # Find appropriate scaling factor
        alg = SetCoverGreedy(self.config, data.submodular_func, data.E)
        sol = alg.run()
        submodular_val = data.submodular_func(sol)
        cost = data.cost_func(sol)
        scaling_factor = data.scaling_func(submodular_val, cost)

        # Update scaling factor
        data.scaling_factor = scaling_factor

    def run(self):
        """
        Run experiment
        :param:
        :return:
        """
        self.logger.info("Starting experiment 04")
        self.expt_config = self.config['experiment_configs']['experiment_04']
        popular_threshold = self.expt_config['popular_threshold']
        rare_threshold = self.expt_config['rare_threshold']
        num_of_partitions = self.expt_config['num_of_partitions']
        partition_type = self.expt_config['partition_type']
        cardinality_constraint = self.expt_config['cardinality_constraint']

        user_sample_ratios = [1]
        seeds = [i for i in range(6, 10)]
        # Note: the config values read above are overridden by the sweeps below
        cardinality_constraints = [i for i in range(1, 11)]
        num_of_partitions = [i for i in range(1, 6)]

        num_sampled_skills = 50
        rare_sample_fraction = 0.1
        popular_sample_fraction = 0.8
        scaling_factor = 800

        alg = AlgorithmDriver()
        results = []
        for seed in seeds:
            for user_sample_ratio in user_sample_ratios:
                for cardinality_constraint in cardinality_constraints:
                    for num_of_partition in num_of_partitions:
                        self.logger.info(
                            "Experiment for user sample ratio: {} and scaling factor: {} and seed: {} "
                            "and cardinality constraint:{} and num of partitions:{} "
                            .format(user_sample_ratio, scaling_factor, seed,
                                    cardinality_constraint, num_of_partition))

                        # Load dataset
                        data = self.data_provider.read_guru_data_obj()
                        config = self.config.copy()

                        # Creating the ground set of users
                        alg.create_sample(config, data, num_sampled_skills,
                                          rare_sample_fraction, popular_sample_fraction,
                                          rare_threshold, popular_threshold,
                                          user_sample_ratio, seed)

                        # Assigning users to partitions uniformly at random
                        alg.create_partitions(data, num_of_partition,
                                              partition_type, cardinality_constraint)

                        self.logger.info("Scaling factor for submodular function is: {}".format(scaling_factor))

                        # Partition matroid greedy
                        start = timer()
                        result = alg.run(config, data, "partition_matroid_greedy",
                                         None, None, scaling_factor, num_sampled_skills,
                                         rare_sample_fraction, popular_sample_fraction,
                                         rare_threshold, popular_threshold,
                                         user_sample_ratio, seed, None)
                        end = timer()
                        result['runtime'] = end - start
                        result['cardinality_constraint'] = cardinality_constraint
                        result['num_of_partitions'] = num_of_partition
                        self.logger.info("Algorithm: {} and k: {} and runtime: {}".format(
                            "partition_matroid_greedy", None, end - start))
                        results.append(result)
                        self.logger.info("\n")

                        # Cost scaled partition matroid greedy
                        start = timer()
                        result = alg.run(config, data, "cost_scaled_partition_matroid_greedy",
                                         None, None, scaling_factor, num_sampled_skills,
                                         rare_sample_fraction, popular_sample_fraction,
                                         rare_threshold, popular_threshold,
                                         user_sample_ratio, seed, None)
                        end = timer()
                        result['runtime'] = end - start
                        result['cardinality_constraint'] = cardinality_constraint
                        result['num_of_partitions'] = num_of_partition
                        self.logger.info("Algorithm: {} and k: {} and runtime: {}".format(
                            "cost_scaled_partition_matroid_greedy", None, end - start))
                        results.append(result)
                        self.logger.info("\n")

                        # Cost scaled partition matroid lazy exact greedy
                        start = timer()
                        result = alg.run(config, data, "cost_scaled_partition_matroid_lazy_greedy",
                                         None, None, scaling_factor, num_sampled_skills,
                                         rare_sample_fraction, popular_sample_fraction,
                                         rare_threshold, popular_threshold,
                                         user_sample_ratio, seed, None)
                        end = timer()
                        result['runtime'] = end - start
                        result['cardinality_constraint'] = cardinality_constraint
                        result['num_of_partitions'] = num_of_partition
                        self.logger.info("Algorithm: {} and k: {} and runtime: {}".format(
                            "cost_scaled_partition_matroid_lazy_greedy", None, end - start))
                        results.append(result)
                        self.logger.info("\n")

                        # Baseline Top k
                        start = timer()
                        result = alg.run(config, data, "baseline_topk_matroid",
                                         None, None, scaling_factor, num_sampled_skills,
                                         rare_sample_fraction, popular_sample_fraction,
                                         rare_threshold, popular_threshold,
                                         user_sample_ratio, seed, None)
                        end = timer()
                        result['runtime'] = end - start
                        result['cardinality_constraint'] = cardinality_constraint
                        result['num_of_partitions'] = num_of_partition
                        self.logger.info("Algorithm: {} and k: {} and runtime: {}".format(
                            "baseline_topk_matroid", None, end - start))
                        results.append(result)
                        self.logger.info("\n")

        self.logger.info("Finished experiment 04")

        # Export results
        df = pd.DataFrame(results)
        self.data_exporter.export_csv_file(df, "experiment_04_guru_salary_pop08_rare01.csv")
        self.logger.info("Exported experiment 04 results")
def __init__(self, config):
    self.config = config
    self.logger = logging.getLogger("baseline_logger")
    self.data_provider = DataProvider(self.config)
    self.data_exporter = DataExporter(self.config)