def run(self):
    """
    Run experiment 02.

    For every seed and user sample ratio the data object is re-read through
    ``self.data_provider.read_freelancer_data_obj()`` and re-sampled with
    ``AlgorithmDriver.create_sample``. Each algorithm is then run once per
    value of ``k`` (and once per epsilon for the two algorithms that take
    one). The wall-clock time of each run is stored under
    ``result['runtime']`` and all results are exported as a single CSV file.

    :param:
    :return: None
    """
    self.logger.info("Starting experiment 02")

    self.expt_config = self.config['experiment_configs']['experiment_02']
    popular_threshold = self.expt_config['popular_threshold']
    rare_threshold = self.expt_config['rare_threshold']

    user_sample_ratios = [0.4, 1]
    seeds = list(range(6, 10))
    ks = [1, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50]

    sampling_epsilon_values_stochastic = [0.1, 0.05, 0.01, 0.005]
    error_epsilon_values_scaled_threshold = [0.2, 0.15, 0.1, 0.05]

    num_sampled_skills = 50
    rare_sample_fraction = 0.1
    popular_sample_fraction = 0.1
    scaling_factor = 800

    # (algorithm, epsilon kind, epsilon values) in execution order.
    # The kind decides which positional argument of ``alg.run`` receives the
    # epsilon: "sample" -> sampling epsilon, "error" -> error epsilon,
    # None -> the algorithm takes no epsilon at all.
    schedule = [
        ("distorted_greedy", None, [None]),
        ("stochastic_distorted_greedy", "sample",
         sampling_epsilon_values_stochastic),
        ("cost_scaled_greedy", None, [None]),
        ("cost_scaled_lazy_greedy", None, [None]),
        ("greedy", None, [None]),
        ("scaled_single_threshold_greedy", "error",
         error_epsilon_values_scaled_threshold),
        ("baseline_topk", None, [None]),
    ]

    alg = AlgorithmDriver()
    results = []

    for seed in seeds:
        for user_sample_ratio in user_sample_ratios:
            self.logger.info(
                "Experiment for user sample ratio: {} and scaling factor: {} and seed: {}"
                .format(user_sample_ratio, scaling_factor, seed))

            # Load dataset and draw the sample for this (seed, ratio) pair
            data = self.data_provider.read_freelancer_data_obj()
            config = self.config.copy()
            alg.create_sample(config, data, num_sampled_skills,
                              rare_sample_fraction, popular_sample_fraction,
                              rare_threshold, popular_threshold,
                              user_sample_ratio, seed)

            self.logger.info(
                "Scaling factor for submodular function is: {}".format(
                    scaling_factor))

            # Every run receives the per-sample ``config`` copy. The previous
            # code mixed ``self.config`` and ``config`` between algorithms;
            # presumably both carry the same entries (shallow copy) - verify
            # if create_sample ever rebinds top-level config keys.
            for algorithm, eps_kind, eps_values in schedule:
                for k in ks:
                    for epsilon in eps_values:
                        sample_epsilon = epsilon if eps_kind == "sample" else None
                        error_epsilon = epsilon if eps_kind == "error" else None

                        # Run algorithm
                        start = timer()
                        result = alg.run(
                            config, data, algorithm, sample_epsilon,
                            error_epsilon, scaling_factor,
                            num_sampled_skills, rare_sample_fraction,
                            popular_sample_fraction, rare_threshold,
                            popular_threshold, user_sample_ratio, seed, k)
                        end = timer()

                        result['runtime'] = end - start
                        results.append(result)

                        if eps_kind is None:
                            self.logger.info(
                                "Algorithm: {} and k: {} and runtime: {}"
                                .format(algorithm, k, end - start))
                        else:
                            self.logger.info(
                                "Algorithm: {} and epsilon: {} and k: {} and runtime: {}"
                                .format(algorithm, epsilon, k, end - start))

                # Blank separator between algorithms in the log
                self.logger.info("\n")

    self.logger.info("Finished experiment 02")

    # Export results
    df = pd.DataFrame(results)
    self.data_exporter.export_csv_file(
        df, "experiment_02_freelancer_pop01_rare01_greedy.csv")
    self.logger.info("Exported experiment_02 results")
def run_algorithm(args):
    """
    Run one algorithm through a freshly created ``AlgorithmDriver``.

    :param args: iterable of positional arguments, unpacked into
        ``AlgorithmDriver.run``
    :return: whatever ``AlgorithmDriver.run`` returns for those arguments
    """
    return AlgorithmDriver().run(*args)
def run(self):
    """
    Run experiment 00.

    For every seed and user sample ratio the data object is re-read through
    ``self.data_provider.read_freelancer_data_obj()`` and re-sampled with
    ``AlgorithmDriver.create_sample``. Each algorithm is then run once with
    ``k=None`` (the stochastic variant once per sampling epsilon). The
    wall-clock time of each run is stored under ``result['runtime']`` and
    all results are exported as a single CSV file.

    :param:
    :return: None
    """
    # Local import: the file's import block is outside this block.
    import copy

    self.logger.info("Starting experiment 00")

    self.expt_config = self.config['experiment_configs']['experiment_00']
    popular_threshold = self.expt_config['popular_threshold']
    rare_threshold = self.expt_config['rare_threshold']

    user_sample_ratios = [
        0.001, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1
    ]
    seeds = list(range(6, 11))

    sampling_epsilon_values = [0.1, 0.05, 0.01, 0.005]

    num_sampled_skills = 50
    rare_sample_fraction = 0.1
    popular_sample_fraction = 0.1
    scaling_factor = 800

    # (algorithm, sampling epsilons) in execution order; None marks an
    # algorithm that is run exactly once without an epsilon.
    schedule = [
        ("distorted_greedy", None),
        ("cost_scaled_greedy", None),
        ("cost_scaled_lazy_greedy", None),
        ("greedy", None),
        ("unconstrained_linear", None),
        ("unconstrained_distorted_greedy", None),
        ("stochastic_distorted_greedy", sampling_epsilon_values),
        ("baseline_topk", None),
    ]
    last_position = len(schedule) - 1

    alg = AlgorithmDriver()
    results = []

    for seed in seeds:
        for user_sample_ratio in user_sample_ratios:
            self.logger.info(
                "Experiment for user sample ratio: {} and scaling factor: {} and seed: {}"
                .format(user_sample_ratio, scaling_factor, seed))

            # Load dataset
            data = self.data_provider.read_freelancer_data_obj()

            # Deep copy: the epsilon assignment below writes into a nested
            # dict. With the previous shallow ``.copy()`` that nested dict was
            # shared with ``self.config``, so the experiment silently
            # modified the object-level configuration.
            # NOTE(review): assumes self.config is a plain nested dict of
            # copyable values - confirm.
            config = copy.deepcopy(self.config)

            alg.create_sample(config, data, num_sampled_skills,
                              rare_sample_fraction, popular_sample_fraction,
                              rare_threshold, popular_threshold,
                              user_sample_ratio, seed)

            self.logger.info(
                "Scaling factor for submodular function is: {}".format(
                    scaling_factor))

            for position, (algorithm, epsilons) in enumerate(schedule):
                takes_epsilon = epsilons is not None
                for sample_epsilon in (epsilons if takes_epsilon else [None]):
                    if takes_epsilon:
                        # Only the stochastic variant has epsilons; it also
                        # gets the value through its config entry.
                        config['algorithms'][
                            'stochastic_distorted_greedy_config'][
                                'epsilon'] = sample_epsilon

                    start = timer()
                    result = alg.run(
                        config, data, algorithm, sample_epsilon, None,
                        scaling_factor, num_sampled_skills,
                        rare_sample_fraction, popular_sample_fraction,
                        rare_threshold, popular_threshold,
                        user_sample_ratio, seed, None)
                    end = timer()

                    result['runtime'] = end - start
                    if takes_epsilon:
                        self.logger.info(
                            "Algorithm: {} and epsilon: {} and k: {} and runtime: {}"
                            .format(algorithm, sample_epsilon, None,
                                    end - start))
                    else:
                        self.logger.info(
                            "Algorithm: {} and k: {} and runtime: {}".format(
                                algorithm, None, end - start))
                    results.append(result)

                # Blank separator between algorithms (none after the last)
                if position != last_position:
                    self.logger.info("\n")

    self.logger.info("Finished experiment 00")

    # Export results
    df = pd.DataFrame(results)
    self.data_exporter.export_csv_file(
        df, "experiment_00_freelancer_pop01_rare01_greedy.csv")
    self.logger.info("Exported experiment_00 results")
def run(self):
    """
    Run experiment 03.

    For every seed and user sample ratio the data object is re-read through
    ``self.data_provider.read_freelancer_data_obj()`` and re-sampled with
    ``AlgorithmDriver.create_sample``; ``n = len(data.E)`` is then used as
    the largest ``k``.

    Two recording schemes are used:

    * distorted greedy, stochastic distorted greedy, scaled single threshold
      greedy and baseline top-k are re-run for every ``k`` in ``1..n`` and
      the stored ``'runtime'`` is the running total of those runs;
    * cost scaled greedy and its lazy variant are run once with ``k=n`` and
      one result per ``k`` is derived from the first ``k`` items of the
      returned solution.

    :param:
    :return: None
    """
    self.logger.info("Starting experiment 03")

    self.expt_config = self.config['experiment_configs']['experiment_03']
    popular_threshold = self.expt_config['popular_threshold']
    rare_threshold = self.expt_config['rare_threshold']

    user_sample_ratios = [0.05, 0.1]
    seeds = list(range(6, 10))

    sampling_epsilon_values_stochastic = [0.1, 0.05, 0.01, 0.005]
    error_epsilon_values_scaled_threshold = [0.2, 0.15, 0.1, 0.05]

    num_sampled_skills = 50
    rare_sample_fraction = 0.1
    popular_sample_fraction = 0.1
    scaling_factor = 800

    alg = AlgorithmDriver()
    results = []

    def timed_run(sample, algorithm, sample_epsilon, error_epsilon, k):
        """Run one algorithm on ``sample``; return (result, elapsed time)."""
        config, data, user_sample_ratio, seed = sample
        start = timer()
        result = alg.run(config, data, algorithm, sample_epsilon,
                         error_epsilon, scaling_factor, num_sampled_skills,
                         rare_sample_fraction, popular_sample_fraction,
                         rare_threshold, popular_threshold,
                         user_sample_ratio, seed, k)
        end = timer()
        return result, end - start

    def sweep_k(sample, algorithm, eps_kind, eps_values, n):
        """Re-run ``algorithm`` for every k in 1..n with cumulative runtime."""
        # NOTE(review): the running total is shared by all epsilon values of
        # an algorithm, so a row's runtime also contains the runs made with
        # the other epsilons - confirm this is the intended measurement.
        total_runtime = 0
        for k in range(1, n + 1):
            for epsilon in eps_values:
                sample_epsilon = epsilon if eps_kind == "sample" else None
                error_epsilon = epsilon if eps_kind == "error" else None

                result, elapsed = timed_run(sample, algorithm,
                                            sample_epsilon, error_epsilon, k)
                # Per-run timing goes through the logger instead of print()
                self.logger.debug(
                    "Previous runtime: {} new runtime: {}".format(
                        total_runtime, elapsed))
                total_runtime += elapsed

                result['runtime'] = total_runtime
                results.append(result)

                if eps_kind is None:
                    self.logger.info(
                        "Algorithm: {} and k: {} and runtime: {}".format(
                            algorithm, k, total_runtime))
                else:
                    self.logger.info(
                        "Algorithm: {} and epsilon: {} and k: {} and runtime: {}"
                        .format(algorithm, epsilon, k, total_runtime))
        self.logger.info("\n")

    def record_prefixes(sample, algorithm, n):
        """Run ``algorithm`` once with k=n and record one result per k."""
        data = sample[1]
        result, elapsed = timed_run(sample, algorithm, None, None, n)
        result['runtime'] = elapsed

        # NOTE(review): if result['sol'] is an unordered set, its first k
        # items are not necessarily the first k picks of the algorithm -
        # confirm the solution container keeps selection order.
        ordering = list(result['sol'])

        for k in range(1, n + 1):
            result_k = result.copy()
            if k < len(ordering):
                # Re-evaluate the objective on the first k items
                sol_k = set(ordering[:k])
                submodular_val_k = data.submodular_func(sol_k)
                cost_k = data.cost_func(sol_k)
                val_k = submodular_val_k - cost_k
                result_k['sol'] = sol_k
                result_k['val'] = val_k
                result_k['submodular_val'] = submodular_val_k
                result_k['cost'] = cost_k
            else:
                # The solution has at most k items: reuse it unchanged
                sol_k = result['sol']
                val_k = result['val']
            result_k['k'] = k
            results.append(result_k)

            self.logger.info(
                "Best solution: {}\nBest value: {}".format(sol_k, val_k))
            self.logger.info(
                "Algorithm: {} and k: {} and runtime: {}".format(
                    algorithm, k, elapsed))
        self.logger.info("\n")

    for seed in seeds:
        for user_sample_ratio in user_sample_ratios:
            # Load dataset
            data = self.data_provider.read_freelancer_data_obj()
            config = self.config.copy()
            alg.create_sample(config, data, num_sampled_skills,
                              rare_sample_fraction, popular_sample_fraction,
                              rare_threshold, popular_threshold,
                              user_sample_ratio, seed)

            self.logger.info(
                "Experiment for user sample ratio: {} and scaling factor: {} and seed: {} and number of elements: {}"
                .format(user_sample_ratio, scaling_factor, seed,
                        len(data.E)))
            self.logger.info(
                "Scaling factor for submodular function is: {}".format(
                    scaling_factor))

            # Total number of elements
            n = len(data.E)

            # Every run receives the per-sample ``config`` copy. The previous
            # code mixed ``self.config`` and ``config`` between algorithms;
            # presumably both carry the same entries (shallow copy) - verify
            # if create_sample ever rebinds top-level config keys.
            sample = (config, data, user_sample_ratio, seed)

            sweep_k(sample, "distorted_greedy", None, [None], n)
            sweep_k(sample, "stochastic_distorted_greedy", "sample",
                    sampling_epsilon_values_stochastic, n)
            record_prefixes(sample, "cost_scaled_greedy", n)
            record_prefixes(sample, "cost_scaled_lazy_greedy", n)
            sweep_k(sample, "scaled_single_threshold_greedy", "error",
                    error_epsilon_values_scaled_threshold, n)
            sweep_k(sample, "baseline_topk", None, [None], n)

    self.logger.info("Finished experiment 03")

    # Export results
    df = pd.DataFrame(results)
    # NOTE(review): the export is disabled, yet the message below still
    # reports an export - re-enable the call or reword the message.
    # self.data_exporter.export_csv_file(df, "experiment_03_freelancer_pop01_rare01_cost_scaled.csv")
    self.logger.info("Exported experiment_03 results")
def run(self):
    """
    Run experiment 04.

    For every combination of seed, user sample ratio, cardinality constraint
    and number of partitions, the data object is re-read through
    ``self.data_provider.read_guru_data_obj()``, re-sampled with
    ``AlgorithmDriver.create_sample`` and partitioned with
    ``AlgorithmDriver.create_partitions``. Each of the four matroid
    algorithms is then run once. Every result is tagged with its runtime,
    cardinality constraint and number of partitions, and all results are
    exported as a single CSV file.

    The sweeps over cardinality constraints and partition counts are
    hard-coded below; only ``popular_threshold``, ``rare_threshold`` and
    ``partition_type`` are read from the experiment configuration.

    :param:
    :return: None
    """
    self.logger.info("Starting experiment 04")

    self.expt_config = self.config['experiment_configs']['experiment_04']
    popular_threshold = self.expt_config['popular_threshold']
    rare_threshold = self.expt_config['rare_threshold']
    partition_type = self.expt_config['partition_type']
    # The config entries 'num_of_partitions' and 'cardinality_constraint'
    # are no longer read: the values were overwritten by the hard-coded
    # sweeps before they could be used.

    user_sample_ratios = [1]
    seeds = list(range(6, 10))
    cardinality_constraints = list(range(1, 11))
    num_of_partitions = list(range(1, 6))

    num_sampled_skills = 50
    rare_sample_fraction = 0.1
    popular_sample_fraction = 0.8
    scaling_factor = 800

    # Algorithms in execution order
    algorithms = (
        "partition_matroid_greedy",
        "cost_scaled_partition_matroid_greedy",
        "cost_scaled_partition_matroid_lazy_greedy",
        "baseline_topk_matroid",
    )

    alg = AlgorithmDriver()
    results = []

    for seed in seeds:
        for user_sample_ratio in user_sample_ratios:
            for cardinality_constraint in cardinality_constraints:
                for num_of_partition in num_of_partitions:
                    self.logger.info(
                        "Experiment for user sample ratio: {} and scaling factor: {} and seed: {} and cardinality constraint:{} and num of partitions:{} "
                        .format(user_sample_ratio, scaling_factor, seed,
                                cardinality_constraint, num_of_partition))

                    # Load dataset
                    data = self.data_provider.read_guru_data_obj()
                    config = self.config.copy()

                    # Creating the ground set of users
                    alg.create_sample(config, data, num_sampled_skills,
                                      rare_sample_fraction,
                                      popular_sample_fraction,
                                      rare_threshold, popular_threshold,
                                      user_sample_ratio, seed)

                    # Assigning users to partitions
                    alg.create_partitions(data, num_of_partition,
                                          partition_type,
                                          cardinality_constraint)

                    self.logger.info(
                        "Scaling factor for submodular function is: {}".
                        format(scaling_factor))

                    for algorithm in algorithms:
                        start = timer()
                        result = alg.run(
                            config, data, algorithm, None, None,
                            scaling_factor, num_sampled_skills,
                            rare_sample_fraction, popular_sample_fraction,
                            rare_threshold, popular_threshold,
                            user_sample_ratio, seed, None)
                        end = timer()

                        # Key order is kept as before; it presumably sets the
                        # column order of the exported frame.
                        result['runtime'] = end - start
                        result['cardinality_constraint'] = cardinality_constraint
                        result['num_of_partitions'] = num_of_partition

                        self.logger.info(
                            "Algorithm: {} and k: {} and runtime: {}".format(
                                algorithm, None, end - start))
                        results.append(result)

                        self.logger.info("\n")

    self.logger.info("Finished experiment 04")

    # Export results
    df = pd.DataFrame(results)
    self.data_exporter.export_csv_file(
        df, "experiment_04_guru_salary_pop08_rare01.csv")
    self.logger.info("Exported experiment 04 results")