    def __init__(self, config):
        """
        Constructor

        :param config:
        :return:
        """
        self.config = config
        self.logger = logging.getLogger("so_logger")
        self.data_provider = DataProvider(self.config)
        self.data_exporter = DataExporter(self.config)

    def run(self):
        """
        Execute the job

        :return:
        """
        self.logger.info("Starting job: GuruDataProcessor\n")
        data_provider = DataProvider(self.config)
        data_exporter = DataExporter(self.config)

        # Read guru data
        df = data_provider.read_guru_user_data()
        df = df[[3, 4]]  # Salary and skills columns
        df.columns = ['cost', 'skills']
        df = df[(df.cost != "$0") & (df.skills != "UNKNOWN")]
        df = df.reset_index(drop=True)
        df = df.assign(user_id=df.index.values)
        df = df.assign(skills=df.apply(lambda x: x['skills'][:-1].split(','), axis=1))

        # Convert cost to integers
        user_df = df.assign(cost=df.apply(lambda x: int(x['cost'][1:]), axis=1))

        # Read skills data
        df = data_provider.read_guru_skill_data()
        df = df[[1]]
        df.columns = ['skill']
        skill_df = df.assign(skill_id=df.index.values)

        # Create multilabel binarizer
        mlb = MultiLabelBinarizer(classes=skill_df.skill.values)

        # One hot encoding of user skills
        skills = mlb.fit_transform(user_df['skills'])

        # Create dataset
        users = user_df.to_dict('records')
        for i in range(len(users)):
            users[i]['skills_array'] = skills[i]

        # Export csv files
        data_exporter.export_csv_file(user_df, "guru/guru_user_df.csv")
        data_exporter.export_csv_file(skill_df, "guru/guru_skill_df.csv")

        # Scaling factor for submodular function
        scaling_factor = 1

        # Create and export data object to be used in experiments
        # containing all methods related to guru data
        guru = GuruData(self.config, user_df, skill_df, users, scaling_factor)
        data_exporter.export_dill_file(guru, "guru/guru_data.dill")

        self.logger.info("Finished job: GuruDataProcessor")

# Example #3
    def __init__(self, config_):
        """
        Constructor

        :param config_:
        :return:
        """
        self.config = config_
        self.logger = logging.getLogger("cuda_logger")
        data_provider = DataProvider(self.config)
        filename = self.config['city_state_creator'].get(
            'filename', 'city_states.dill')
        self.city_states = data_provider.read_city_states(filename)
        self.reg_models = data_provider.read_regression_models()

    def __init__(self, config_, year, month, weekday):
        """
        Constructor

        :param config_:
        :param year: year of the trip data
        :param month: month of the trip data
        :param weekday: weekday index used to filter trips
        :return:
        """
        self.config = config_
        self.year = year
        self.month = month
        self.weekday = weekday
        self.logger = logging.getLogger("cuda_logger")
        data_provider = DataProvider(self.config)
        hex_attr_df = data_provider.read_hex_bin_attributes()
        hex_attr_df['center'] = hex_attr_df.apply(self.calculate_bin_center,
                                                  axis=1)
        self.hex_attr_df = hex_attr_df

    def get_trips_data(self):
        """
        Gets data for the appropriate weekday and month

        :return df: trips dataframe
        """
        # Create SQL query
        query = """
                SELECT
                    *,
                    HOUR(tpep_pickup_datetime) AS pickup_hour
                FROM
                    `yt-{0}-{1}`
                WHERE
                    WEEKDAY(tpep_pickup_datetime) LIKE {2}
                    AND pickup_bin IS NOT NULL
                    AND dropoff_bin IS NOT NULL
                    AND pickup_bin != dropoff_bin;
                """

        # Read query output and select required columns
        df = DataProvider.read_sql_query(
            query.format(self.month, self.year, self.weekday))
        cols = [
            'tpep_pickup_datetime', 'tpep_dropoff_datetime', 'trip_distance',
            'fare_amount', 'duration_seconds', 'pickup_bin', 'dropoff_bin',
            'pickup_hour'
        ]
        df = df[cols]

        return df
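
# For example, query.format(3, 2015, 2) reads from table `yt-3-2015` and
# keeps Wednesday trips (MySQL's WEEKDAY() is 0-indexed from Monday).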

# Example #6
    def __init__(self, config_):
        """
        Constructor

        :param config_:
        :return:
        """
        self.config = config_
        self.logger = logging.getLogger("cuda_logger")
        self.start_time = self.config["city_state_creator"]["start_time"]
        self.end_time = self.config["city_state_creator"]["end_time"]
        self.time_slice_duration = self.config["city_state_creator"][
            "time_slice_duration"]
        self.time_unit_duration = self.config["city_state_creator"][
            "time_unit_duration"]
        data_provider = DataProvider(self.config)
        hex_attr_df = data_provider.read_hex_bin_attributes()
        hex_dist_df = data_provider.read_hex_bin_distances()
        self.hex_bins = hex_attr_df['hex_id'].values
        self.hex_dist = hex_dist_df[[
            'pickup_bin', 'dropoff_bin', 'straight_line_distance'
        ]]

# Example #7
    def __init__(self, config_):
        """
        Constructor
        :param config_:
        :return:
        """
        self.config = config_
        self.logger = logging.getLogger("gym_logger")
        data_provider = DataProvider(self.config)

        # City state parameters
        self.city_states = data_provider.read_city_states()
        self.hex_attr_df = data_provider.read_hex_bin_attributes()
        self.hex_bins = self.hex_attr_df['hex_id']

        self.T = len(self.city_states)  # Number of time steps
        self.S = len(self.hex_bins)  # Number of hex bins

        # Environment parameters
        self.num_drivers = self.config['env_parameters']['num_drivers']
        self.distribution = self.config['env_parameters'][
            'driver_distribution']
        self.next_free_timestep = np.zeros(
            self.num_drivers)  # Next free timestep for each driver
        self.total_driver_earnings = np.zeros(
            self.num_drivers)  # Total earnings for each driver

        # Environment action and observation space
        actions = [7] * self.S  # 7 possible actions per hex bin
        drivers = [self.num_drivers] * self.S
        self.action_space = spaces.MultiDiscrete(actions)
        # self.observation_space = spaces.Tuple((
        #     # spaces.Discrete(self.T),  # Time step
        #     spaces.MultiDiscrete(drivers)  # Driver distribution
        # ))
        self.observation_space = spaces.MultiDiscrete(drivers)

        self.reset()
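
# A minimal sketch of the spaces defined above, assuming the standard gym
# API (S and num_drivers here are illustrative values, not the real config):
from gym import spaces

S, num_drivers = 4, 10
action_space = spaces.MultiDiscrete([7] * S)                 # 7 actions per hex bin
observation_space = spaces.MultiDiscrete([num_drivers] * S)  # drivers per hex bin
print(action_space.sample())   # e.g. [3 0 6 2], one action id per bin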
    def run(self):
        """
        This method executes the job

        :return:
        """
        self.logger.info("Starting job: NeighborhoodDataExportJob\n")
        data_provider = DataProvider(self.config)
        data_exporter = DataExporter(self.config)
        hex_attr_df = data_provider.read_hex_bin_attributes()
        hex_bins = hex_attr_df['hex_id'].values

        data = {}
        for r in range(self.radius + 1):
            data[r] = {}
            for hex_bin in hex_bins:
                neighbors = hex_neighborhood(hex_bin, hex_attr_df, r)
                one_hot_encoding_vector = np.zeros(len(hex_bins))
                np.put(one_hot_encoding_vector, neighbors, 1)
                data[r][hex_bin] = one_hot_encoding_vector

        data_exporter.export_neighborhood_data(data)
        self.logger.info("Finished job: NeighborhoodDataExportJob")
class BaselineDriver(object):
    def __init__(self, config):
        self.config = config
        self.logger = logging.getLogger("baseline_logger")
        self.data_provider = DataProvider(self.config)
        self.data_exporter = DataExporter(self.config)

    @staticmethod
    def run_baseline(baseline_config):
        baseline_name = baseline_config['name']
        baseline_count = baseline_config['count']
        config = baseline_config['config']
        city_states = baseline_config['city_states']
        episode_rewards = []
        # Dispatch on the baseline name; the three agents share one interface
        baseline_classes = {'cDQN': cDQN, 'cA2C': cA2C, 'A2C': A2C}
        if baseline_name in baseline_classes:
            baseline = baseline_classes[baseline_name](config)
            rewards = baseline.run(city_states)
            for episode_id, earnings in enumerate(rewards):
                episode_rewards.append({
                    'agent': baseline_name,
                    'episode': episode_id,
                    'run': baseline_count,
                    'earnings': earnings
                })
        return episode_rewards

    def run(self):
        self.logger.info("Starting baselines")
        city_states = self.data_provider.read_city_states()
        baseline_list = self.config['baselines']['baseline_list']

        # Create a pool of processes
        num_processes = mp.cpu_count()
        self.logger.info("Processes: {}".format(num_processes))
        pool = ProcessPool(nodes=num_processes)

        configs = []
        for count in range(10):
            for name in baseline_list:
                configs.append({
                    'name': name,
                    'count': count,
                    'config': self.config,
                    'city_states': city_states
                })

        results = pool.amap(self.run_baseline, configs).get()
        pool.close()
        pool.join()
        pool.clear()

        episode_rewards = []
        for result in results:
            episode_rewards += result

        self.data_exporter.export_baseline_data(episode_rewards)
        self.logger.info("Finished baselines")

# Example #10
    def run(self):
        """
        Creates and runs a testing episode

        :return:
        """
        data_provider = DataProvider(self.config)
        hex_attr_df = data_provider.read_hex_bin_attributes()
        hex_distance_df = data_provider.read_hex_bin_distances()
        city_states = data_provider.read_city_states(
            self.test_parameters['city_states_filename'])
        model = data_provider.read_model(
            self.test_parameters['model_filename'])
        neighborhood = data_provider.read_neighborhood_data()
        popular_bins = data_provider.read_popular_hex_bins()

        q_ind = model['q_ind']
        r_table = model['r_table']
        xi_matrix = model['xi_matrix']

        episode_id = 0

        # Create episode
        ind_exploration_factor = 0.0

        episode = Episode(self.config, episode_id, ind_exploration_factor,
                          hex_attr_df, hex_distance_df, city_states,
                          neighborhood, popular_bins, q_ind, r_table,
                          xi_matrix, True)

        # Run episode
        tables = episode.run()
        q_ind = tables['q_ind']
        r_table = tables['r_table']
        xi_matrix = tables['xi_matrix']
        episode_tracker = tables['episode_tracker']

        self.testing_tracker.update_RL_tracker(
            0, episode_tracker.gross_earnings,
            episode_tracker.successful_waits,
            episode_tracker.unsuccessful_waits, episode_tracker.unmet_demand,
            episode_tracker.relocation_rides, episode_tracker.DET,
            episode_tracker.DPRT, episode_tracker.DWT, episode_tracker.DRT,
            episode_tracker.DCT)

        self.logger.info("""
                         Expt: {} Earnings: {}
                         Model: {}
                         Test day: {}
                         Num drivers: {}
                         Pax rides: {} Relocation rides: {} Unmet demand: {}
                         """.format(
            self.expt_name, episode_tracker.gross_earnings,
            self.test_parameters['model_filename'],
            self.test_parameters['city_states_filename'],
            self.config['RL_parameters']['num_drivers'],
            episode_tracker.successful_waits, episode_tracker.relocation_rides,
            episode_tracker.unmet_demand))
        self.logger.info("----------------------------------")

        return self.testing_tracker
class Experiment02(object):
    """
    Experiment02 class
    """

    def __init__(self, config):
        """
        Constructor

        :param config:
        :return:
        """
        self.config = config
        self.logger = logging.getLogger("so_logger")
        self.data_provider = DataProvider(self.config)
        self.data_exporter = DataExporter(self.config)

    @staticmethod
    def run_algorithm(args):
        # Run algorithm
        alg = AlgorithmDriver()
        data = alg.run(*args)
        return data

    def set_scaling_factor(self, data):

        # Find appropriate scaling factor
        alg = SetCoverGreedy(self.config, data.submodular_func, data.E)
        sol = alg.run()
        submodular_val = data.submodular_func(sol)
        cost = data.cost_func(sol)
        scaling_factor = data.scaling_func(submodular_val, cost)

        # Update scaling factor
        data.scaling_factor = scaling_factor

    def run(self):
        """
        Run experiment

        :return:
        """
        self.logger.info("Starting experiment 02")

        self.expt_config = self.config['experiment_configs']['experiment_02']
        popular_threshold = self.expt_config['popular_threshold']
        rare_threshold = self.expt_config['rare_threshold']

        user_sample_ratios = [0.4, 1]
        seeds = [i for i in range(6, 10)]
        ks = [1, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50]

        sampling_epsilon_values_stochastic = [0.1, 0.05, 0.01, 0.005]
        error_epsilon_values_scaled_threshold = [0.2, 0.15, 0.1, 0.05]

        num_sampled_skills = 50
        rare_sample_fraction = 0.1
        popular_sample_fraction = 0.1
        scaling_factor = 800

        alg = AlgorithmDriver()
        results = []
        for seed in seeds:
            for user_sample_ratio in user_sample_ratios:
                self.logger.info("Experiment for user sample ratio: {} and scaling factor: {} and seed: {}".format(user_sample_ratio,scaling_factor,seed))

                # Load dataset
                data = self.data_provider.read_freelancer_data_obj()
                config = self.config.copy()
                alg.create_sample(config, data, num_sampled_skills,
                                  rare_sample_fraction, popular_sample_fraction,
                                  rare_threshold, popular_threshold,
                                  user_sample_ratio, seed)

                self.logger.info("Scaling factor for submodular function is: {}".format(scaling_factor))

                # Distorted Greedy
                for k in ks:
                    # Run algorithm
                    start = timer()
                    result = alg.run(self.config, data, "distorted_greedy",
                         None, None, scaling_factor, num_sampled_skills,
                         rare_sample_fraction, popular_sample_fraction, rare_threshold, popular_threshold,
                         user_sample_ratio, seed, k)
                    end = timer()
                    result['runtime'] = end - start
                    results.append(result)
                    self.logger.info("Algorithm: {} and k: {} and runtime: {}".format("distorted_greedy",k,end - start))

                self.logger.info("\n")

                # Stochastic Distorted Greedy
                for k in ks:
                    for sample_epsilon in sampling_epsilon_values_stochastic:
                        # Run algorithm
                        start = timer()
                        result = alg.run(config, data, "stochastic_distorted_greedy",
                             sample_epsilon, None, scaling_factor, num_sampled_skills,
                             rare_sample_fraction, popular_sample_fraction, rare_threshold, popular_threshold,
                             user_sample_ratio, seed, k) 
                        end = timer()
                        result['runtime'] = end - start
                        results.append(result)
                        self.logger.info("Algorithm: {} and epsilon: {} and k: {} and runtime: {}".format("stochastic_distorted_greedy",sample_epsilon,k,end - start))

                self.logger.info("\n")

                # Cost Scaled Greedy              
                for k in ks:
                    # Run algorithm
                    start = timer()
                    result = alg.run(self.config, data, "cost_scaled_greedy",
                         None, None, scaling_factor, num_sampled_skills,
                         rare_sample_fraction, popular_sample_fraction, rare_threshold, popular_threshold,
                         user_sample_ratio, seed, k)
                    end = timer()
                    result['runtime'] = end - start
                    results.append(result)
                    self.logger.info("Algorithm: {} and k: {} and runtime: {}".format("cost_scaled_greedy",k,end - start))

                self.logger.info("\n")

                # Cost scaled lazy exact greedy
                for k in ks:
                    # Run algorithm
                    start = timer()
                    result = alg.run(self.config, data, "cost_scaled_lazy_greedy",
                         None, None, scaling_factor, num_sampled_skills,
                         rare_sample_fraction, popular_sample_fraction, rare_threshold, popular_threshold,
                         user_sample_ratio, seed, k)
                    end = timer()
                    result['runtime'] = end - start
                    results.append(result)
                    self.logger.info("Algorithm: {} and k: {} and runtime: {}".format("cost_scaled_lazy_greedy",k,end - start))

                self.logger.info("\n")

                # Greedy              
                for k in ks:
                    # Run algorithm
                    start = timer()
                    result = alg.run(self.config, data, "greedy",
                         None, None, scaling_factor, num_sampled_skills,
                         rare_sample_fraction, popular_sample_fraction, rare_threshold, popular_threshold,
                         user_sample_ratio, seed, k)
                    end = timer()
                    result['runtime'] = end - start
                    results.append(result)
                    self.logger.info("Algorithm: {} and k: {} and runtime: {}".format("greedy",k,end - start))

                self.logger.info("\n")

                # Scaled Single Threshold Greedy
                for k in ks:
                    for error_epsilon in error_epsilon_values_scaled_threshold:
                        # Run algorithm
                        start = timer()
                        result = alg.run(self.config, data, "scaled_single_threshold_greedy",
                             None, error_epsilon, scaling_factor, num_sampled_skills,
                             rare_sample_fraction, popular_sample_fraction, rare_threshold, popular_threshold,
                             user_sample_ratio, seed, k)
                        end = timer()
                        result['runtime'] = end - start
                        results.append(result)
                        self.logger.info("Algorithm: {} and epsilon: {} and k: {} and runtime: {}".format("scaled_single_threshold_greedy",error_epsilon,k,end - start))

                self.logger.info("\n")

                # Baseline Top k               
                for k in ks:
                    # Run algorithm
                    start = timer()
                    result = alg.run(self.config, data, "baseline_topk",
                         None, None, scaling_factor, num_sampled_skills,
                         rare_sample_fraction, popular_sample_fraction, rare_threshold, popular_threshold,
                         user_sample_ratio, seed, k)
                    end = timer()
                    result['runtime'] = end - start
                    results.append(result)
                    self.logger.info("Algorithm: {} and k: {} and runtime: {}".format("baseline_topk",k,end - start))

                self.logger.info("\n")


        self.logger.info("Finished experiment 02")

        # Export results
        df = pd.DataFrame(results)
        self.data_exporter.export_csv_file(df, "experiment_02_freelancer_pop01_rare01_greedy.csv")
        self.logger.info("Exported experiment_02 results")
    def run(self):
        """
        Creates and runs training episodes

        :return:
        """
        data_provider = DataProvider(self.config)
        hex_attr_df = data_provider.read_hex_bin_attributes()
        hex_distance_df = data_provider.read_hex_bin_distances()
        city_states = data_provider.read_city_states(self.city_states_filename)
        neighborhood = data_provider.read_neighborhood_data()
        popular_bins = data_provider.read_popular_hex_bins()
        num_episodes = self.config['RL_parameters']['num_episodes']
        ind_episodes = self.config['RL_parameters']['ind_episodes']
        exp_decay_multiplier = self.config['RL_parameters']['exp_decay_multiplier']

        q_ind = None
        r_table = None
        xi_matrix = None

        best_episode = None
        best_model = {}
        current_best = float('-inf')  # best objective value seen so far

        progress_bar = tqdm(range(num_episodes))
        for episode_id in progress_bar:
            progress_bar.set_description("Episode: {}".format(episode_id))

            # Create episode
            ind_exploration_factor = np.e ** (-1 * episode_id * exp_decay_multiplier / ind_episodes)

            episode = Episode(self.config,
                              episode_id,
                              ind_exploration_factor,
                              hex_attr_df,
                              hex_distance_df,
                              city_states,
                              neighborhood,
                              popular_bins,
                              q_ind,
                              r_table,
                              xi_matrix)

            # Run episode
            tables = episode.run()
            q_ind = tables['q_ind']
            r_table = tables['r_table']
            xi_matrix = tables['xi_matrix']
            episode_tracker = tables['episode_tracker']

            # Uncomment for logging when running a single job; keep it
            # commented during experiments, otherwise the log output grows
            # unmanageably large

            # self.logger.info("""
            #                  Expt: {} Episode: {} Earnings: {}
            #                  Pax rides: {} Relocation rides: {} Unmet demand: {}
            #                  """.format(self.expt_name, episode_id,
            #                             episode_tracker.gross_earnings,
            #                             episode_tracker.successful_waits,
            #                             episode_tracker.relocation_rides,
            #                             episode_tracker.unmet_demand))
            # self.logger.info("----------------------------------")

            self.training_tracker.update_RL_tracker(
                episode_id, episode_tracker.gross_earnings,
                episode_tracker.successful_waits, episode_tracker.unsuccessful_waits,
                episode_tracker.unmet_demand, episode_tracker.relocation_rides,
                episode_tracker.DET, episode_tracker.DPRT, episode_tracker.DWT,
                episode_tracker.DRT, episode_tracker.DCT)

            # Keep track of the best episode
            if self.objective == 'revenue':
                if episode_tracker.gross_earnings >= current_best:
                    best_episode = episode_tracker
                    current_best = best_episode.gross_earnings
            else:  # self.objective == 'pickups':
                if episode_tracker.successful_waits >= current_best:
                    best_episode = episode_tracker
                    current_best = episode_tracker.successful_waits

            # Keep the most recent tables as the model (updated every episode)
            best_model['ind_exploration_factor'] = ind_exploration_factor
            best_model['config'] = self.config
            best_model['q_ind'] = q_ind
            best_model['r_table'] = r_table
            best_model['xi_matrix'] = xi_matrix
            best_model['training_tracker'] = self.training_tracker

        # After finishing training
        self.logger.info("Expt: {} Earnings: {} Met Demand: {} Unmet Demand: {}".format(self.expt_name,
                                                                         best_episode.gross_earnings,
                                                                         best_episode.successful_waits,
                                                                         best_episode.unmet_demand))
        return best_episode, best_model, self.training_tracker
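
# A minimal sketch of the exploration decay above: the factor starts at 1
# and reaches 1/e after ind_episodes episodes when the multiplier is 1.
import numpy as np

ind_episodes, exp_decay_multiplier = 100, 1.0
for episode_id in (0, 50, 100, 200):
    factor = np.e ** (-1 * episode_id * exp_decay_multiplier / ind_episodes)
    print(episode_id, round(factor, 3))  # 1.0, 0.607, 0.368, 0.135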
    def run(self):
        """
        Execute the job

        :return:
        """
        self.logger.info("Starting job: FreelancerDataProcessor\n")
        data_provider = DataProvider(self.config)
        data_exporter = DataExporter(self.config)

        # Read freelancer data
        df = data_provider.read_freelancer_user_data()
        df_cost = df[[1]]  # Salary/Hour
        df_skills = df[df.columns[4::2]].copy()  # skill columns
        df_skills.replace(to_replace=["Other Skills"], value="", inplace=True)
        df_skills = (df_skills.iloc[:, 0].map(str) + ',' +
                     df_skills.iloc[:, 1].map(str) + ',' +
                     df_skills.iloc[:, 2].map(str) + ',' +
                     df_skills.iloc[:, 3].map(str) + ',' +
                     df_skills.iloc[:, 4].map(str) + ',' +
                     df_skills.iloc[:, 5].map(str))  # Skills

        user_df = pd.DataFrame()
        user_df['cost'] = df_cost.iloc[:, 0].tolist()
        # Converting all strings to lower case
        user_df['skills'] = df_skills.str.lower().tolist()

        user_df = user_df.reset_index(drop=True)
        user_df = user_df.assign(user_id=user_df.index.values)
        user_df = user_df.assign(skills=user_df.apply(
            lambda x: x['skills'][:-1].split(','), axis=1))

        # Convert cost to integers
        user_df.cost = user_df.cost.astype(int)

        # Read skills data
        df = data_provider.read_freelancer_skill_data()
        df = df[[1]]
        df.columns = ['skill']
        skill_df = df.assign(skill_id=df.index.values)

        # Create multilabel binarizer
        mlb = MultiLabelBinarizer(classes=skill_df.skill.values)

        # One hot encoding of user skills
        skills = mlb.fit_transform(user_df['skills'])

        # Create dataset
        users = user_df.to_dict('records')
        for i in range(len(users)):
            users[i]['skills_array'] = skills[i]

        # Export csv files
        data_exporter.export_csv_file(user_df,
                                      "freelancer/freelancer_user_df.csv")
        data_exporter.export_csv_file(skill_df,
                                      "freelancer/freelancer_skill_df.csv")

        # Scaling factor for submodular function
        scaling_factor = 1

        # Create and export data object to be used in experiments
        # containing all methods related to freelancer data
        freelancer = FreelancerData(self.config, user_df, skill_df, users,
                                    scaling_factor)
        data_exporter.export_dill_file(freelancer,
                                       "freelancer/freelancer_data.dill")

        self.logger.info("Finished job: FreelancerDataProcessor")
class Experiment03(object):
    """
    Experiment03 class
    """
    def __init__(self, config):
        """
        Constructor

        :param config:
        :return:
        """
        self.config = config
        self.logger = logging.getLogger("so_logger")
        self.data_provider = DataProvider(self.config)
        self.data_exporter = DataExporter(self.config)

    @staticmethod
    def run_algorithm(args):
        # Run algorithm
        alg = AlgorithmDriver()
        data = alg.run(*args)
        return data

    def set_scaling_factor(self, data):

        # Find appropriate scaling factor
        alg = SetCoverGreedy(self.config, data.submodular_func, data.E)
        sol = alg.run()
        submodular_val = data.submodular_func(sol)
        cost = data.cost_func(sol)
        scaling_factor = data.scaling_func(submodular_val, cost)

        # Update scaling factor
        data.scaling_factor = scaling_factor

    def run(self):
        """
        Run experiment

        :return:
        """
        self.logger.info("Starting experiment 03")

        self.expt_config = self.config['experiment_configs']['experiment_03']
        popular_threshold = self.expt_config['popular_threshold']
        rare_threshold = self.expt_config['rare_threshold']

        user_sample_ratios = [0.05, 0.1]
        seeds = [i for i in range(6, 10)]

        sampling_epsilon_values_stochastic = [0.1, 0.05, 0.01, 0.005]
        error_epsilon_values_scaled_threshold = [0.2, 0.15, 0.1, 0.05]

        num_sampled_skills = 50
        rare_sample_fraction = 0.1
        popular_sample_fraction = 0.1
        scaling_factor = 800

        alg = AlgorithmDriver()
        results = []
        for seed in seeds:
            for user_sample_ratio in user_sample_ratios:

                # Load dataset
                data = self.data_provider.read_freelancer_data_obj()
                config = self.config.copy()
                alg.create_sample(config, data, num_sampled_skills,
                                  rare_sample_fraction,
                                  popular_sample_fraction, rare_threshold,
                                  popular_threshold, user_sample_ratio, seed)

                self.logger.info(
                    "Experiment for user sample ratio: {} and scaling factor: {} and seed: {} and number of elements: {}"
                    .format(user_sample_ratio, scaling_factor, seed,
                            len(data.E)))
                self.logger.info(
                    "Scaling factor for submodular function is: {}".format(
                        scaling_factor))

                # Total number of elements
                n = len(data.E)

                # Distorted Greedy
                total_runtime = 0
                for k in range(1, n + 1):
                    # Run algorithm
                    start = timer()
                    result = alg.run(self.config, data, "distorted_greedy",
                                     None, None, scaling_factor,
                                     num_sampled_skills, rare_sample_fraction,
                                     popular_sample_fraction, rare_threshold,
                                     popular_threshold, user_sample_ratio,
                                     seed, k)
                    end = timer()
                    self.logger.debug("Previous runtime: {} new runtime: {}".format(
                        total_runtime, end - start))
                    total_runtime += end - start
                    result['runtime'] = total_runtime
                    results.append(result)
                    self.logger.info(
                        "Algorithm: {} and k: {} and runtime: {}".format(
                            "distorted_greedy", k, total_runtime))

                self.logger.info("\n")

                # Stochastic Distorted Greedy
                total_runtime = 0
                for k in range(1, n + 1):
                    for sample_epsilon in sampling_epsilon_values_stochastic:
                        # Run algorithm
                        start = timer()
                        result = alg.run(
                            config, data, "stochastic_distorted_greedy",
                            sample_epsilon, None, scaling_factor,
                            num_sampled_skills, rare_sample_fraction,
                            popular_sample_fraction, rare_threshold,
                            popular_threshold, user_sample_ratio, seed, k)
                        end = timer()
                        total_runtime += end - start
                        result['runtime'] = total_runtime
                        results.append(result)
                        self.logger.info(
                            "Algorithm: {} and epsilon: {} and k: {} and runtime: {}"
                            .format("stochastic_distorted_greedy",
                                    sample_epsilon, k, total_runtime))

                self.logger.info("\n")

                # Cost Scaled Greedy
                # Run algorithm that creates greedy ordering
                start = timer()
                result = alg.run(self.config, data, "cost_scaled_greedy", None,
                                 None, scaling_factor, num_sampled_skills,
                                 rare_sample_fraction, popular_sample_fraction,
                                 rare_threshold, popular_threshold,
                                 user_sample_ratio, seed, n)
                end = timer()
                result['runtime'] = end - start
                # For each individual k we find the prefix of size k and find the corresponding solution
                for k in range(1, n + 1):
                    result_k = result.copy()
                    if k < len(result['sol']):
                        sol_k = set(list(result['sol'])[:k])
                        submodular_val_k = data.submodular_func(sol_k)
                        cost_k = data.cost_func(sol_k)
                        val_k = submodular_val_k - cost_k
                        result_k['sol'] = sol_k
                        result_k['val'] = val_k
                        result_k['submodular_val'] = submodular_val_k
                        result_k['cost'] = cost_k
                    else:
                        sol_k = result['sol']
                        val_k = result['val']
                    result_k['k'] = k
                    results.append(result_k)
                    self.logger.info(
                        "Best solution: {}\nBest value: {}".format(
                            sol_k, val_k))
                    self.logger.info(
                        "Algorithm: {} and k: {} and runtime: {}".format(
                            "cost_scaled_greedy", k, end - start))

                self.logger.info("\n")

                # Cost scaled lazy exact greedy
                # Run algorithm that creates greedy ordering
                start = timer()
                result = alg.run(self.config, data, "cost_scaled_lazy_greedy",
                                 None, None, scaling_factor,
                                 num_sampled_skills, rare_sample_fraction,
                                 popular_sample_fraction, rare_threshold,
                                 popular_threshold, user_sample_ratio, seed, n)
                end = timer()
                result['runtime'] = end - start
                # For each individual k we find the prefix of size k and find the corresponding solution
                for k in range(1, n + 1):
                    result_k = result.copy()
                    if k < len(result['sol']):
                        sol_k = set(list(result['sol'])[:k])
                        submodular_val_k = data.submodular_func(sol_k)
                        cost_k = data.cost_func(sol_k)
                        val_k = submodular_val_k - cost_k
                        result_k['sol'] = sol_k
                        result_k['val'] = val_k
                        result_k['submodular_val'] = submodular_val_k
                        result_k['cost'] = cost_k
                    else:
                        sol_k = result['sol']
                        val_k = result['val']
                    result_k['k'] = k
                    results.append(result_k)
                    self.logger.info(
                        "Best solution: {}\nBest value: {}".format(
                            sol_k, val_k))
                    self.logger.info(
                        "Algorithm: {} and k: {} and runtime: {}".format(
                            "cost_scaled_lazy_greedy", k, end - start))

                self.logger.info("\n")

                # Scaled Single Threshold Greedy
                total_runtime = 0
                for k in range(1, n + 1):
                    for error_epsilon in error_epsilon_values_scaled_threshold:
                        # Run algorithm
                        start = timer()
                        result = alg.run(self.config, data,
                                         "scaled_single_threshold_greedy",
                                         None, error_epsilon, scaling_factor,
                                         num_sampled_skills,
                                         rare_sample_fraction,
                                         popular_sample_fraction,
                                         rare_threshold, popular_threshold,
                                         user_sample_ratio, seed, k)
                        end = timer()
                        total_runtime += end - start
                        result['runtime'] = total_runtime
                        results.append(result)
                        self.logger.info(
                            "Algorithm: {} and epsilon: {} and k: {} and runtime: {}"
                            .format("scaled_single_threshold_greedy",
                                    error_epsilon, k, total_runtime))

                self.logger.info("\n")

                # Baseline Top k
                total_runtime = 0
                for k in range(1, n + 1):
                    # Run algorithm
                    start = timer()
                    result = alg.run(self.config, data, "baseline_topk", None,
                                     None, scaling_factor, num_sampled_skills,
                                     rare_sample_fraction,
                                     popular_sample_fraction, rare_threshold,
                                     popular_threshold, user_sample_ratio,
                                     seed, k)
                    end = timer()
                    total_runtime += end - start
                    result['runtime'] = total_runtime
                    results.append(result)
                    self.logger.info(
                        "Algorithm: {} and k: {} and runtime: {}".format(
                            "baseline_topk", k, total_runtime))

                self.logger.info("\n")

        self.logger.info("Finished experiment 03")

        # Export results
        df = pd.DataFrame(results)
        # self.data_exporter.export_csv_file(df, "experiment_03_freelancer_pop01_rare01_cost_scaled.csv")
        self.logger.info("Exported experiment_03 results")
class Experiment00(object):
    """
    Experiment00 class
    """
    def __init__(self, config):
        """
        Constructor

        :param config:
        :return:
        """
        self.config = config
        self.logger = logging.getLogger("so_logger")
        self.data_provider = DataProvider(self.config)
        self.data_exporter = DataExporter(self.config)

    @staticmethod
    def run_algorithm(args):
        # Run algorithm
        alg = AlgorithmDriver()
        data = alg.run(*args)
        return data

    def set_scaling_factor(self, data):

        # Find appropriate scaling factor
        alg = SetCoverGreedy(self.config, data.submodular_func, data.E)
        sol = alg.run()
        submodular_val = data.submodular_func(sol)
        cost = data.cost_func(sol)
        scaling_factor = data.scaling_func(submodular_val, cost)

        # Update scaling factor
        data.scaling_factor = scaling_factor

    def run(self):
        """
        Run experiment

        :return:
        """
        self.logger.info("Starting experiment 00")

        self.expt_config = self.config['experiment_configs']['experiment_00']
        popular_threshold = self.expt_config['popular_threshold']
        rare_threshold = self.expt_config['rare_threshold']

        user_sample_ratios = [
            0.001, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1
        ]
        seeds = [i for i in range(6, 11)]

        sampling_epsilon_values = [0.1, 0.05, 0.01, 0.005]

        num_sampled_skills = 50
        rare_sample_fraction = 0.1
        popular_sample_fraction = 0.1
        scaling_factor = 800

        alg = AlgorithmDriver()
        results = []
        for seed in seeds:
            for user_sample_ratio in user_sample_ratios:
                self.logger.info(
                    "Experiment for user sample ratio: {} and scaling factor: {} and seed: {}"
                    .format(user_sample_ratio, scaling_factor, seed))

                # Load dataset
                data = self.data_provider.read_freelancer_data_obj()
                config = self.config.copy()
                alg.create_sample(config, data, num_sampled_skills,
                                  rare_sample_fraction,
                                  popular_sample_fraction, rare_threshold,
                                  popular_threshold, user_sample_ratio, seed)

                # # Create controlled samples dataset
                # data.sample_skills_to_be_covered_controlled(num_sampled_skills, rare_sample_fraction,
                #                                         popular_sample_fraction, rare_threshold,
                #                                         popular_threshold, user_sample_ratio)

                # # Setting scaling factor of coverage as coverage(S)/cost(S) for set cover solution S
                # self.set_scaling_factor(data)

                self.logger.info(
                    "Scaling factor for submodular function is: {}".format(
                        scaling_factor))

                # Distorted greedy - ICML
                start = timer()
                result = alg.run(config, data, "distorted_greedy", None, None,
                                 scaling_factor, num_sampled_skills,
                                 rare_sample_fraction, popular_sample_fraction,
                                 rare_threshold, popular_threshold,
                                 user_sample_ratio, seed, None)
                end = timer()
                result['runtime'] = end - start
                self.logger.info(
                    "Algorithm: {} and k: {} and runtime: {}".format(
                        "distorted_greedy", None, end - start))
                results.append(result)

                self.logger.info("\n")

                # Cost scaled greedy
                start = timer()
                result = alg.run(config, data, "cost_scaled_greedy", None,
                                 None, scaling_factor, num_sampled_skills,
                                 rare_sample_fraction, popular_sample_fraction,
                                 rare_threshold, popular_threshold,
                                 user_sample_ratio, seed, None)
                end = timer()
                result['runtime'] = end - start
                self.logger.info(
                    "Algorithm: {} and k: {} and runtime: {}".format(
                        "cost_scaled_greedy", None, end - start))
                results.append(result)

                self.logger.info("\n")

                # Cost scaled lazy exact greedy
                start = timer()
                result = alg.run(config, data, "cost_scaled_lazy_greedy", None,
                                 None, scaling_factor, num_sampled_skills,
                                 rare_sample_fraction, popular_sample_fraction,
                                 rare_threshold, popular_threshold,
                                 user_sample_ratio, seed, None)
                end = timer()
                result['runtime'] = end - start
                self.logger.info(
                    "Algorithm: {} and k: {} and runtime: {}".format(
                        "cost_scaled_lazy_greedy", None, end - start))
                results.append(result)

                self.logger.info("\n")

                # Greedy
                start = timer()
                result = alg.run(config, data, "greedy", None, None,
                                 scaling_factor, num_sampled_skills,
                                 rare_sample_fraction, popular_sample_fraction,
                                 rare_threshold, popular_threshold,
                                 user_sample_ratio, seed, None)
                end = timer()
                result['runtime'] = end - start
                self.logger.info(
                    "Algorithm: {} and k: {} and runtime: {}".format(
                        "greedy", None, end - start))
                results.append(result)

                self.logger.info("\n")

                # Unconstrained Linear
                start = timer()
                result = alg.run(config, data, "unconstrained_linear", None,
                                 None, scaling_factor, num_sampled_skills,
                                 rare_sample_fraction, popular_sample_fraction,
                                 rare_threshold, popular_threshold,
                                 user_sample_ratio, seed, None)
                end = timer()
                result['runtime'] = end - start
                self.logger.info(
                    "Algorithm: {} and k: {} and runtime: {}".format(
                        "unconstrained_linear", None, end - start))
                results.append(result)

                self.logger.info("\n")

                # Unconstrained distorted greedy
                start = timer()
                result = alg.run(config, data,
                                 "unconstrained_distorted_greedy", None, None,
                                 scaling_factor, num_sampled_skills,
                                 rare_sample_fraction, popular_sample_fraction,
                                 rare_threshold, popular_threshold,
                                 user_sample_ratio, seed, None)
                end = timer()
                result['runtime'] = end - start
                self.logger.info(
                    "Algorithm: {} and k: {} and runtime: {}".format(
                        "unconstrained_distorted_greedy", None, end - start))
                results.append(result)

                self.logger.info("\n")

                # Stochastic distorted greedy
                for sample_epsilon in sampling_epsilon_values:
                    start = timer()
                    config['algorithms']['stochastic_distorted_greedy_config'][
                        'epsilon'] = sample_epsilon
                    result = alg.run(config, data,
                                     "stochastic_distorted_greedy",
                                     sample_epsilon, None, scaling_factor,
                                     num_sampled_skills, rare_sample_fraction,
                                     popular_sample_fraction, rare_threshold,
                                     popular_threshold, user_sample_ratio,
                                     seed, None)
                    end = timer()
                    result['runtime'] = end - start
                    self.logger.info(
                        "Algorithm: {} and epsilon: {} and k: {} and runtime: {}"
                        .format("stochastic_distorted_greedy", sample_epsilon,
                                None, end - start))
                    results.append(result)

                self.logger.info("\n")

                # Baseline top k
                start = timer()
                result = alg.run(config, data, "baseline_topk", None, None,
                                 scaling_factor, num_sampled_skills,
                                 rare_sample_fraction, popular_sample_fraction,
                                 rare_threshold, popular_threshold,
                                 user_sample_ratio, seed, None)
                end = timer()
                result['runtime'] = end - start
                self.logger.info(
                    "Algorithm: {} and k: {} and runtime: {}".format(
                        "baseline_topk", None, end - start))
                results.append(result)

        self.logger.info("Finished experiment 00")

        # Export results
        df = pd.DataFrame(results)
        self.data_exporter.export_csv_file(
            df, "experiment_00_freelancer_pop01_rare01_greedy.csv")
        self.logger.info("Exported experiment_00 results")

# Example #16
class Experiment04(object):
    """
    Experiment04 class
    """
    def __init__(self, config):
        """
        Constructor

        :param config:
        :return:
        """
        self.config = config
        self.logger = logging.getLogger("so_logger")
        self.data_provider = DataProvider(self.config)
        self.data_exporter = DataExporter(self.config)

    @staticmethod
    def run_algorithm(args):
        # Run algorithm
        alg = AlgorithmDriver()
        data = alg.run(*args)
        return data

    def set_scaling_factor(self, data):

        # Find appropriate scaling factor
        alg = SetCoverGreedy(self.config, data.submodular_func, data.E)
        sol = alg.run()
        submodular_val = data.submodular_func(sol)
        cost = data.cost_func(sol)
        scaling_factor = data.scaling_func(submodular_val, cost)

        # Update scaling factor
        data.scaling_factor = scaling_factor

    def run(self):
        """
        Run experiment

        :return:
        """
        self.logger.info("Starting experiment 04")

        self.expt_config = self.config['experiment_configs']['experiment_04']
        popular_threshold = self.expt_config['popular_threshold']
        rare_threshold = self.expt_config['rare_threshold']
        partition_type = self.expt_config['partition_type']

        # Sweep ranges below override the single values from the config
        user_sample_ratios = [1]
        seeds = [i for i in range(6, 10)]
        cardinality_constraints = [i for i in range(1, 11)]
        num_of_partitions = [i for i in range(1, 6)]

        num_sampled_skills = 50
        rare_sample_fraction = 0.1
        popular_sample_fraction = 0.8
        scaling_factor = 800

        alg = AlgorithmDriver()
        results = []
        for seed in seeds:
            for user_sample_ratio in user_sample_ratios:
                for cardinality_constraint in cardinality_constraints:
                    for num_of_partition in num_of_partitions:
                        self.logger.info(
                            "Experiment for user sample ratio: {} and scaling factor: {} and seed: {} and cardinality constraint:{} and num of partitions:{} "
                            .format(user_sample_ratio, scaling_factor, seed,
                                    cardinality_constraint, num_of_partition))

                        # Load dataset
                        data = self.data_provider.read_guru_data_obj()
                        config = self.config.copy()
                        # Creating the ground set of users
                        alg.create_sample(config, data, num_sampled_skills,
                                          rare_sample_fraction,
                                          popular_sample_fraction,
                                          rare_threshold, popular_threshold,
                                          user_sample_ratio, seed)
                        # Assigning users to partitions uniformly at random
                        alg.create_partitions(data, num_of_partition,
                                              partition_type,
                                              cardinality_constraint)

                        self.logger.info(
                            "Scaling factor for submodular function is: {}".
                            format(scaling_factor))

                        # Partition matroid greedy
                        start = timer()
                        result = alg.run(
                            config, data, "partition_matroid_greedy", None,
                            None, scaling_factor, num_sampled_skills,
                            rare_sample_fraction, popular_sample_fraction,
                            rare_threshold, popular_threshold,
                            user_sample_ratio, seed, None)
                        end = timer()
                        result['runtime'] = end - start
                        result[
                            'cardinality_constraint'] = cardinality_constraint
                        result['num_of_partitions'] = num_of_partition
                        self.logger.info(
                            "Algorithm: {} and k: {} and runtime: {}".format(
                                "partition_matroid_greedy", None, end - start))
                        results.append(result)

                        self.logger.info("\n")

                        # Cost scaled partition matroid greedy
                        start = timer()
                        result = alg.run(
                            config, data,
                            "cost_scaled_partition_matroid_greedy", None, None,
                            scaling_factor, num_sampled_skills,
                            rare_sample_fraction, popular_sample_fraction,
                            rare_threshold, popular_threshold,
                            user_sample_ratio, seed, None)
                        end = timer()
                        result['runtime'] = end - start
                        result[
                            'cardinality_constraint'] = cardinality_constraint
                        result['num_of_partitions'] = num_of_partition
                        self.logger.info(
                            "Algorithm: {} and k: {} and runtime: {}".format(
                                "cost_scaled_partition_matroid_greedy", None,
                                end - start))
                        results.append(result)

                        self.logger.info("\n")

                        # Cost scaled partition matroid lazy exact greedy
                        start = timer()
                        result = alg.run(
                            config, data,
                            "cost_scaled_partition_matroid_lazy_greedy", None,
                            None, scaling_factor, num_sampled_skills,
                            rare_sample_fraction, popular_sample_fraction,
                            rare_threshold, popular_threshold,
                            user_sample_ratio, seed, None)
                        end = timer()
                        result['runtime'] = end - start
                        result[
                            'cardinality_constraint'] = cardinality_constraint
                        result['num_of_partitions'] = num_of_partition
                        self.logger.info(
                            "Algorithm: {} and k: {} and runtime: {}".format(
                                "cost_scaled_partition_matroid_lazy_greedy",
                                None, end - start))
                        results.append(result)

                        self.logger.info("\n")

                        # Baseline Top k
                        start = timer()
                        result = alg.run(config, data, "baseline_topk_matroid",
                                         None, None, scaling_factor,
                                         num_sampled_skills,
                                         rare_sample_fraction,
                                         popular_sample_fraction,
                                         rare_threshold, popular_threshold,
                                         user_sample_ratio, seed, None)
                        end = timer()
                        result['runtime'] = end - start
                        result[
                            'cardinality_constraint'] = cardinality_constraint
                        result['num_of_partitions'] = num_of_partition
                        self.logger.info(
                            "Algorithm: {} and k: {} and runtime: {}".format(
                                "baseline_topk_matroid", None, end - start))
                        results.append(result)

                        self.logger.info("\n")

        self.logger.info("Finished experiment 04")

        # Export results
        df = pd.DataFrame(results)
        self.data_exporter.export_csv_file(
            df, "experiment_04_guru_salary_pop08_rare01.csv")
        self.logger.info("Exported experiment 04 results")
    def __init__(self, config):
        self.config = config
        self.logger = logging.getLogger("baseline_logger")
        self.data_provider = DataProvider(self.config)
        self.data_exporter = DataExporter(self.config)