class Experiment03(object):
    """
    Experiment03 class
    """
    def __init__(self, config_):
        """
        Constructor
        :param config_:
        :return:
        """
        self.config = config_
        self.data_exporter = DataExporter(self.config)
        self.logger = logging.getLogger("cuda_logger")
        self.expt_name = "expt_03"
        self.config['RL_parameters']['experiment'] = self.expt_name

    @staticmethod
    def run_rl_training(config):
        rl_trainer = RunRLTrainingJob(config)
        data = rl_trainer.run()
        return data

    def run(self):
        """
        Run experiment
        """
        num_drivers = np.arange(1000, 6500, 500)
        objectives = ['pickups', 'revenue']
        combinations = list(itertools.product(num_drivers, objectives))
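        # 11 driver counts x 2 objectives = 22 training configurations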

        # Create a pool of processes
        num_processes = mp.cpu_count()
        pool = ProcessPool(nodes=num_processes)

        configs = []
        count = 0
        for comb in combinations:
            self.config['RL_parameters'][
                'experiment'] = self.expt_name + "_" + str(count)
            self.config['RL_parameters'][
                'city_states_filename'] = "city_states.dill"
            self.config['RL_parameters']['num_drivers'] = comb[0]
            self.config['RL_parameters']['num_strategic_drivers'] = comb[0]
            self.config['RL_parameters']['objective'] = comb[1]
            configs.append(deepcopy(self.config))
            count += 1

        self.logger.info("Starting expt_03")

        results = pool.amap(self.run_rl_training, configs).get()
        pool.close()
        pool.join()
        pool.clear()

        self.logger.info("Finished expt_03")

        # Export best episode
        self.data_exporter.export_episode(results, self.expt_name + ".dill")
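
# Minimal usage sketch (hedged; `load_config` is a hypothetical helper that
# returns the parsed project config dict with an 'RL_parameters' section):
#
#   config = load_config("config.yaml")
#   Experiment03(config).run()
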
class Experiment05(object):
    """
    Experiment05 class
    """

    def __init__(self, config_):
        """
        Constructor
        :param config_:
        :return:
        """
        self.config = config_
        self.data_exporter = DataExporter(self.config)
        self.logger = logging.getLogger("cuda_logger")
        self.expt_name = "expt_05"
        self.config['RL_parameters']['experiment'] = self.expt_name

    @staticmethod
    def run_rl_training(config):
        rl_trainer = RunRLTrainingJob(config)
        data = rl_trainer.run()
        return data

    def run(self):
        """
        Run experiment
        """
        num_drivers = self.config['RL_parameters']['num_drivers']
        percent_strategic_drivers = np.arange(0, 1.1, 0.1)
        num_strategic_drivers = [int(x * num_drivers) for x in percent_strategic_drivers]
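        # For example, with num_drivers = 5000 this sweeps
        # num_strategic_drivers over [0, 500, 1000, ..., 5000]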

        # Create a pool of processes
        num_processes = mp.cpu_count()
        pool = ProcessPool(nodes=num_processes)

        configs = []
        count = 0
        for drivers in num_strategic_drivers:
            self.config['RL_parameters']['experiment'] = self.expt_name + "_" + str(count)
            self.config['RL_parameters']['num_strategic_drivers'] = drivers
            configs.append(deepcopy(self.config))
            count += 1

        self.logger.info("Starting expt_05")

        results = pool.amap(self.run_rl_training, configs).get()
        pool.close()
        pool.join()
        pool.clear()

        self.logger.info("Finished expt_05")

        # Export best episode
        self.data_exporter.export_episode(results, self.expt_name + ".dill")
    def run(self):
        """
        This method executes the job
        :param:
        :return:
        """
        self.logger.info("Starting job: CreateCityStateJob\n")

        city_state_creator = CityStateCreator(self.config)
        city_state = city_state_creator.get_city_states()
        self.logger.info("Exporting city states\n")
        filename = self.config['city_state_creator'].get('filename', 'city_states.dill')
        data_exporter = DataExporter(self.config)
        data_exporter.export_city_state(city_state, filename)

        self.logger.info("Finished job: CreateCityStateJob")
    def run(self):
        """
        This method executes the job
        :param:
        :return:
        """
        self.logger.info("Starting job: SparseMatrixFillerJob\n")

        sparse_matrix_filler = SparseMatrixFiller(self.config)
        city_state = sparse_matrix_filler.fill_matrices()
        self.logger.info("Exporting city states\n")
        filename = self.config['city_state_creator'].get(
            'filename', 'city_states.dill')
        data_exporter = DataExporter(self.config)
        data_exporter.export_city_state(city_state, filename)

        self.logger.info("Finished job: SparseMatrixFillerJob")
    def run(self):
        """
        Execute the job
        :param:
        :return:
        """
        self.logger.info("Starting job: GuruDataProcessor\n")
        data_provider = DataProvider(self.config)
        data_exporter = DataExporter(self.config)

        # Read guru data
        df = data_provider.read_guru_user_data()
        df = df[[3, 4]]  # Salary and skills columns
        df.columns = ['cost', 'skills']
        df = df[(df.cost != "$0") & (df.skills != "UNKNOWN")]
        df = df.reset_index(drop=True)
        df = df.assign(user_id=df.index.values)
        df = df.assign(skills=df.apply(lambda x: x['skills'][:-1].split(','), axis=1))

        # Convert cost to integers
        user_df = df.assign(cost=df.apply(lambda x: int(x['cost'][1:]), axis=1))

        # Read skills data
        df = data_provider.read_guru_skill_data()
        df = df[[1]]
        df.columns = ['skill']
        skill_df = df.assign(skill_id=df.index.values)

        # Create multilabel binarizer
        mlb = MultiLabelBinarizer(classes=skill_df.skill.values)

        # One hot encoding of user skills
        skills = mlb.fit_transform(user_df['skills'])
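        # Illustration of what fit_transform produces (sklearn API; the skill
        # names here are hypothetical, not the real Guru taxonomy):
        #   MultiLabelBinarizer(classes=['python', 'sql', 'java']).fit_transform(
        #       [['python', 'sql'], ['java']])
        #   -> array([[1, 1, 0],
        #             [0, 0, 1]])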

        # Create dataset
        users = user_df.to_dict('records')
        for i in range(len(users)):
            users[i]['skills_array'] = skills[i]

        # Export csv files
        data_exporter.export_csv_file(user_df, "guru/guru_user_df.csv")
        data_exporter.export_csv_file(skill_df, "guru/guru_skill_df.csv")

        # Scaling factor for submodular function
        scaling_factor = 1

        # Create and export data object to be used in experiments
        # containing all methods related to guru data
        guru = GuruData(self.config, user_df, skill_df, users, scaling_factor)
        data_exporter.export_dill_file(guru, "guru/guru_data.dill")

        self.logger.info("Finished job: GuruDataProcessor")
    def run(self):
        """
        This method executes the job
        :param:
        :return:
        """
        self.logger.info("Starting job: BuildRegressionModelsJob\n")

        # Export the regression models data
        for model in self.config['regression_models']:
            year = model['year'][-2:]
            month = model['month'].lower()
            weekday = list(calendar.day_name).index(model['weekday'])
            self.logger.info(
                "Creating regression model data {}-{}-{}s".format(
                    year, month, model['weekday']))

            rmb = RegressionModelBuilder(self.config, year, month, weekday)
            dist_df = rmb.get_bins_distance_dataframe()
            trips_df = rmb.create_trips_data_with_distance()

            self.logger.info(
                "Exporting regression model data {}-{}-{}s\n".format(
                    year, month, model['weekday']))
            data_exporter = DataExporter(self.config)
            data_exporter.export_bin_distances(dist_df)
            data_exporter.export_rmb_data(
                trips_df, model['weekday'] + '_' + month + '_' + year)

        self.logger.info("Finished job: BuildRegressionModelsJob")
class RunRLTrainingJob(object):
    """
    This class implements a job to run the RL training
    """
    def __init__(self, config_):
        """
        Constructor
        :param config_:
        :return:
        """
        self.config = config_
        self.logger = logging.getLogger("cuda_logger")
        self.data_exporter = DataExporter(self.config)

    def run(self):
        """
        This method executes the job
        :param:
        :return:
        """
        self.logger.info("Starting job: RunRLTrainingJob\n")
        self.logger.info("RL training parameters:")

        pp = pprint.PrettyPrinter(indent=4)
        self.logger.info(pp.pformat(self.config['RL_parameters']))

        # Create RL trainer
        rl_trainer = RLTrainer(self.config)

        # Run RL episodes and unpack the results
        best_episode, best_model, training_tracker = rl_trainer.run()

        # Export results for which filenames are configured
        best_model_filename = self.config['RL_parameters'].get(
            'best_model_filename', False)
        best_episode_filename = self.config['RL_parameters'].get(
            'best_episode_filename', False)
        training_tracker_filename = self.config['RL_parameters'].get(
            'training_tracker_filename', False)

        if best_model_filename:
            self.data_exporter.export_model(best_model, best_model_filename)

        if best_episode_filename:
            self.data_exporter.export_episode(best_episode,
                                              best_episode_filename)

        if training_tracker_filename:
            self.data_exporter.export_training_tracker(
                training_tracker, training_tracker_filename)

        self.logger.info("Finished job: RunRLTrainingJob")

        return best_episode
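
# Standalone usage sketch (hedged; the export filenames are illustrative --
# each is optional and only triggers the corresponding export when set):
#
#   config['RL_parameters']['best_model_filename'] = 'best_model.dill'
#   config['RL_parameters']['best_episode_filename'] = 'best_episode.dill'
#   best_episode = RunRLTrainingJob(config).run()
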
    def run(self):
        """
        This method executes the job
        :param:
        :return:
        """
        self.logger.info("Starting job: NeighborhoodDataExportJob\n")
        data_provider = DataProvider(self.config)
        data_exporter = DataExporter(self.config)
        hex_attr_df = data_provider.read_hex_bin_attributes()
        hex_bins = hex_attr_df['hex_id'].values

        data = {}
        for r in range(self.radius + 1):
            data[r] = {}
            for hex_bin in hex_bins:
                neighbors = hex_neighborhood(hex_bin, hex_attr_df, r)
                one_hot_encoding_vector = np.zeros(len(hex_bins))
                np.put(one_hot_encoding_vector, neighbors, 1)
                data[r][hex_bin] = one_hot_encoding_vector

        data_exporter.export_neighborhood_data(data)
        self.logger.info("Finished job: NeighborhoodDataExportJob")
class Experiment01(object):
    """
    Experiment01 class
    """

    def __init__(self, config_):
        """
        Constructor
        :param config_:
        :return:
        """
        self.config = config_
        self.data_exporter = DataExporter(self.config)
        self.logger = logging.getLogger("cuda_logger")
        self.expt_name = "expt_01"
        self.config['RL_parameters']['experiment'] = self.expt_name

    @staticmethod
    def run_rl_training(config):
        rl_trainer = RunRLTrainingJob(config)
        data = rl_trainer.run()
        return data

    def run(self):
        """
        Run experiment
        """
        ind_percent = np.arange(0., 1.1, 0.1)
        reb_percent = np.arange(0., 1.1, 0.1)
        ind_percent[0] = 0.01
        reb_percent[0] = 0.01
        combinations = list(itertools.product(ind_percent, reb_percent))
        num_episodes = self.config['RL_parameters']['num_episodes']
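        # 11 x 11 grid of (ind, reb) fraction pairs; pairs whose episode
        # counts sum to num_episodes or more are skipped in the loop below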

        # Create a pool of processes
        num_processes = mp.cpu_count()
        pool = ProcessPool(nodes=num_processes)

        configs = []
        count = 0
        for comb in combinations:
            self.config['RL_parameters']['experiment'] = self.expt_name + "_" + str(count)
            ind_episodes = int(comb[0] * num_episodes)
            reb_episodes = int(comb[1] * num_episodes)
            if (ind_episodes + reb_episodes) < num_episodes:
                self.config['RL_parameters']['ind_episodes'] = ind_episodes
                self.config['RL_parameters']['reb_episodes'] = reb_episodes
                configs.append(deepcopy(self.config))
                count += 1

        self.logger.info("Starting expt_01")

        results = pool.amap(self.run_rl_training, configs).get()
        pool.close()
        pool.join()
        pool.clear()

        self.logger.info("Finished expt_01")

        # Export best episode
        self.data_exporter.export_episode(results, self.expt_name + ".dill")
class Experiment02(object):
    """
    Experiment02 class
    """

    def __init__(self, config):
        """
        Constructor

        :param config:
        :return:
        """
        self.config = config
        self.logger = logging.getLogger("so_logger")
        self.data_provider = DataProvider(self.config)
        self.data_exporter = DataExporter(self.config)

    @staticmethod
    def run_algorithm(args):
        # Run algorithm
        alg = AlgorithmDriver()
        data = alg.run(*args)
        return data

    def set_scaling_factor(self, data):
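        """
        Set data.scaling_factor to scaling_func(coverage(S), cost(S)),
        where S is the greedy set-cover solution found by SetCoverGreedy.
        """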

        # Find appropriate scaling factor
        alg = SetCoverGreedy(self.config, data.submodular_func, data.E)
        sol = alg.run()
        submodular_val = data.submodular_func(sol)
        cost = data.cost_func(sol)
        scaling_factor = data.scaling_func(submodular_val, cost)

        # Update scaling factor
        data.scaling_factor = scaling_factor

    def run(self):
        """
        Run experiment
        :param:
        :return:
        """
        self.logger.info("Starting experiment 02")

        self.expt_config = self.config['experiment_configs']['experiment_02']
        popular_threshold = self.expt_config['popular_threshold']
        rare_threshold = self.expt_config['rare_threshold']

        user_sample_ratios = [0.4, 1]
        seeds = [i for i in range(6, 10)]
        ks = [1, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50]

        sampling_epsilon_values_stochastic = [0.1, 0.05, 0.01, 0.005]
        error_epsilon_values_scaled_threshold = [0.2, 0.15, 0.1, 0.05]

        num_sampled_skills = 50
        rare_sample_fraction = 0.1
        popular_sample_fraction = 0.1
        scaling_factor = 800

        alg = AlgorithmDriver()
        results = []
        for seed in seeds:
            for user_sample_ratio in user_sample_ratios:
                self.logger.info("Experiment for user sample ratio: {} and scaling factor: {} and seed: {}".format(user_sample_ratio,scaling_factor,seed))

                # Load dataset
                data = self.data_provider.read_freelancer_data_obj()
                config = self.config.copy()
                alg.create_sample(config, data, num_sampled_skills,
                                  rare_sample_fraction, popular_sample_fraction,
                                  rare_threshold, popular_threshold,
                                  user_sample_ratio, seed)

                self.logger.info("Scaling factor for submodular function is: {}".format(scaling_factor))

                # Distorted Greedy
                for k in ks:
                    # Run algorithm
                    start = timer()
                    result = alg.run(self.config, data, "distorted_greedy",
                         None, None, scaling_factor, num_sampled_skills,
                         rare_sample_fraction, popular_sample_fraction, rare_threshold, popular_threshold,
                         user_sample_ratio, seed, k)
                    end = timer()
                    result['runtime'] = end - start
                    results.append(result)
                    self.logger.info("Algorithm: {} and k: {} and runtime: {}".format("distorted_greedy",k,end - start))

                self.logger.info("\n")

                # Stochastic Distorted Greedy
                for k in ks:
                    for sample_epsilon in sampling_epsilon_values_stochastic:
                        # Run algorithm
                        start = timer()
                        result = alg.run(config, data, "stochastic_distorted_greedy",
                             sample_epsilon, None, scaling_factor, num_sampled_skills,
                             rare_sample_fraction, popular_sample_fraction, rare_threshold, popular_threshold,
                             user_sample_ratio, seed, k) 
                        end = timer()
                        result['runtime'] = end - start
                        results.append(result)
                        self.logger.info("Algorithm: {} and epsilon: {} and k: {} and runtime: {}".format("stochastic_distorted_greedy",sample_epsilon,k,end - start))

                self.logger.info("\n")

                # Cost Scaled Greedy
                for k in ks:
                    # Run algorithm
                    start = timer()
                    result = alg.run(self.config, data, "cost_scaled_greedy",
                                     None, None, scaling_factor, num_sampled_skills,
                                     rare_sample_fraction, popular_sample_fraction,
                                     rare_threshold, popular_threshold,
                                     user_sample_ratio, seed, k)
                    end = timer()
                    result['runtime'] = end - start
                    results.append(result)
                    self.logger.info("Algorithm: {} and k: {} and runtime: {}".format(
                        "cost_scaled_greedy", k, end - start))

                self.logger.info("\n")

                # Cost scaled lazy exact greedy
                for k in ks:
                    # Run algorithm
                    start = timer()
                    result = alg.run(self.config, data, "cost_scaled_lazy_greedy",
                         None, None, scaling_factor, num_sampled_skills,
                         rare_sample_fraction, popular_sample_fraction, rare_threshold, popular_threshold,
                         user_sample_ratio, seed, k)
                    end = timer()
                    result['runtime'] = end - start
                    results.append(result)
                    self.logger.info("Algorithm: {} and k: {} and runtime: {}".format("cost_scaled_lazy_greedy",k,end - start))

                self.logger.info("\n")

                # Greedy
                for k in ks:
                    # Run algorithm
                    start = timer()
                    result = alg.run(self.config, data, "greedy",
                                     None, None, scaling_factor, num_sampled_skills,
                                     rare_sample_fraction, popular_sample_fraction,
                                     rare_threshold, popular_threshold,
                                     user_sample_ratio, seed, k)
                    end = timer()
                    result['runtime'] = end - start
                    results.append(result)
                    self.logger.info("Algorithm: {} and k: {} and runtime: {}".format(
                        "greedy", k, end - start))

                self.logger.info("\n")

                # Scaled Single Threshold Greedy
                for k in ks:
                    for error_epsilon in error_epsilon_values_scaled_threshold:
                        # Run algorithm
                        start = timer()
                        result = alg.run(self.config, data, "scaled_single_threshold_greedy",
                             None, error_epsilon, scaling_factor, num_sampled_skills,
                             rare_sample_fraction, popular_sample_fraction, rare_threshold, popular_threshold,
                             user_sample_ratio, seed, k)
                        end = timer()
                        result['runtime'] = end - start
                        results.append(result)
                        self.logger.info("Algorithm: {} and epsilon: {} and k: {} and runtime: {}".format("scaled_single_threshold_greedy",error_epsilon,k,end - start))

                self.logger.info("\n")

                # Baseline Top k
                for k in ks:
                    # Run algorithm
                    start = timer()
                    result = alg.run(self.config, data, "baseline_topk",
                                     None, None, scaling_factor, num_sampled_skills,
                                     rare_sample_fraction, popular_sample_fraction,
                                     rare_threshold, popular_threshold,
                                     user_sample_ratio, seed, k)
                    end = timer()
                    result['runtime'] = end - start
                    results.append(result)
                    self.logger.info("Algorithm: {} and k: {} and runtime: {}".format(
                        "baseline_topk", k, end - start))

                self.logger.info("\n")


        self.logger.info("Finished experiment 02")

        # Export results
        df = pd.DataFrame(results)
        self.data_exporter.export_csv_file(df, "experiment_02_freelancer_pop01_rare01_greedy.csv")
        self.logger.info("Exported experiment_02 results")
class Experiment00(object):
    """
    Experiment00 class
    """
    def __init__(self, config):
        """
        Constructor

        :param config:
        :return:
        """
        self.config = config
        self.logger = logging.getLogger("so_logger")
        self.data_provider = DataProvider(self.config)
        self.data_exporter = DataExporter(self.config)

    @staticmethod
    def run_algorithm(args):
        # Run algorithm
        alg = AlgorithmDriver()
        data = alg.run(*args)
        return data

    def set_scaling_factor(self, data):

        # Find appropriate scaling factor
        alg = SetCoverGreedy(self.config, data.submodular_func, data.E)
        sol = alg.run()
        submodular_val = data.submodular_func(sol)
        cost = data.cost_func(sol)
        scaling_factor = data.scaling_func(submodular_val, cost)

        # Update scaling factor
        data.scaling_factor = scaling_factor

    def run(self):
        """
        Run experiment
        :param:
        :return:
        """
        self.logger.info("Starting experiment 00")

        self.expt_config = self.config['experiment_configs']['experiment_00']
        popular_threshold = self.expt_config['popular_threshold']
        rare_threshold = self.expt_config['rare_threshold']

        user_sample_ratios = [
            0.001, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1
        ]
        seeds = [i for i in range(6, 11)]

        sampling_epsilon_values = [0.1, 0.05, 0.01, 0.005]

        num_sampled_skills = 50
        rare_sample_fraction = 0.1
        popular_sample_fraction = 0.1
        scaling_factor = 800

        alg = AlgorithmDriver()
        results = []
        for seed in seeds:
            for user_sample_ratio in user_sample_ratios:
                self.logger.info(
                    "Experiment for user sample ratio: {} and scaling factor: {} and seed: {}"
                    .format(user_sample_ratio, scaling_factor, seed))

                # Load dataset
                data = self.data_provider.read_freelancer_data_obj()
                config = self.config.copy()
                alg.create_sample(config, data, num_sampled_skills,
                                  rare_sample_fraction,
                                  popular_sample_fraction, rare_threshold,
                                  popular_threshold, user_sample_ratio, seed)

                # # Create controlled samples dataset
                # data.sample_skills_to_be_covered_controlled(num_sampled_skills, rare_sample_fraction,
                #                                         popular_sample_fraction, rare_threshold,
                #                                         popular_threshold, user_sample_ratio)

                # # Setting scaling factor of coverage as coverage(S)/cost(S) for set cover solution S
                # self.set_scaling_factor(data)

                self.logger.info(
                    "Scaling factor for submodular function is: {}".format(
                        scaling_factor))

                # Distorted greedy - ICML
                start = timer()
                result = alg.run(config, data, "distorted_greedy", None, None,
                                 scaling_factor, num_sampled_skills,
                                 rare_sample_fraction, popular_sample_fraction,
                                 rare_threshold, popular_threshold,
                                 user_sample_ratio, seed, None)
                end = timer()
                result['runtime'] = end - start
                self.logger.info(
                    "Algorithm: {} and k: {} and runtime: {}".format(
                        "distorted_greedy", None, end - start))
                results.append(result)

                self.logger.info("\n")

                # Cost scaled greedy
                start = timer()
                result = alg.run(config, data, "cost_scaled_greedy", None,
                                 None, scaling_factor, num_sampled_skills,
                                 rare_sample_fraction, popular_sample_fraction,
                                 rare_threshold, popular_threshold,
                                 user_sample_ratio, seed, None)
                end = timer()
                result['runtime'] = end - start
                self.logger.info(
                    "Algorithm: {} and k: {} and runtime: {}".format(
                        "cost_scaled_greedy", None, end - start))
                results.append(result)

                self.logger.info("\n")

                # Cost scaled lazy exact greedy
                start = timer()
                result = alg.run(config, data, "cost_scaled_lazy_greedy", None,
                                 None, scaling_factor, num_sampled_skills,
                                 rare_sample_fraction, popular_sample_fraction,
                                 rare_threshold, popular_threshold,
                                 user_sample_ratio, seed, None)
                end = timer()
                result['runtime'] = end - start
                self.logger.info(
                    "Algorithm: {} and k: {} and runtime: {}".format(
                        "cost_scaled_lazy_greedy", None, end - start))
                results.append(result)

                self.logger.info("\n")

                # Greedy
                start = timer()
                result = alg.run(config, data, "greedy", None, None,
                                 scaling_factor, num_sampled_skills,
                                 rare_sample_fraction, popular_sample_fraction,
                                 rare_threshold, popular_threshold,
                                 user_sample_ratio, seed, None)
                end = timer()
                result['runtime'] = end - start
                self.logger.info(
                    "Algorithm: {} and k: {} and runtime: {}".format(
                        "greedy", None, end - start))
                results.append(result)

                self.logger.info("\n")

                # Unconstrained Linear
                start = timer()
                result = alg.run(config, data, "unconstrained_linear", None,
                                 None, scaling_factor, num_sampled_skills,
                                 rare_sample_fraction, popular_sample_fraction,
                                 rare_threshold, popular_threshold,
                                 user_sample_ratio, seed, None)
                end = timer()
                result['runtime'] = end - start
                self.logger.info(
                    "Algorithm: {} and k: {} and runtime: {}".format(
                        "unconstrained_linear", None, end - start))
                results.append(result)

                self.logger.info("\n")

                # Unconstrained distorted greedy
                start = timer()
                result = alg.run(config, data,
                                 "unconstrained_distorted_greedy", None, None,
                                 scaling_factor, num_sampled_skills,
                                 rare_sample_fraction, popular_sample_fraction,
                                 rare_threshold, popular_threshold,
                                 user_sample_ratio, seed, None)
                end = timer()
                result['runtime'] = end - start
                self.logger.info(
                    "Algorithm: {} and k: {} and runtime: {}".format(
                        "unconstrained_distorted_greedy", None, end - start))
                results.append(result)

                self.logger.info("\n")

                # Stochastic distorted greedy
                for sample_epsilon in sampling_epsilon_values:
                    start = timer()
                    config['algorithms']['stochastic_distorted_greedy_config'][
                        'epsilon'] = sample_epsilon
                    result = alg.run(config, data,
                                     "stochastic_distorted_greedy",
                                     sample_epsilon, None, scaling_factor,
                                     num_sampled_skills, rare_sample_fraction,
                                     popular_sample_fraction, rare_threshold,
                                     popular_threshold, user_sample_ratio,
                                     seed, None)
                    end = timer()
                    result['runtime'] = end - start
                    self.logger.info(
                        "Algorithm: {} and epsilon: {} and k: {} and runtime: {}"
                        .format("stochastic_distorted_greedy", sample_epsilon,
                                None, end - start))
                    results.append(result)

                self.logger.info("\n")

                # Baseline top k
                start = timer()
                result = alg.run(config, data, "baseline_topk", None, None,
                                 scaling_factor, num_sampled_skills,
                                 rare_sample_fraction, popular_sample_fraction,
                                 rare_threshold, popular_threshold,
                                 user_sample_ratio, seed, None)
                end = timer()
                result['runtime'] = end - start
                self.logger.info(
                    "Algorithm: {} and k: {} and runtime: {}".format(
                        "baseline_topk", None, end - start))
                results.append(result)

        self.logger.info("Finished experiment 00")

        # Export results
        df = pd.DataFrame(results)
        self.data_exporter.export_csv_file(
            df, "experiment_00_freelancer_pop01_rare01_greedy.csv")
        self.logger.info("Exported experiment_00 results")
class Experiment04(object):
    """
    Experiment04 class
    """
    def __init__(self, config_):
        """
        Constructor
        :param config_:
        :return:
        """
        self.config = config_
        self.data_exporter = DataExporter(self.config)
        self.logger = logging.getLogger("cuda_logger")
        self.expt_name = "expt_04"
        self.config['RL_parameters']['experiment'] = self.expt_name

    @staticmethod
    def run_rl_training(config):
        rl_trainer = RunRLTrainingJob(config)
        try:
            data = rl_trainer.run()
        except Exception:
            # Dump the offending config before propagating the error
            print(config)
            raise
        return data

    def run(self):
        """
        Run experiment
        """
        num_drivers = np.arange(1000, 6500, 500)
        thresholds = np.arange(5, 55, 5)
        thresholds = np.insert(thresholds, 0, 2)
        combinations = list(itertools.product(num_drivers, thresholds))
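        # thresholds is [2, 5, 10, ..., 50]; 11 driver counts x 11 thresholds
        # = 121 training configurations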

        # Create a pool of processes
        num_processes = mp.cpu_count()
        self.logger.info("Processes: {}".format(num_processes))
        pool = ProcessPool(nodes=num_processes)

        configs = []
        count = 0
        for comb in combinations:
            self.config['RL_parameters'][
                'experiment'] = self.expt_name + "_" + str(count)
            self.config['RL_parameters']['num_drivers'] = comb[0]
            self.config['RL_parameters']['imbalance_threshold'] = comb[1]
            configs.append(deepcopy(self.config))
            count += 1

        self.logger.info("Starting expt_04")

        results = pool.amap(self.run_rl_training, configs).get()
        pool.close()
        pool.join()
        pool.clear()

        self.logger.info("Finished expt_04")

        # Export best episode
        self.data_exporter.export_episode(results, self.expt_name + ".dill")
class Experiment06(object):
    """
    Experiment06 class
    """
    def __init__(self, config_):
        """
        Constructor
        :param config_:
        :return:
        """
        self.config = config_
        self.data_exporter = DataExporter(self.config)
        self.logger = logging.getLogger("cuda_logger")
        self.expt_name = "expt_06"
        self.config['RL_parameters']['experiment'] = self.expt_name

    @staticmethod
    def run_rl_training(config):
        rl_trainer = RunRLTrainingJob(config)
        data = rl_trainer.run()
        return data

    def run(self):
        """
        Run experiment
        """
        days = [
            'Sunday_00_', 'Monday_00_', 'Tuesday_00_', 'Wednesday_00_',
            'Thursday_00_', 'Friday_00_', 'Saturday_00_', 'Sunday_01_',
            'Monday_01_', 'Tuesday_01_', 'Wednesday_01_', 'Thursday_01_',
            'Friday_01_', 'Saturday_01_', 'Sunday_02_', 'Monday_02_',
            'Tuesday_02_', 'Wednesday_02_', 'Thursday_02_', 'Friday_02_',
            'Saturday_02_', 'Sunday_03_', 'Monday_03_', 'Tuesday_03_',
            'Wednesday_03_', 'Thursday_03_', 'Friday_03_', 'Saturday_03_'
        ]

        # Create a pool of processes
        num_processes = mp.cpu_count()
        pool = ProcessPool(nodes=num_processes)

        configs = []
        count = 0

        for day in days:
            self.config['RL_parameters'][
                'experiment'] = self.expt_name + "_" + str(count)
            self.config['RL_parameters'][
                'city_states_filename'] = day + 'city_states.dill'
            self.config['RL_parameters'][
                'best_model_filename'] = day + 'model.dill'
            configs.append(deepcopy(self.config))
            count += 1

        self.logger.info("Starting expt_06")

        results = pool.amap(self.run_rl_training, configs).get()
        pool.close()
        pool.join()
        pool.clear()

        self.logger.info("Finished expt_06")

        # Export best episode
        self.data_exporter.export_episode(results, self.expt_name + ".dill")
    def run(self):
        """
        Execute the job
        :param:
        :return:
        """
        self.logger.info("Starting job: FreelancerDataProcessor\n")
        data_provider = DataProvider(self.config)
        data_exporter = DataExporter(self.config)

        # Read freelancer data
        df = data_provider.read_freelancer_user_data()
        df_cost = df[[1]]  # Salary/Hour
        df_skills = df[df.columns[4::2]]
        df_skills = df_skills.replace(to_replace=["Other Skills"], value="")
        df_skills = (df_skills.iloc[:, 0].map(str) + ',' +
                     df_skills.iloc[:, 1].map(str) + ',' +
                     df_skills.iloc[:, 2].map(str) + ',' +
                     df_skills.iloc[:, 3].map(str) + ',' +
                     df_skills.iloc[:, 4].map(str) + ',' +
                     df_skills.iloc[:, 5].map(str))  # Skills

        user_df = pd.DataFrame()
        user_df['cost'] = df_cost.iloc[:, 0].tolist()
        # Converting all strings to lower case
        user_df['skills'] = df_skills.str.lower().tolist()

        user_df = user_df.reset_index(drop=True)
        user_df = user_df.assign(user_id=user_df.index.values)
        user_df = user_df.assign(skills=user_df.apply(
            lambda x: x['skills'][:-1].split(','), axis=1))

        # Convert cost to integers
        user_df.cost = user_df.cost.astype(int)

        # Read skills data
        df = data_provider.read_freelancer_skill_data()
        df = df[[1]]
        df.columns = ['skill']
        skill_df = df.assign(skill_id=df.index.values)

        # Create multilabel binarizer
        mlb = MultiLabelBinarizer(classes=skill_df.skill.values)

        # One hot encoding of user skills
        skills = mlb.fit_transform(user_df['skills'])

        # Create dataset
        users = user_df.to_dict('records')
        for i in range(len(users)):
            users[i]['skills_array'] = skills[i]

        # Export csv files
        data_exporter.export_csv_file(user_df,
                                      "freelancer/freelancer_user_df.csv")
        data_exporter.export_csv_file(skill_df,
                                      "freelancer/freelancer_skill_df.csv")

        # Scaling factor for submodular function
        scaling_factor = 1

        # Create and export data object to be used in experiments
        # containing all methods related to freelancer data
        freelancer = FreelancerData(self.config, user_df, skill_df, users,
                                    scaling_factor)
        data_exporter.export_dill_file(freelancer,
                                       "freelancer/freelancer_data.dill")

        self.logger.info("Finished job: FreelancerDataProcessor")
class Experiment08(object):
    """
    Experiment08 class
    """
    def __init__(self, config_):
        """
        Constructor
        :param config_:
        :return:
        """
        self.config = config_
        self.data_exporter = DataExporter(self.config)
        self.logger = logging.getLogger("cuda_logger")
        self.expt_name = "expt_08"
        self.config['Model_testing']['experiment'] = self.expt_name

    @staticmethod
    def run_rl_testing(config):
        rl_tester = RunRLTestingJob(config)
        data = rl_tester.run()
        return data

    def run(self):
        """
        Run experiment
        """
        days = [
            'Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday',
            'Saturday'
        ]
        weeks_of_month = ['00', '01', '02', '03', '04']
        imbalance_thresholds = [2]
        model_num_drivers = [4000, 5000, 6000, 7000, 8000, 9000, 10000]
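        # Enumerate (day, week-of-month, threshold, drivers) models; keep only
        # combinations whose trained model and test city-state files both exist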

        test_combinations = []
        for model_day in days:
            for model_wom in weeks_of_month:
                for model_threshold in imbalance_thresholds:
                    for model_drivers in model_num_drivers:
                        model_args = [
                            model_day, model_wom,
                            str(model_drivers),
                            str(model_threshold)
                        ]
                        model_filename = "_".join(model_args) + "_model.dill"
                        if os.path.isfile(self.config['app']['DATA_DIR'] +
                                          'models/' + model_filename):
                            for test_wom in weeks_of_month:
                                for test_drivers in range(
                                        model_drivers - 3000,
                                        model_drivers + 4000, 1000):
                                    test_file = model_day + '_' + test_wom + '_city_states.dill'
                                    if os.path.isfile(
                                            self.config['app']['DATA_DIR'] +
                                            'city_states/' + test_file):
                                        test_combinations.append({
                                            'model': model_filename,
                                            'test_dow': model_day,
                                            'test_wom': test_wom,
                                            'test_drivers': test_drivers
                                        })
        self.logger.info("Total test combinations: {}".format(
            len(test_combinations)))

        # Create a pool of processes
        num_processes = mp.cpu_count()
        pool = ProcessPool(nodes=num_processes)

        configs = []
        count = 0

        for comb in test_combinations:
            self.config['Model_testing'][
                'experiment'] = self.expt_name + "_" + str(count)
            self.config['Model_testing']['city_states_filename'] = (
                comb['test_dow'] + '_' + comb['test_wom'] +
                '_city_states.dill')
            self.config['Model_testing']['model_filename'] = comb['model']
            self.config['RL_parameters']['num_drivers'] = comb['test_drivers']
            self.config['RL_parameters']['num_strategic_drivers'] = comb[
                'test_drivers']

            configs.append(deepcopy(self.config))
            count += 1

        self.logger.info("Starting expt_08")

        results = pool.amap(self.run_rl_testing, configs).get()
        pool.close()
        pool.join()
        pool.clear()

        self.logger.info("Finished expt_08")

        # Export best episode
        self.data_exporter.export_episode(results, self.expt_name + ".dill")
class Experiment04(object):
    """
    Experiment04 class
    """
    def __init__(self, config):
        """
        Constructor

        :param config:
        :return:
        """
        self.config = config
        self.logger = logging.getLogger("so_logger")
        self.data_provider = DataProvider(self.config)
        self.data_exporter = DataExporter(self.config)

    @staticmethod
    def run_algorithm(args):
        # Run algorithm
        alg = AlgorithmDriver()
        data = alg.run(*args)
        return data

    def set_scaling_factor(self, data):

        # Find appropriate scaling factor
        alg = SetCoverGreedy(self.config, data.submodular_func, data.E)
        sol = alg.run()
        submodular_val = data.submodular_func(sol)
        cost = data.cost_func(sol)
        scaling_factor = data.scaling_func(submodular_val, cost)

        # Update scaling factor
        data.scaling_factor = scaling_factor

    def run(self):
        """
        Run experiment
        :param:
        :return:
        """
        self.logger.info("Starting experiment 04")

        self.expt_config = self.config['experiment_configs']['experiment_04']
        popular_threshold = self.expt_config['popular_threshold']
        rare_threshold = self.expt_config['rare_threshold']
        partition_type = self.expt_config['partition_type']

        # The cardinality constraint and partition count from expt_config are
        # ignored here; both are swept over the ranges below instead
        user_sample_ratios = [1]
        seeds = [i for i in range(6, 10)]
        cardinality_constraints = [i for i in range(1, 11)]
        num_of_partitions = [i for i in range(1, 6)]
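        # Sweep: 4 seeds x 10 cardinality constraints x 5 partition counts
        # = 200 runs per algorithm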

        num_sampled_skills = 50
        rare_sample_fraction = 0.1
        popular_sample_fraction = 0.8
        scaling_factor = 800

        alg = AlgorithmDriver()
        results = []
        for seed in seeds:
            for user_sample_ratio in user_sample_ratios:
                for cardinality_constraint in cardinality_constraints:
                    for num_of_partition in num_of_partitions:
                        self.logger.info(
                            "Experiment for user sample ratio: {} and scaling factor: {} and seed: {} and cardinality constraint:{} and num of partitions:{} "
                            .format(user_sample_ratio, scaling_factor, seed,
                                    cardinality_constraint, num_of_partition))

                        # Load dataset
                        data = self.data_provider.read_guru_data_obj()
                        config = self.config.copy()
                        # Creating the ground set of users
                        alg.create_sample(config, data, num_sampled_skills,
                                          rare_sample_fraction,
                                          popular_sample_fraction,
                                          rare_threshold, popular_threshold,
                                          user_sample_ratio, seed)
                        # Assigning users to partitions uniformly at random
                        alg.create_partitions(data, num_of_partition,
                                              partition_type,
                                              cardinality_constraint)

                        self.logger.info(
                            "Scaling factor for submodular function is: {}".
                            format(scaling_factor))

                        # Partition matroid greedy
                        start = timer()
                        result = alg.run(
                            config, data, "partition_matroid_greedy", None,
                            None, scaling_factor, num_sampled_skills,
                            rare_sample_fraction, popular_sample_fraction,
                            rare_threshold, popular_threshold,
                            user_sample_ratio, seed, None)
                        end = timer()
                        result['runtime'] = end - start
                        result[
                            'cardinality_constraint'] = cardinality_constraint
                        result['num_of_partitions'] = num_of_partition
                        self.logger.info(
                            "Algorithm: {} and k: {} and runtime: {}".format(
                                "partition_matroid_greedy", None, end - start))
                        results.append(result)

                        self.logger.info("\n")

                        # Cost scaled partition matroid greedy
                        start = timer()
                        result = alg.run(
                            config, data,
                            "cost_scaled_partition_matroid_greedy", None, None,
                            scaling_factor, num_sampled_skills,
                            rare_sample_fraction, popular_sample_fraction,
                            rare_threshold, popular_threshold,
                            user_sample_ratio, seed, None)
                        end = timer()
                        result['runtime'] = end - start
                        result[
                            'cardinality_constraint'] = cardinality_constraint
                        result['num_of_partitions'] = num_of_partition
                        self.logger.info(
                            "Algorithm: {} and k: {} and runtime: {}".format(
                                "cost_scaled_partition_matroid_greedy", None,
                                end - start))
                        results.append(result)

                        self.logger.info("\n")

                        # Cost scaled partition matroid lazy exact greedy
                        start = timer()
                        result = alg.run(
                            config, data,
                            "cost_scaled_partition_matroid_lazy_greedy", None,
                            None, scaling_factor, num_sampled_skills,
                            rare_sample_fraction, popular_sample_fraction,
                            rare_threshold, popular_threshold,
                            user_sample_ratio, seed, None)
                        end = timer()
                        result['runtime'] = end - start
                        result[
                            'cardinality_constraint'] = cardinality_constraint
                        result['num_of_partitions'] = num_of_partition
                        self.logger.info(
                            "Algorithm: {} and k: {} and runtime: {}".format(
                                "cost_scaled_partition_matroid_lazy_greedy",
                                None, end - start))
                        results.append(result)

                        self.logger.info("\n")

                        # Baseline Top k
                        start = timer()
                        result = alg.run(config, data, "baseline_topk_matroid",
                                         None, None, scaling_factor,
                                         num_sampled_skills,
                                         rare_sample_fraction,
                                         popular_sample_fraction,
                                         rare_threshold, popular_threshold,
                                         user_sample_ratio, seed, None)
                        end = timer()
                        result['runtime'] = end - start
                        result[
                            'cardinality_constraint'] = cardinality_constraint
                        result['num_of_partitions'] = num_of_partition
                        self.logger.info(
                            "Algorithm: {} and k: {} and runtime: {}".format(
                                "baseline_topk_matroid", None, end - start))
                        results.append(result)

                        self.logger.info("\n")

        self.logger.info("Finished experiment 04")

        # Export results
        df = pd.DataFrame(results)
        self.data_exporter.export_csv_file(
            df, "experiment_04_guru_salary_pop08_rare01.csv")
        self.logger.info("Exported experiment 04 results")
class BaselineDriver(object):
    def __init__(self, config):
        self.config = config
        self.logger = logging.getLogger("baseline_logger")
        self.data_provider = DataProvider(self.config)
        self.data_exporter = DataExporter(self.config)

    @staticmethod
    def run_baseline(baseline_config):
        baseline_name = baseline_config['name']
        baseline_count = baseline_config['count']
        config = baseline_config['config']
        city_states = baseline_config['city_states']
        # Map baseline names to agent classes; all three share the same
        # run(city_states) -> per-episode rewards interface
        agents = {'cDQN': cDQN, 'cA2C': cA2C, 'A2C': A2C}
        episode_rewards = []
        if baseline_name in agents:
            baseline = agents[baseline_name](config)
            rewards = baseline.run(city_states)
            for episode, earnings in enumerate(rewards):
                episode_rewards.append({
                    'agent': baseline_name,
                    'episode': episode,
                    'run': baseline_count,
                    'earnings': earnings
                })
        return episode_rewards

    def run(self):
        self.logger.info("Starting baselines")
        city_states = self.data_provider.read_city_states()
        baseline_list = self.config['baselines']['baseline_list']

        # Create a pool of processes
        num_processes = mp.cpu_count()
        self.logger.info("Processes: {}".format(num_processes))
        pool = ProcessPool(nodes=num_processes)

        configs = []
        for count in range(10):
            for name in baseline_list:
                configs.append({
                    'name': name,
                    'count': count,
                    'config': self.config,
                    'city_states': city_states
                })

        results = pool.amap(self.run_baseline, configs).get()
        pool.close()
        pool.join()
        pool.clear()

        episode_rewards = []
        for result in results:
            episode_rewards += result

        self.data_exporter.export_baseline_data(episode_rewards)
        self.logger.info("Finished baselines")