def __init__(self, user_agent, site_name=None, disable_update_check=False):
    """Initialize our connection with a reddit.

    The user_agent is how your application identifies itself. Read the
    official API guidelines for user_agents
    https://github.com/reddit/reddit/wiki/API. Applications using default
    user_agents such as "Python/urllib" are drastically limited.

    site_name allows you to specify which reddit you want to connect to.
    The installation defaults are reddit.com, if you only need to connect
    to reddit.com then you can safely ignore this. If you want to connect
    to another reddit, set site_name to the name of that reddit. This must
    match with an entry in praw.ini. If site_name is None, then the site
    name will be looked for in the environment variable REDDIT_SITE. If it
    is not found there, the default site name reddit matching reddit.com
    will be used.

    disable_update_check allows you to prevent an update check from
    occurring in spite of the check_for_updates setting in praw.ini.
    """
    if not user_agent or not isinstance(user_agent, six.string_types):
        raise TypeError('User agent must be a non-empty string.')

    self.DEFAULT_HEADERS['User-agent'] = UA_STRING % user_agent
    self.config = Config(site_name or os.getenv('REDDIT_SITE') or 'reddit')
    self.http = requests.session()
    self.modhash = self.user = None

    # Check for updates if permitted and this is the first Reddit instance
    if not disable_update_check and not self.update_checked \
            and self.config.check_for_updates:
        update_check(__name__, __version__)
        self.update_checked = True
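A minimal usage sketch of the constructor above, assuming the enclosing class is exposed as praw's Reddit object; the user agent string and site name are illustrative only, not values taken from the source.

# Hypothetical usage (not part of the library): a descriptive user_agent per the
# API guidelines, and a site_name that must match an entry in praw.ini.
r = Reddit(user_agent='myapp/0.1 by u/example_user',
           site_name='reddit',              # assumed praw.ini section name
           disable_update_check=True)       # skip the update_checker call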
def autoclean(input_dataframe, drop_nans=False, copy=False, encoder=None,
              encoder_kwargs=None, ignore_update_check=False):
    """Performs a series of automated data cleaning transformations on the provided data set

    Parameters
    ----------
    input_dataframe: pandas.DataFrame
        Data set to clean
    drop_nans: bool
        Drop all rows that have a NaN in any column (default: False)
    copy: bool
        Make a copy of the data set (default: False)
    encoder: category_encoders transformer
        A valid category_encoders transformer, which is passed an inferred cols list. Default (None: LabelEncoder)
    encoder_kwargs: dict
        Keyword arguments passed to the encoder for categorical features. Default (None)
    ignore_update_check: bool
        Do not check for the latest version of datacleaner

    Returns
    ----------
    output_dataframe: pandas.DataFrame
        Cleaned data set
    """
    global update_checked
    if ignore_update_check:
        update_checked = True

    if not update_checked:
        update_check('datacleaner', __version__)
        update_checked = True

    if copy:
        input_dataframe = input_dataframe.copy()

    if drop_nans:
        input_dataframe.dropna(inplace=True)

    if encoder_kwargs is None:
        encoder_kwargs = {}

    for column in input_dataframe.columns.values:
        # Replace NaNs with the median or mode of the column depending on the column type
        try:
            input_dataframe[column].fillna(input_dataframe[column].median(), inplace=True)
        except TypeError:
            input_dataframe[column].fillna(input_dataframe[column].mode()[0], inplace=True)

        # Encode all strings with numerical equivalents
        if str(input_dataframe[column].values.dtype) == 'object':
            if encoder is not None:
                column_encoder = encoder(**encoder_kwargs).fit(input_dataframe[column].values)
            else:
                column_encoder = LabelEncoder().fit(input_dataframe[column].values)

            input_dataframe[column] = column_encoder.transform(input_dataframe[column].values)

    return input_dataframe
def test_update_check__successful(self):
    prev_stdout = sys.stdout
    sys.stdout = StringIO()
    try:
        update_check(self.TRACKED_PACKAGE, '0.0.1', bypass_cache=True)
    finally:
        result = sys.stdout
        sys.stdout = prev_stdout
    self.assertTrue(len(result.getvalue()) > 0)
def test_update_check_failed(self):
    prev_stdout = sys.stdout
    sys.stdout = StringIO()
    try:
        update_check('update_checker_slkdflj', '0.0.1')
    finally:
        result = sys.stdout
        sys.stdout = prev_stdout
    self.assertTrue(len(result.getvalue()) == 0)
def test_update_check_successful(self):
    prev_stdout = sys.stdout
    sys.stdout = StringIO()
    try:
        update_check('update_checker', '0.0.1')
    finally:
        result = sys.stdout
        sys.stdout = prev_stdout
    self.assertTrue(len(result.getvalue()) > 0)
def autoclean(input_dataframe, drop_nans=False, copy=False, encoder=None, encoder_kwargs=None):
    """Performs a series of automated data cleaning transformations on the provided data set

    Parameters
    ----------
    input_dataframe: pandas.DataFrame
        Data set to clean
    drop_nans: bool
        Drop all rows that have a NaN in any column (default: False)
    copy: bool
        Make a copy of the data set (default: False)
    encoder: category_encoders transformer
        A valid category_encoders transformer, which is passed an inferred cols list. Default (None: LabelEncoder)
    encoder_kwargs: dict
        Keyword arguments passed to the encoder for categorical features. Default (None)

    Returns
    ----------
    output_dataframe: pandas.DataFrame
        Cleaned data set
    """
    global update_checked
    if not update_checked:
        update_check('datacleaner', __version__)
        update_checked = True

    if encoder_kwargs is None:
        encoder_kwargs = {}

    if encoder is None:
        encoder = LabelEncoder

    if copy:
        input_dataframe = input_dataframe.copy()

    if drop_nans:
        input_dataframe.dropna(inplace=True)

    for column in input_dataframe.columns.values:
        # Replace NaNs with the median or mode of the column depending on the column type
        # If there are very many levels in the column, then it is probably continuous
        if len(input_dataframe[column].unique()) > 0.2 * len(input_dataframe):
            input_dataframe[column].fillna(input_dataframe[column].median(), inplace=True)
        else:
            input_dataframe[column].fillna(input_dataframe[column].mode()[0], inplace=True)

        # Encode all strings with numerical equivalents
        if str(input_dataframe[column].values.dtype) == 'object':
            input_dataframe[column] = encoder(**encoder_kwargs).fit_transform(input_dataframe[column].values)

    return input_dataframe
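A short usage sketch of the autoclean variant above, under the assumption that pandas, numpy, and sklearn's LabelEncoder are importable as in the snippet; the DataFrame contents are made up for illustration.

# Illustrative only: numeric NaNs get imputed, object columns get label-encoded.
import numpy as np
import pandas as pd

df = pd.DataFrame({
    'age':   [25, np.nan, 31, 22, 40, 35, np.nan, 29, 33, 27],
    'color': ['red', 'blue', 'red', 'blue', 'red',
              'red', 'blue', 'red', 'red', 'blue'],
})
cleaned = autoclean(df, copy=True)  # original df is left untouched because copy=True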
def __init__(self, config_section, plugin_dir, enable_logging):
    if not self.update_checked:
        update_check(__name__, __version__)
        self.update_checked = True

    self.start_time = datetime.utcnow()
    if plugin_dir:
        if os.path.isdir(plugin_dir):
            sys.path.append(plugin_dir)
        else:
            print("`{0}` is not a directory.".format(plugin_dir))
    config = self._get_config(config_section)
    self._delayed_events = []
    self._loaded_plugins = {}
    self.api = Bot(config["auth_id"], config["user_id"], rate_limit=0.575)
    self.api.debug = enable_logging
    self.api.on("add_dj", self.handle_add_dj)
    self.api.on("booted_user", self.handle_booted_user)
    self.api.on("deregistered", self.handle_user_leave)
    self.api.on("new_moderator", self.handle_add_moderator)
    self.api.on("post_message", self.run_delayed_events)
    self.api.on("pmmed", self.handle_pm)
    self.api.on("ready", self.handle_ready)
    self.api.on("registered", self.handle_user_join)
    self.api.on("rem_dj", self.handle_remove_dj)
    self.api.on("rem_moderator", self.handle_remove_moderator)
    self.api.on("roomChanged", self.handle_room_change)
    self.api.on("speak", self.handle_room_message)
    self.bot_id = config["user_id"]
    self.commands = {
        "/about": self.cmd_about,
        "/commands": self.cmd_commands,
        "/help": self.cmd_help,
        "/join": self.cmd_join,
        "/leave": self.cmd_leave,
        "/pgload": self.cmd_plugin_load,
        "/pgreload": self.cmd_plugin_reload,
        "/pgunload": self.cmd_plugin_unload,
        "/plugins": self.cmd_plugins,
        "/uptime": self.cmd_uptime,
    }
    self.config = config
    self.dj_ids = set()
    self.listener_ids = set()
    self.max_djs = None
    self.moderator_ids = set()
    self.username = None
    # Load plugins after everything has been initialized
    for plugin in config["plugins"].split("\n"):
        self.load_plugin(plugin)
    self.api.connect(config["room_id"])
    self.api.ws.on_error = handle_error
def test_update_check__untracked_package(self):
    prev_stdout = sys.stdout
    sys.stdout = StringIO()
    try:
        update_check(self.UNTRACKED_PACKAGE, '0.0.1', bypass_cache=True)
    finally:
        result = sys.stdout
        sys.stdout = prev_stdout
    self.assertEqual("update_checker does not support 'requests'\n",
                     result.getvalue())
def test_update_check__unsuccessful(self):
    prev_stdout = sys.stdout
    sys.stdout = StringIO()
    try:
        update_check(self.TRACKED_PACKAGE, '0.0.1', bypass_cache=True,
                     url='http://sdlkjsldfkjsdlkfj.com')
    finally:
        result = sys.stdout
        sys.stdout = prev_stdout
    self.assertTrue(len(result.getvalue()) == 0)
def __init__(self, config_section, plugin_dir, enable_logging):
    if not self.update_checked:
        update_check(__name__, __version__)
        self.update_checked = True

    self.start_time = datetime.utcnow()
    if plugin_dir:
        if os.path.isdir(plugin_dir):
            sys.path.append(plugin_dir)
        else:
            print('`{0}` is not a directory.'.format(plugin_dir))
    config = self._get_config(config_section)
    self._delayed_events = []
    self._loaded_plugins = {}
    self.api = Bot(config['auth_id'], config['user_id'], rate_limit=0.575)
    self.api.debug = enable_logging
    self.api.on('add_dj', self.handle_add_dj)
    self.api.on('booted_user', self.handle_booted_user)
    self.api.on('deregistered', self.handle_user_leave)
    self.api.on('new_moderator', self.handle_add_moderator)
    self.api.on('post_message', self.run_delayed_events)
    self.api.on('pmmed', self.handle_pm)
    self.api.on('ready', self.handle_ready)
    self.api.on('registered', self.handle_user_join)
    self.api.on('rem_dj', self.handle_remove_dj)
    self.api.on('rem_moderator', self.handle_remove_moderator)
    self.api.on('roomChanged', self.handle_room_change)
    self.api.on('speak', self.handle_room_message)
    self.bot_id = config['user_id']
    self.commands = {'/about': self.cmd_about,
                     '/commands': self.cmd_commands,
                     '/help': self.cmd_help,
                     '/join': self.cmd_join,
                     '/leave': self.cmd_leave,
                     '/pgload': self.cmd_plugin_load,
                     '/pgreload': self.cmd_plugin_reload,
                     '/pgunload': self.cmd_plugin_unload,
                     '/plugins': self.cmd_plugins,
                     '/uptime': self.cmd_uptime}
    self.config = config
    self.dj_ids = set()
    self.listener_ids = set()
    self.max_djs = None
    self.moderator_ids = set()
    self.username = None
    # Load plugins after everything has been initialized
    for plugin in config['plugins'].split('\n'):
        self.load_plugin(plugin)
    self.api.connect(config['room_id'])
    self.api.ws.on_error = handle_error
def main():
    """Provide the entry point to the hackday_bot command."""
    args = docopt(__doc__, version='hackday_bot v{}'.format(__version__))
    logger = prepare_logger('DEBUG' if args['--debug'] else 'INFO')
    update_check(__package__, __version__)

    reddit = praw.Reddit(args['SITE'], check_for_updates=False,
                         user_agent='hackday_bot/{}'.format(__version__))
    subreddit = reddit.subreddit(args['SUBREDDIT'])
    try:
        subreddit.name
    except PrawcoreException:
        logger.error('Invalid subreddit: {}'.format(args['SUBREDDIT']))
        return 1

    return Bot(subreddit).run()
def autoclean(input_dataframe, drop_nans=False, copy=False, encoder=None,
              encoder_kwargs=None, ignore_update_check=False, **kwargs):
    """Performs a series of automated data cleaning transformations on the provided data set

    Parameters
    ----------
    input_dataframe: pandas.DataFrame
        Data set to clean
    drop_nans: bool
        Drop all rows that have a NaN in any column (default: False)
    copy: bool
        Make a copy of the data set (default: False)
    encoder: category_encoders transformer
        A valid category_encoders transformer, which is passed an inferred cols list. Default (None: LabelEncoder)
    encoder_kwargs: dict
        Keyword arguments passed to the encoder for categorical features. Default (None)
    ignore_update_check: bool
        Do not check for the latest version of datacleaner
    fill_func: function, method, or string in `full_func_list`
        The function used to fill NaNs (default: 'median')

    Returns
    ----------
    output_dataframe: pandas.DataFrame
        Cleaned data set
    """
    global update_checked
    if ignore_update_check:
        update_checked = True

    if not update_checked:
        update_check('datacleaner', __version__)
        update_checked = True

    if copy:
        input_dataframe = input_dataframe.copy()

    if drop_nans:
        input_dataframe.dropna(inplace=True)

    if encoder_kwargs is None:
        encoder_kwargs = {}

    fill_func = kwargs.pop('fill_func', 'median')

    import inspect
    assert inspect.isfunction(fill_func) or inspect.ismethod(fill_func) or type(fill_func) == str

    full_func_list = ['sum', 'max', 'min', 'argmax', 'argmin', 'mean', 'median', 'prod']
    if type(fill_func) == str and fill_func in full_func_list:
        # Map e.g. 'median' to the NaN-aware numpy.lib.nanfunctions.nanmedian
        fill_func = 'nan{func}'.format(func=fill_func)
        mod = __import__('numpy.lib.nanfunctions', fromlist=[fill_func])
        fill_func = getattr(mod, fill_func)

    for column in input_dataframe.columns.values:
        # Replace NaNs with the fill function (median by default) or the mode of the
        # column, depending on the column type
        try:
            input_dataframe[column].fillna(fill_func(input_dataframe[column]), inplace=True)
        except TypeError:
            most_frequent = input_dataframe[column].mode()
            # If the mode can't be computed, use the nearest valid value
            # See https://github.com/rhiever/datacleaner/issues/8
            if len(most_frequent) > 0:
                input_dataframe[column].fillna(input_dataframe[column].mode()[0], inplace=True)
            else:
                input_dataframe[column].fillna(method='bfill', inplace=True)
                input_dataframe[column].fillna(method='ffill', inplace=True)

        # Encode all strings with numerical equivalents
        if str(input_dataframe[column].values.dtype) == 'object':
            if encoder is not None:
                column_encoder = encoder(**encoder_kwargs).fit(input_dataframe[column].values)
            else:
                column_encoder = LabelEncoder().fit(input_dataframe[column].values)

            input_dataframe[column] = column_encoder.transform(input_dataframe[column].values)

    return input_dataframe
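A small sketch of the `fill_func` hook introduced in this variant; the DataFrame is invented for illustration, and the behavior assumed is exactly what the code above does (a string name is mapped to numpy's NaN-aware equivalent).

# Illustrative only: fill numeric NaNs with the column mean instead of the median.
import numpy as np
import pandas as pd

df = pd.DataFrame({'x': [1.0, np.nan, 3.0], 'y': [2.0, 2.0, np.nan]})
cleaned = autoclean(df, copy=True, fill_func='mean')  # 'mean' resolves to numpy's nanmean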
def autoclean_cv(training_dataframe, testing_dataframe, drop_nans=False, copy=False,
                 encoder=None, encoder_kwargs=None):
    """Performs a series of automated data cleaning transformations on the provided training and testing data sets

    Unlike `autoclean()`, this function takes cross-validation into account by learning
    the data transformations from only the training set, then applying those
    transformations to both the training and testing set. By doing so, this function
    will prevent information leak from the training set into the testing set.

    Parameters
    ----------
    training_dataframe: pandas.DataFrame
        Training data set
    testing_dataframe: pandas.DataFrame
        Testing data set
    drop_nans: bool
        Drop all rows that have a NaN in any column (default: False)
    copy: bool
        Make a copy of the data set (default: False)
    encoder: category_encoders transformer
        A valid category_encoders transformer, which is passed an inferred cols list. Default (None: LabelEncoder)
    encoder_kwargs: dict
        Keyword arguments passed to the encoder for categorical features. Default (None)

    Returns
    ----------
    output_training_dataframe: pandas.DataFrame
        Cleaned training data set
    output_testing_dataframe: pandas.DataFrame
        Cleaned testing data set
    """
    global update_checked
    if not update_checked:
        update_check('datacleaner', __version__)
        update_checked = True

    if set(training_dataframe.columns.values) != set(testing_dataframe.columns.values):
        raise ValueError('The training and testing DataFrames do not have the same columns. '
                         'Make sure that you are providing the same columns.')

    if encoder_kwargs is None:
        encoder_kwargs = {}

    if encoder is None:
        encoder = LabelEncoder

    if copy:
        training_dataframe = training_dataframe.copy()
        testing_dataframe = testing_dataframe.copy()

    if drop_nans:
        training_dataframe.dropna(inplace=True)
        testing_dataframe.dropna(inplace=True)

    for column in training_dataframe.columns.values:
        # Replace NaNs with the median or mode of the column depending on the column type
        # If there are very many levels in the column, then it is probably continuous
        if len(training_dataframe[column].unique()) > 0.2 * len(training_dataframe):
            column_median = training_dataframe[column].median()
            training_dataframe[column].fillna(column_median, inplace=True)
            testing_dataframe[column].fillna(column_median, inplace=True)
        else:
            column_mode = training_dataframe[column].mode()[0]
            training_dataframe[column].fillna(column_mode, inplace=True)
            testing_dataframe[column].fillna(column_mode, inplace=True)

        # Encode all strings with numerical equivalents, fitting the encoder on the
        # training set only so the testing set cannot leak into the fit
        if str(training_dataframe[column].values.dtype) == 'object':
            column_label_encoder = encoder(**encoder_kwargs).fit(training_dataframe[column].values)
            training_dataframe[column] = column_label_encoder.transform(training_dataframe[column].values)
            testing_dataframe[column] = column_label_encoder.transform(testing_dataframe[column].values)

    return training_dataframe, testing_dataframe
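A hedged usage sketch of `autoclean_cv`: the data and the use of sklearn's train_test_split are illustrative, not from the source. Only numeric columns are used here because, as written above, an encoder fitted on the training split will raise on category labels that appear only in the testing split.

# Illustrative only: learn imputation statistics on the training split, reuse them on the test split.
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

df = pd.DataFrame({'x': np.r_[1.0, np.nan, 3.0, 4.0, 5.0, np.nan, 7.0, 8.0],
                   'y': np.r_[0, 1, 0, 1, 1, 0, 1, 0]})
train_df, test_df = train_test_split(df, test_size=0.25, random_state=0)
train_clean, test_clean = autoclean_cv(train_df, test_df, copy=True)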
def main():
    """Provide the entry point into the reddit_alert program."""
    usage = 'Usage: %prog [options] KEYWORD...'
    parser = arg_parser(usage=usage)
    parser.add_option('-s', '--subreddit', action='append',
                      help=('When at least one `-s` option is provided '
                            '(multiple can be) only alert for comments in the '
                            'indicated subreddit(s).'))
    parser.add_option('-I', '--ignore-user', action='append', metavar='USER',
                      help=('Ignore comments from the provided user. Can be '
                            'supplied multiple times.'))
    parser.add_option('-m', '--message', metavar='USER',
                      help=('When set, send a reddit message to USER with the '
                            'alert. Requires the alert script to login.'))
    options, args = parser.parse_args()
    if not args:
        parser.error('At least one KEYWORD must be provided.')

    # Create the reddit session, and login if necessary
    session = praw.Reddit('reddit_alert (prawtools {0})'.format(__version__),
                          site_name=options.site, disable_update_check=True)
    if options.message:
        session.login(options.user, options.pswd)
        msg_to = session.get_redditor(options.message)

    # Check for updates
    if not options.disable_update_check:
        update_check('prawtools', __version__)

    # Build regex
    args = [x.lower() for x in args]
    reg_prefix = r'(?:^|[^a-z])'  # Any character (or start) can precede
    reg_suffix = r'(?:$|[^a-z])'  # Any character (or end) can follow
    regex = re.compile(r'{0}({1}){2}'.format(reg_prefix, '|'.join(args), reg_suffix),
                       re.IGNORECASE)

    # Determine subreddit or multireddit
    if options.subreddit:
        subreddit = '+'.join(sorted(options.subreddit))
    else:
        subreddit = 'all'

    print('Alerting on:')
    for item in sorted(args):
        print(' * {0}'.format(item))
    print('using the comment stream: http://www.reddit.com/r/{0}/comments'
          .format(subreddit))

    # Build ignore set
    if options.ignore_user:
        ignore_users = set(x.lower() for x in options.ignore_user)
    else:
        ignore_users = set()

    try:
        for comment in praw.helpers.comment_stream(session, subreddit,
                                                   verbosity=options.verbose):
            if comment.author and comment.author.name.lower() in ignore_users:
                continue
            match = regex.search(comment.body)
            if match:
                keyword = match.group(1).lower()
                url = quick_url(comment)
                print('{0}: {1}'.format(keyword, url))
                if options.message:
                    msg_to.send_message(
                        'Reddit Alert: {0}'.format(keyword),
                        '{0}\n\nby /u/{1}\n\n---\n\n{2}'.format(
                            url, comment.author, comment.body))
    except KeyboardInterrupt:
        sys.stderr.write('\n')
        print('Goodbye!\n')
# -*- coding: utf-8 -*-

'''
Copyright 2016 Randal S. Olson

This file is part of the TPOT library.

The TPOT library is free software: you can redistribute it and/or
modify it under the terms of the GNU General Public License as published by the
Free Software Foundation, either version 3 of the License, or (at your option)
any later version.

The TPOT library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
details. You should have received a copy of the GNU General Public License along
with the TPOT library. If not, see http://www.gnu.org/licenses/.
'''

from ._version import __version__
from .tpot import TPOT, main
from update_checker import update_check

# Prompt the user if their version is out of date
update_check('tpot', __version__)
def _check_for_update(self):
    if UPDATE_CHECKER_MISSING:
        return
    if not Reddit.update_checked and self.config.check_for_updates:
        update_check(__package__, __version__)
        Reddit.update_checked = True
def _fit_init(self):
    # Initialization for the fit function
    if not self.warm_start or not hasattr(self, '_pareto_front'):
        self._pop = []
        self._pareto_front = None
        self._last_optimized_pareto_front = None
        self._last_optimized_pareto_front_n_gens = 0

    self._optimized_pipeline = None
    self._optimized_pipeline_score = None
    self._exported_pipeline_text = ""
    self.fitted_pipeline_ = None
    self._fitted_imputer = None
    self._imputed = False
    self._memory = None  # initial Memory setting for sklearn pipeline

    # Don't save periodic pipelines more often than this
    self._output_best_pipeline_period_seconds = 30

    # Try crossover and mutation at most this many times for
    # any one given individual (or pair of individuals)
    self._max_mut_loops = 50

    self._setup_config(self.config_dict)

    self.operators = []
    self.arguments = []
    for key in sorted(self._config_dict.keys()):
        op_class, arg_types = TPOTOperatorClassFactory(
            key,
            self._config_dict[key],
            BaseClass=Operator,
            ArgBaseClass=ARGType
        )
        if op_class:
            self.operators.append(op_class)
            self.arguments += arg_types

    # Schedule TPOT to run for many generations if the user specifies a run-time
    # limit; TPOT will automatically interrupt itself when the timer runs out
    if self.max_time_mins is not None:
        self.generations = 1000000

    # Prompt the user if their version is out of date
    if not self.disable_update_check:
        update_check('tpot', __version__)

    if self.mutation_rate + self.crossover_rate > 1:
        raise ValueError(
            'The sum of the crossover and mutation probabilities must be <= 1.0.'
        )

    self.operators_context = {
        'make_pipeline': make_pipeline,
        'make_union': make_union,
        'StackingEstimator': StackingEstimator,
        'FunctionTransformer': FunctionTransformer,
        'copy': copy
    }

    self._pbar = None
    # Specifies where to output the progress messages (default: sys.stdout).
    # This may be opened as an API in a future version of TPOT (io.TextIOWrapper or io.StringIO).
    self._file = sys.stdout

    # Dictionary of individuals that have already been evaluated in previous
    # generations
    self.evaluated_individuals_ = {}

    self._setup_scoring_function(self.scoring)

    if self.subsample <= 0.0 or self.subsample > 1.0:
        raise ValueError(
            'The subsample ratio of the training instance must be in the range (0.0, 1.0].'
        )

    if self.n_jobs == -1:
        self._n_jobs = cpu_count()
    else:
        self._n_jobs = self.n_jobs

    self._setup_pset()
    self._setup_toolbox()

    ## Additions to _fit_init
    # Initialise lists to save the predictions and pipelines analysed by TPOT
    self.predictions = []
    self.pipelines = []
    self._exported_pipeline_text = []
    # Save the training sample on the TPOT object
    self.features = None
    self.target = None
    self.evaluated_individuals = {}
    self.curr_generations = 0
    self.log = {}
    # Add the Gaussian kernels so that they can be used by TPOT
    self.operators_context['RBF'] = eval('RBF')
    self.operators_context['Matern'] = eval('Matern')
    self.operators_context['RationalQuadratic'] = eval('RationalQuadratic')
    self.operators_context['ExpSineSquared'] = eval('ExpSineSquared')
    self.operators_context['DotProduct'] = eval('DotProduct')
    self.operators_context['ConstantKernel'] = eval('ConstantKernel')
def __init__(self, population_size=100, generations=100,
             mutation_rate=0.9, crossover_rate=0.05,
             scoring=None, num_cv_folds=3, max_time_mins=None,
             max_eval_time_mins=5, random_state=None, verbosity=0,
             disable_update_check=False):
    """Sets up the genetic programming algorithm for pipeline optimization.

    Parameters
    ----------
    population_size: int (default: 100)
        The number of pipelines in the genetic algorithm population. Must be > 0.
        The more pipelines in the population, the slower TPOT will run, but it's also
        more likely to find better pipelines.
    generations: int (default: 100)
        The number of generations to run pipeline optimization for. Must be > 0.
        The more generations you give TPOT to run, the longer it takes, but it's also
        more likely to find better pipelines.
    mutation_rate: float (default: 0.9)
        The mutation rate for the genetic programming algorithm in the range [0.0, 1.0].
        This tells the genetic programming algorithm how many pipelines to apply random
        changes to every generation. We don't recommend that you tweak this parameter
        unless you know what you're doing.
    crossover_rate: float (default: 0.05)
        The crossover rate for the genetic programming algorithm in the range [0.0, 1.0].
        This tells the genetic programming algorithm how many pipelines to "breed" every
        generation. We don't recommend that you tweak this parameter unless you know
        what you're doing.
    scoring: function or str
        Function used to evaluate the quality of a given pipeline for the problem. By
        default, balanced class accuracy is used for classification problems, mean
        squared error for regression problems. TPOT assumes that this scoring function
        should be maximized, i.e., higher is better.
        Offers the same options as sklearn.model_selection.cross_val_score:
        ['accuracy', 'adjusted_rand_score', 'average_precision', 'f1', 'f1_macro',
         'f1_micro', 'f1_samples', 'f1_weighted', 'precision', 'precision_macro',
         'precision_micro', 'precision_samples', 'precision_weighted', 'r2', 'recall',
         'recall_macro', 'recall_micro', 'recall_samples', 'recall_weighted', 'roc_auc']
    num_cv_folds: int (default: 3)
        The number of folds to evaluate each pipeline over in k-fold cross-validation
        during the TPOT pipeline optimization process
    max_time_mins: int (default: None)
        How many minutes TPOT has to optimize the pipeline. If not None, this setting
        will override the `generations` parameter.
    max_eval_time_mins: int (default: 5)
        How many minutes TPOT has to optimize a single pipeline. Setting this parameter
        to higher values will allow TPOT to explore more complex pipelines but will also
        allow TPOT to run longer.
    random_state: int (default: 0)
        The random number generator seed for TPOT. Use this to make sure that TPOT will
        give you the same results each time you run it against the same data set with
        that seed.
    verbosity: int (default: 0)
        How much information TPOT communicates while it's running. 0 = none, 1 = minimal, 2 = all
    disable_update_check: bool (default: False)
        Flag indicating whether the TPOT version checker should be disabled.

    Returns
    -------
    None

    """
    if self.__class__.__name__ == 'TPOTBase':
        raise RuntimeError('Do not instantiate the TPOTBase class directly; '
                           'use TPOTRegressor or TPOTClassifier instead.')

    # Prompt the user if their version is out of date
    self.disable_update_check = disable_update_check
    if not self.disable_update_check:
        update_check('tpot', __version__)

    self._hof = None
    self._optimized_pipeline = None
    self._fitted_pipeline = None
    self.population_size = population_size
    self.generations = generations
    self.max_time_mins = max_time_mins
    self.max_eval_time_mins = max_eval_time_mins

    # Schedule TPOT to run for a very long time if the user specifies a run-time
    # limit; TPOT will automatically interrupt itself when the timer runs out
    if not (max_time_mins is None):
        self.generations = 1000000

    self.mutation_rate = mutation_rate
    self.crossover_rate = crossover_rate
    self.verbosity = verbosity
    self.operators_context = {
        'make_pipeline': make_pipeline,
        'make_union': make_union,
        'VotingClassifier': VotingClassifier,
        'FunctionTransformer': FunctionTransformer
    }

    self._pbar = None
    self._gp_generation = 0
    self.random_state = random_state

    # If the user passed a custom scoring function, store it in the sklearn SCORERS dictionary
    if scoring:
        if hasattr(scoring, '__call__'):
            scoring_name = scoring.__name__

            if 'loss' in scoring_name or 'error' in scoring_name:
                greater_is_better = False
            else:
                greater_is_better = True

            SCORERS[scoring_name] = make_scorer(scoring, greater_is_better=greater_is_better)
            self.scoring_function = scoring_name
        else:
            self.scoring_function = scoring

    self.num_cv_folds = num_cv_folds

    self._setup_pset()
    self._setup_toolbox()
def main():
    """Provide the entry point to the subreddit_stats command.

    :returns: 0 on success, 1 otherwise

    """
    parser = arg_parser(usage='usage: %prog [options] [SUBREDDIT]')
    parser.add_option('-s', '--submitters', type='int', default=5,
                      help='Number of top submitters to display '
                      '[default %default]')
    parser.add_option('-c', '--commenters', type='int', default=10,
                      help='Number of top commenters to display '
                      '[default %default]')
    parser.add_option('-a', '--after',
                      help='Submission ID to fetch after')
    parser.add_option('-d', '--days', type='int', default=32,
                      help=('Number of previous days to include submissions '
                            'from. Use 0 for unlimited. Default: %default'))
    parser.add_option('-D', '--debug', action='store_true',
                      help='Enable debugging mode. Does not post stats.')
    parser.add_option('-R', '--submission-reddit',
                      help=('Subreddit to submit to. If not present, '
                            'submits to the subreddit processed'))
    parser.add_option('-t', '--top',
                      help=('Run on top submissions either by day, week, '
                            'month, year, or all'))
    parser.add_option('', '--distinguished', action='store_true',
                      help=('Include distinguished submissions and '
                            'comments (default: False). Note that regular '
                            'comments of distinguished submissions will still '
                            'be included.'))
    parser.add_option('', '--no-self', action='store_true',
                      help=('Do not include self posts (and their comments) in'
                            ' the calculation.'))
    parser.add_option('', '--no-link', action='store_true',
                      help=('Only include self posts (and their comments) in '
                            'the calculation.'))
    parser.add_option('', '--prev',
                      help='Provide the submission id of previous SRS page.')
    parser.add_option('', '--include-prev', action='store_true',
                      help='Don\'t try to avoid overlap with a previous SRS.')
    parser.add_option('-o', '--output',
                      help='Save result csv to named file.')

    options, args = parser.parse_args()

    if len(args) != 1:
        sys.stdout.write('Enter subreddit name: ')
        sys.stdout.flush()
        subject_reddit = sys.stdin.readline().strip()
        if not subject_reddit:
            parser.error('No subreddit name entered')
    else:
        subject_reddit = args[0]

    if not options.disable_update_check:  # Check for updates
        update_check('prawtools', __version__)

    print('You chose to analyze this subreddit: {}'.format(subject_reddit))

    if options.no_link and options.no_self:
        parser.error('You are choosing to exclude self posts but also only '
                     'include self posts. Consider checking your arguments.')

    if options.submission_reddit:
        submission_reddit = options.submission_reddit
    else:
        submission_reddit = subject_reddit

    srs = SubRedditStats(subject_reddit, options.site, options.verbose,
                         options.distinguished)
    if options.prev:
        srs.prev_stat(options.prev)
    if options.top:
        found = srs.fetch_top_submissions(options.top, options.no_self,
                                          options.no_link)
    else:
        since_last = not options.include_prev
        found = srs.fetch_recent_submissions(max_duration=options.days,
                                             after=options.after,
                                             exclude_self=options.no_self,
                                             exclude_link=options.no_link,
                                             since_last=since_last)
    if not found:
        print('No submissions were found.')
        return 1
    srs.process_submitters()
    if options.commenters > 0:
        srs.process_commenters()
    if options.output:
        srs.save_csv(options.output)
    srs.publish_results(submission_reddit, options.submitters,
                        options.commenters, 5, 5, options.top, options.debug)
def check_for_updates(options):
    """Check for package updates."""
    if not options.disable_update_check:  # Check for updates
        update_check('prawtools', __version__)
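A minimal sketch of the once-per-process guard pattern that several of the snippets above build around update_check; the helper name and the module-level flag are illustrative, not part of any of the libraries shown, and update_check is assumed to be imported from the update_checker package.

# Hypothetical helper: call update_check at most once per process unless disabled.
from update_checker import update_check

update_checked = False

def maybe_check_for_updates(package, version, disabled=False):
    global update_checked
    if disabled or update_checked:
        return
    update_check(package, version)  # emits a notice if a newer release exists
    update_checked = True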
def test_update_check__unsuccessful(mock_get, capsys):
    mock_get.side_effect = requests.exceptions.RequestException
    update_check(PACKAGE, "0.0.1", bypass_cache=True)
    assert "" == capsys.readouterr().err
def test_update_check__successful__has_update(mock_get, capsys):
    mock_response(mock_get.return_value)
    update_check(PACKAGE, "0.0.1", bypass_cache=True)
    assert ("Version 0.0.1 of praw is outdated. Version 5.0.0 is available.\n"
            == capsys.readouterr().err)
def test_update_check__successful__has_no_update(mock_get, capsys):
    mock_response(mock_get.return_value, "0.0.2")
    update_check(PACKAGE, "0.0.2", bypass_cache=True)
    assert "" == capsys.readouterr().err
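The three pytest-style tests above rely on a `mock_get` fixture, a `PACKAGE` constant, and a `mock_response` helper that are not shown here. The sketch below is a hypothetical reconstruction of the fixture side only; the exact patch target and the payload that `mock_response` fills in are assumptions, not taken from the source.

# Hypothetical conftest sketch: patch the HTTP GET used by update_check so no
# network call is made during the tests.
import pytest
from unittest import mock

PACKAGE = "praw"  # assumed, to match the expected output string in the tests

@pytest.fixture
def mock_get():
    # The real tests may patch a more specific target such as
    # "update_checker.requests.get"; this is an illustrative guess.
    with mock.patch("requests.get") as patched:
        yield patched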
def __init__(self, generations=100, population_size=100, offspring_size=None,
             mutation_rate=0.9, crossover_rate=0.1,
             scoring=None, cv=5, subsample=1.0, n_jobs=1,
             max_time_mins=None, max_eval_time_mins=5,
             random_state=None, config_dict=None, warm_start=False,
             verbosity=0, disable_update_check=False):
    """Set up the genetic programming algorithm for pipeline optimization.

    Parameters
    ----------
    generations: int, optional (default: 100)
        Number of iterations to the run pipeline optimization process. Generally, TPOT will work
        better when you give it more generations (and therefore time) to optimize the pipeline.
        TPOT will evaluate POPULATION_SIZE + GENERATIONS x OFFSPRING_SIZE pipelines in total.
    population_size: int, optional (default: 100)
        Number of individuals to retain in the GP population every generation. Generally, TPOT
        will work better when you give it more individuals (and therefore time) to optimize the
        pipeline. TPOT will evaluate POPULATION_SIZE + GENERATIONS x OFFSPRING_SIZE pipelines in total.
    offspring_size: int, optional (default: None)
        Number of offspring to produce in each GP generation.
        By default, offspring_size = population_size.
    mutation_rate: float, optional (default: 0.9)
        Mutation rate for the genetic programming algorithm in the range [0.0, 1.0]. This
        parameter tells the GP algorithm how many pipelines to apply random changes to every
        generation. We recommend using the default parameter unless you understand how the
        mutation rate affects GP algorithms.
    crossover_rate: float, optional (default: 0.1)
        Crossover rate for the genetic programming algorithm in the range [0.0, 1.0]. This
        parameter tells the genetic programming algorithm how many pipelines to "breed" every
        generation. We recommend using the default parameter unless you understand how the
        crossover rate affects GP algorithms.
    scoring: string or callable, optional
        Function used to evaluate the quality of a given pipeline for the problem. By default,
        accuracy is used for classification problems and mean squared error (MSE) for regression
        problems. Offers the same options as sklearn.model_selection.cross_val_score as well as
        a built-in score 'balanced_accuracy'.
        Classification metrics:
        ['accuracy', 'adjusted_rand_score', 'average_precision', 'balanced_accuracy', 'f1',
         'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted', 'precision', 'precision_macro',
         'precision_micro', 'precision_samples', 'precision_weighted', 'recall', 'recall_macro',
         'recall_micro', 'recall_samples', 'recall_weighted', 'roc_auc']
        Regression metrics:
        ['neg_median_absolute_error', 'neg_mean_absolute_error', 'neg_mean_squared_error', 'r2']
        If you would like to use a custom scoring function, you can pass a callable function to
        this parameter with the signature scorer(y_true, y_pred). See the section on scoring
        functions in the documentation for more details.
        TPOT assumes that any custom scoring function with "error" or "loss" in the name is
        meant to be minimized, whereas any other functions will be maximized.
    cv: int or cross-validation generator, optional (default: 5)
        If CV is a number, then it is the number of folds to evaluate each pipeline over in
        k-fold cross-validation during the TPOT optimization process. If it is an object then
        it is an object to be used as a cross-validation generator.
    subsample: float, optional (default: 1.0)
        Subsample ratio of the training instance. Setting it to 0.5 means that TPOT randomly
        collects half of the training samples for the pipeline optimization process.
    n_jobs: int, optional (default: 1)
        Number of CPUs for evaluating pipelines in parallel during the TPOT optimization
        process. Assigning this to -1 will use as many cores as available on the computer.
    max_time_mins: int, optional (default: None)
        How many minutes TPOT has to optimize the pipeline. If provided, this setting will
        override the "generations" parameter and allow TPOT to run until it runs out of time.
    max_eval_time_mins: int, optional (default: 5)
        How many minutes TPOT has to optimize a single pipeline. Setting this parameter to
        higher values will allow TPOT to explore more complex pipelines, but will also allow
        TPOT to run longer.
    random_state: int, optional (default: None)
        Random number generator seed for TPOT. Use this parameter to make sure that TPOT will
        give you the same results each time you run it against the same data set with that seed.
    config_dict: a Python dictionary or string, optional (default: None)
        Python dictionary:
            A dictionary customizing the operators and parameters that TPOT uses in the
            optimization process. For examples, see config_regressor.py and config_classifier.py
        Path for configuration file:
            A path to a configuration file for customizing the operators and parameters that
            TPOT uses in the optimization process. For examples, see config_regressor.py and
            config_classifier.py
        String 'TPOT light':
            TPOT uses a light version of the operator configuration dictionary instead of the
            default one.
        String 'TPOT MDR':
            TPOT uses a list of TPOT-MDR operator configuration dictionaries instead of the
            default one.
    warm_start: bool, optional (default: False)
        Flag indicating whether the TPOT instance will reuse the population from previous
        calls to fit().
    verbosity: int, optional (default: 0)
        How much information TPOT communicates while it's running.
        0 = none, 1 = minimal, 2 = high, 3 = all.
        A setting of 2 or higher will add a progress bar during the optimization procedure.
    disable_update_check: bool, optional (default: False)
        Flag indicating whether the TPOT version checker should be disabled.

    Returns
    -------
    None

    """
    if self.__class__.__name__ == 'TPOTBase':
        raise RuntimeError('Do not instantiate the TPOTBase class directly; use TPOTRegressor or TPOTClassifier instead.')

    # Prompt the user if their version is out of date
    self.disable_update_check = disable_update_check
    if not self.disable_update_check:
        update_check('tpot', __version__)

    self._pareto_front = None
    self._optimized_pipeline = None
    self.fitted_pipeline_ = None
    self._fitted_imputer = None
    self._pop = None
    self.warm_start = warm_start
    self.population_size = population_size
    self.generations = generations
    self.max_time_mins = max_time_mins
    self.max_eval_time_mins = max_eval_time_mins

    # Set offspring_size equal to population_size by default
    if offspring_size:
        self.offspring_size = offspring_size
    else:
        self.offspring_size = population_size

    self._setup_config(config_dict)

    self.operators = []
    self.arguments = []
    for key in sorted(self.config_dict.keys()):
        op_class, arg_types = TPOTOperatorClassFactory(
            key,
            self.config_dict[key],
            BaseClass=Operator,
            ArgBaseClass=ARGType
        )
        if op_class:
            self.operators.append(op_class)
            self.arguments += arg_types

    # Schedule TPOT to run for many generations if the user specifies a run-time
    # limit; TPOT will automatically interrupt itself when the timer runs out
    if max_time_mins is not None:
        self.generations = 1000000

    self.mutation_rate = mutation_rate
    self.crossover_rate = crossover_rate

    if self.mutation_rate + self.crossover_rate > 1:
        raise ValueError(
            'The sum of the crossover and mutation probabilities must be <= 1.0.'
        )

    self.verbosity = verbosity
    self.operators_context = {
        'make_pipeline': make_pipeline,
        'make_union': make_union,
        'StackingEstimator': StackingEstimator,
        'FunctionTransformer': FunctionTransformer,
        'copy': copy
    }

    self._pbar = None

    # Dictionary of individuals that have already been evaluated in previous
    # generations
    self.evaluated_individuals_ = {}
    self.random_state = random_state

    # If the user passed a custom scoring function, store it in the sklearn
    # SCORERS dictionary
    if scoring:
        if hasattr(scoring, '__call__'):
            scoring_name = scoring.__name__
            greater_is_better = 'loss' not in scoring_name and 'error' not in scoring_name
            SCORERS[scoring_name] = make_scorer(scoring, greater_is_better=greater_is_better)
            self.scoring_function = scoring_name
        else:
            if scoring not in SCORERS:
                raise ValueError(
                    'The scoring function {} is not available. Please '
                    'choose a valid scoring function from the TPOT '
                    'documentation.'.format(scoring)
                )
            self.scoring_function = scoring

    self.cv = cv
    self.subsample = subsample
    if self.subsample <= 0.0 or self.subsample > 1.0:
        raise ValueError(
            'The subsample ratio of the training instance must be in the range (0.0, 1.0].'
        )

    # If the OS is Windows, warn about multiprocessing limitations when n_jobs != 1
    if sys.platform.startswith('win') and n_jobs != 1:
        print(
            'Warning: Although parallelization is currently supported in '
            'TPOT for Windows, pressing Ctrl+C will freeze the optimization '
            'process without saving the best pipeline! Thus, please DO NOT '
            'press Ctrl+C during the optimization process if n_jobs is not '
            'equal to 1. For a quick test in Windows, please set n_jobs to 1 '
            'for saving the best pipeline in the middle of the optimization '
            'process via Ctrl+C.'
        )
    if n_jobs == -1:
        self.n_jobs = cpu_count()
    else:
        self.n_jobs = n_jobs

    self._setup_pset()
    self._setup_toolbox()
def __init__(self, population_size=100, generations=100,
             mutation_rate=0.9, crossover_rate=0.05,
             random_state=None, verbosity=0,
             scoring_function=None, num_cv_folds=3,
             disable_update_check=False):
    """Sets up the genetic programming algorithm for pipeline optimization.

    Parameters
    ----------
    population_size: int (default: 100)
        The number of pipelines in the genetic algorithm population. Must be > 0.
        The more pipelines in the population, the slower TPOT will run, but it's also
        more likely to find better pipelines.
    generations: int (default: 100)
        The number of generations to run pipeline optimization for. Must be > 0.
        The more generations you give TPOT to run, the longer it takes, but it's also
        more likely to find better pipelines.
    mutation_rate: float (default: 0.9)
        The mutation rate for the genetic programming algorithm in the range [0.0, 1.0].
        This tells the genetic programming algorithm how many pipelines to apply random
        changes to every generation. We don't recommend that you tweak this parameter
        unless you know what you're doing.
    crossover_rate: float (default: 0.05)
        The crossover rate for the genetic programming algorithm in the range [0.0, 1.0].
        This tells the genetic programming algorithm how many pipelines to "breed" every
        generation. We don't recommend that you tweak this parameter unless you know
        what you're doing.
    random_state: int (default: 0)
        The random number generator seed for TPOT. Use this to make sure that TPOT will
        give you the same results each time you run it against the same data set with
        that seed.
    verbosity: int (default: 0)
        How much information TPOT communicates while it's running. 0 = none, 1 = minimal, 2 = all
    scoring_function: str (default: balanced accuracy)
        Function used to evaluate the goodness of a given pipeline for the classification
        problem. By default, balanced class accuracy is used. TPOT assumes that this
        scoring function should be maximized, i.e., higher is better.
        Offers the same options as sklearn.cross_validation.cross_val_score:
        ['accuracy', 'adjusted_rand_score', 'average_precision', 'f1', 'f1_macro',
         'f1_micro', 'f1_samples', 'f1_weighted', 'log_loss', 'precision',
         'precision_macro', 'precision_micro', 'precision_samples', 'precision_weighted',
         'r2', 'recall', 'recall_macro', 'recall_micro', 'recall_samples',
         'recall_weighted', 'roc_auc']
    num_cv_folds: int (default: 3)
        The number of folds to evaluate each pipeline over in k-fold cross-validation
        during the TPOT pipeline optimization process
    disable_update_check: bool (default: False)
        Flag indicating whether the TPOT version checker should be disabled.

    Returns
    -------
    None

    """
    # Save params to be recalled later by get_params()
    self.params = locals()  # Must be before any local variable definitions
    self.params.pop('self')

    # Prompt the user if their version is out of date
    if not disable_update_check:
        update_check('tpot', __version__)

    self.hof = None
    self._optimized_pipeline = None
    self._fitted_pipeline = None
    self.population_size = population_size
    self.generations = generations
    self.mutation_rate = mutation_rate
    self.crossover_rate = crossover_rate
    self.verbosity = verbosity
    self.operators_context = {
        'make_pipeline': make_pipeline,
        'make_union': make_union,
        'VotingClassifier': VotingClassifier,
        'FunctionTransformer': FunctionTransformer
    }

    self.pbar = None
    self.gp_generation = 0
    self.random_state = random_state

    if scoring_function is None:
        self.scoring_function = self._balanced_accuracy
    else:
        self.scoring_function = scoring_function

    self.num_cv_folds = num_cv_folds

    self._setup_pset()
    self._setup_toolbox()
def main():
    """Provide the entry point into the reddit_alert program."""
    usage = 'Usage: %prog [options] KEYWORD...'
    parser = arg_parser(usage=usage)
    parser.add_option('-s', '--subreddit', action='append',
                      help=('When at least one `-s` option is provided '
                            '(multiple can be) only alert for comments in the '
                            'indicated subreddit(s).'))
    parser.add_option('-I', '--ignore-user', action='append', metavar='USER',
                      help=('Ignore comments from the provided user. Can be '
                            'supplied multiple times.'))
    parser.add_option('-m', '--message', metavar='USER',
                      help=('When set, send a reddit message to USER with the '
                            'alert. Requires the alert script to login.'))
    options, args = parser.parse_args()
    if not args:
        parser.error('At least one KEYWORD must be provided.')

    # Create the reddit session, and login if necessary
    session = praw.Reddit('reddit_alert (prawtools {0})'.format(__version__),
                          site_name=options.site, disable_update_check=True)
    if options.message:
        session.login(options.user, options.pswd)
        msg_to = session.get_redditor(options.message)

    # Check for updates
    if not options.disable_update_check:
        update_check('prawtools', __version__)

    # Build regex
    args = [x.lower() for x in args]
    reg_prefix = r'(?:^|[^a-z])'  # Any character (or start) can precede
    reg_suffix = r'(?:$|[^a-z])'  # Any character (or end) can follow
    regex = re.compile(
        r'{0}({1}){2}'.format(reg_prefix, '|'.join(args), reg_suffix),
        re.IGNORECASE)

    # Determine subreddit or multireddit
    if options.subreddit:
        subreddit = '+'.join(sorted(options.subreddit))
    else:
        subreddit = 'all'

    print('Alerting on:')
    for item in sorted(args):
        print(' * {0}'.format(item))
    print('using the comment stream: http://www.reddit.com/r/{0}/comments'.
          format(subreddit))

    # Build ignore set
    if options.ignore_user:
        ignore_users = set(x.lower() for x in options.ignore_user)
    else:
        ignore_users = set()

    try:
        for comment in praw.helpers.comment_stream(session, subreddit,
                                                   verbosity=options.verbose):
            if comment.author and comment.author.name.lower() in ignore_users:
                continue
            match = regex.search(comment.body)
            if match:
                keyword = match.group(1).lower()
                url = quick_url(comment)
                print('{0}: {1}'.format(keyword, url))
                if options.message:
                    msg_to.send_message(
                        'Reddit Alert: {0}'.format(keyword),
                        '{0}\n\nby /u/{1}\n\n---\n\n{2}'.format(
                            url, comment.author, comment.body))
    except KeyboardInterrupt:
        sys.stderr.write('\n')
        print('Goodbye!\n')
def __init__(self, population_size=100, generations=100,
             mutation_rate=0.9, crossover_rate=0.05,
             scoring=None, num_cv_folds=3, max_time_mins=None,
             max_eval_time_mins=5, random_state=None, verbosity=0,
             disable_update_check=False):
    """Sets up the genetic programming algorithm for pipeline optimization.

    Parameters
    ----------
    population_size: int (default: 100)
        The number of pipelines in the genetic algorithm population. Must be > 0.
        The more pipelines in the population, the slower TPOT will run, but it's also
        more likely to find better pipelines.
    generations: int (default: 100)
        The number of generations to run pipeline optimization for. Must be > 0.
        The more generations you give TPOT to run, the longer it takes, but it's also
        more likely to find better pipelines.
    mutation_rate: float (default: 0.9)
        The mutation rate for the genetic programming algorithm in the range [0.0, 1.0].
        This tells the genetic programming algorithm how many pipelines to apply random
        changes to every generation. We don't recommend that you tweak this parameter
        unless you know what you're doing.
    crossover_rate: float (default: 0.05)
        The crossover rate for the genetic programming algorithm in the range [0.0, 1.0].
        This tells the genetic programming algorithm how many pipelines to "breed" every
        generation. We don't recommend that you tweak this parameter unless you know
        what you're doing.
    scoring: function or str
        Function used to evaluate the quality of a given pipeline for the problem. By
        default, balanced class accuracy is used for classification problems, mean
        squared error for regression problems. TPOT assumes that this scoring function
        should be maximized, i.e., higher is better.
        Offers the same options as sklearn.model_selection.cross_val_score:
        ['accuracy', 'adjusted_rand_score', 'average_precision', 'f1', 'f1_macro',
         'f1_micro', 'f1_samples', 'f1_weighted', 'precision', 'precision_macro',
         'precision_micro', 'precision_samples', 'precision_weighted', 'r2', 'recall',
         'recall_macro', 'recall_micro', 'recall_samples', 'recall_weighted', 'roc_auc']
    num_cv_folds: int (default: 3)
        The number of folds to evaluate each pipeline over in k-fold cross-validation
        during the TPOT pipeline optimization process
    max_time_mins: int (default: None)
        How many minutes TPOT has to optimize the pipeline. If not None, this setting
        will override the `generations` parameter.
    max_eval_time_mins: int (default: 5)
        How many minutes TPOT has to optimize a single pipeline. Setting this parameter
        to higher values will allow TPOT to explore more complex pipelines but will also
        allow TPOT to run longer.
    random_state: int (default: 0)
        The random number generator seed for TPOT. Use this to make sure that TPOT will
        give you the same results each time you run it against the same data set with
        that seed.
    verbosity: int (default: 0)
        How much information TPOT communicates while it's running. 0 = none, 1 = minimal, 2 = all
    disable_update_check: bool (default: False)
        Flag indicating whether the TPOT version checker should be disabled.

    Returns
    -------
    None

    """
    if self.__class__.__name__ == 'TPOTBase':
        raise RuntimeError(
            'Do not instantiate the TPOTBase class directly; '
            'use TPOTRegressor or TPOTClassifier instead.')

    # Prompt the user if their version is out of date
    self.disable_update_check = disable_update_check
    if not self.disable_update_check:
        update_check('tpot', __version__)

    self._hof = None
    self._optimized_pipeline = None
    self._fitted_pipeline = None
    self.population_size = population_size
    self.generations = generations
    self.max_time_mins = max_time_mins
    self.max_eval_time_mins = max_eval_time_mins

    # Schedule TPOT to run for a very long time if the user specifies a run-time
    # limit; TPOT will automatically interrupt itself when the timer runs out
    if not (max_time_mins is None):
        self.generations = 1000000

    self.mutation_rate = mutation_rate
    self.crossover_rate = crossover_rate
    self.verbosity = verbosity
    self.operators_context = {
        'make_pipeline': make_pipeline,
        'make_union': make_union,
        'VotingClassifier': VotingClassifier,
        'FunctionTransformer': FunctionTransformer
    }

    self._pbar = None
    self._gp_generation = 0
    self.random_state = random_state

    # If the user passed a custom scoring function, store it in the sklearn SCORERS dictionary
    if scoring:
        if hasattr(scoring, '__call__'):
            scoring_name = scoring.__name__

            if 'loss' in scoring_name or 'error' in scoring_name:
                greater_is_better = False
            else:
                greater_is_better = True

            SCORERS[scoring_name] = make_scorer(
                scoring, greater_is_better=greater_is_better)
            self.scoring_function = scoring_name
        else:
            self.scoring_function = scoring

    self.num_cv_folds = num_cv_folds

    self._setup_pset()
    self._setup_toolbox()
def __init__(self, generations=100, population_size=100, offspring_size=None, mutation_rate=0.9, crossover_rate=0.1, scoring=None, cv=5, n_jobs=1, max_time_mins=None, max_eval_time_mins=5, random_state=None, config_dict=None, warm_start=False, verbosity=0, disable_update_check=False): """Sets up the genetic programming algorithm for pipeline optimization. Parameters ---------- generations: int (default: 100) Number of iterations to the run pipeline optimization process. Generally, TPOT will work better when you give it more generations (and therefore time) to optimize the pipeline. TPOT will evaluate POPULATION_SIZE + GENERATIONS x OFFSPRING_SIZE pipelines in total. population_size: int (default: 100) Number of individuals to retain in the GP population every generation. Generally, TPOT will work better when you give it more individuals (and therefore time) to optimize the pipeline. TPOT will evaluate POPULATION_SIZE + GENERATIONS x OFFSPRING_SIZE pipelines in total. offspring_size: int (default: None) Number of offspring to produce in each GP generation. By default, offspring_size = population_size. mutation_rate: float (default: 0.9) Mutation rate for the genetic programming algorithm in the range [0.0, 1.0]. This parameter tells the GP algorithm how many pipelines to apply random changes to every generation. We recommend using the default parameter unless you understand how the mutation rate affects GP algorithms. crossover_rate: float (default: 0.1) Crossover rate for the genetic programming algorithm in the range [0.0, 1.0]. This parameter tells the genetic programming algorithm how many pipelines to "breed" every generation. We recommend using the default parameter unless you understand how the mutation rate affects GP algorithms. scoring: function or str Function used to evaluate the quality of a given pipeline for the problem. By default, accuracy is used for classification problems and mean squared error (mse) for regression problems. TPOT assumes that this scoring function should be maximized, i.e., higher is better. Offers the same options as sklearn.model_selection.cross_val_score as well as a built-in score "balanced_accuracy": ['accuracy', 'adjusted_rand_score', 'average_precision', 'balanced accuracy', 'f1', 'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted', 'precision', 'precision_macro', 'precision_micro', 'precision_samples', 'precision_weighted', 'r2', 'recall', 'recall_macro', 'recall_micro', 'recall_samples', 'recall_weighted', 'roc_auc'] cv: int (default: 5) Number of folds to evaluate each pipeline over in k-fold cross-validation during the TPOT optimization process. n_jobs: int (default: 1) Number of CPUs for evaluating pipelines in parallel during the TPOT optimization process. Assigning this to -1 will use as many cores as available on the computer. max_time_mins: int (default: None) How many minutes TPOT has to optimize the pipeline. If provided, this setting will override the "generations" parameter and allow TPOT to run until it runs out of time. max_eval_time_mins: int (default: 5) How many minutes TPOT has to optimize a single pipeline. Setting this parameter to higher values will allow TPOT to explore more complex pipelines, but will also allow TPOT to run longer. random_state: int (default: None) Random number generator seed for TPOT. Use this to make sure that TPOT will give you the same results each time you run it against the same data set with that seed. 
config_dict: string (default: None) Path for configuration file: A path to a configuration file for customizing the operators and parameters that TPOT uses in the optimization process. For examples, see config_regressor.py and config_classifier.py String 'TPOT light': TPOT uses a light version of operator configuration dictionary instead of the default one. String 'TPOT MDR': TPOT uses a list of TPOT-MDR operator configuration dictionary instead of the default one. warm_start: bool (default: False) Flag indicating whether the TPOT instance will reuse the population from previous calls to fit(). verbosity: int (default: 0) How much information TPOT communicates while it's running. 0 = none, 1 = minimal, 2 = high, 3 = all. A setting of 2 or higher will add a progress bar during the optimization procedure. disable_update_check: bool (default: False) Flag indicating whether the TPOT version checker should be disabled. Returns ------- None """ if self.__class__.__name__ == 'TPOTBase': raise RuntimeError( 'Do not instantiate the TPOTBase class directly; use TPOTRegressor or TPOTClassifier instead.' ) # Prompt the user if their version is out of date self.disable_update_check = disable_update_check if not self.disable_update_check: update_check('tpot', __version__) self._pareto_front = None self._optimized_pipeline = None self._fitted_pipeline = None self._pop = None self.warm_start = warm_start self.population_size = population_size self.generations = generations self.max_time_mins = max_time_mins self.max_eval_time_mins = max_eval_time_mins # Set offspring_size equal to population_size by default if offspring_size: self.offspring_size = offspring_size else: self.offspring_size = population_size if config_dict: if config_dict == 'TPOT light': if self.classification: self.config_dict = classifier_config_dict_light else: self.config_dict = regressor_config_dict_light elif config_dict == 'TPOT MDR': if self.classification: self.config_dict = tpot_mdr_classifier_config_dict else: raise TypeError( 'The TPOT MDR operator configuration file does not currently ' 'work with TPOTRegressor. Please use TPOTClassifier instead.' ) else: try: with open(config_dict, 'r') as input_file: file_string = input_file.read() operator_dict = eval(file_string[file_string.find('{'):( file_string.rfind('}') + 1)]) except: raise TypeError( 'The operator configuration file is in a bad format or not available. ' 'Please check the configuration file before running TPOT.' ) else: self.config_dict = self.default_config_dict self.operators = [] self.arguments = [] for key in sorted(self.config_dict.keys()): op_class, arg_types = TPOTOperatorClassFactory( key, self.config_dict[key], BaseClass=Operator, ArgBaseClass=ARGType) if op_class: self.operators.append(op_class) self.arguments += arg_types # Schedule TPOT to run for many generations if the user specifies a run-time limit # TPOT will automatically interrupt itself when the timer runs out if not (max_time_mins is None): self.generations = 1000000 self.mutation_rate = mutation_rate self.crossover_rate = crossover_rate if self.mutation_rate + self.crossover_rate > 1: raise ValueError( 'The sum of the crossover and mutation probabilities must be <= 1.0.' 
) self.verbosity = verbosity self.operators_context = { 'make_pipeline': make_pipeline, 'make_union': make_union, 'VotingClassifier': VotingClassifier, 'FunctionTransformer': FunctionTransformer, 'copy': copy } self._pbar = None # Dictionary of individuals that have already been evaluated in previous generations self._evaluated_individuals = {} self.random_state = random_state # If the user passed a custom scoring function, store it in the sklearn SCORERS dictionary if scoring: if hasattr(scoring, '__call__'): scoring_name = scoring.__name__ if 'loss' in scoring_name or 'error' in scoring_name: greater_is_better = False else: greater_is_better = True SCORERS[scoring_name] = make_scorer( scoring, greater_is_better=greater_is_better) self.scoring_function = scoring_name else: if scoring not in SCORERS: raise ValueError( 'The scoring function {} is not available. ' 'Please choose a valid scoring function from the TPOT ' 'documentation.'.format(scoring)) self.scoring_function = scoring self.cv = cv # On Windows, warn that interrupting a parallel run with Ctrl+C will not save the best pipeline if sys.platform.startswith('win') and n_jobs != 1: print( 'Warning: Although parallelization is currently supported in TPOT for Windows, ' 'pressing Ctrl+C will freeze the optimization process without saving the best pipeline! ' 'Thus, please DO NOT press Ctrl+C during the optimization process if n_jobs is not equal to 1. ' 'For a quick test on Windows, please set n_jobs to 1 so that the best pipeline can be saved ' 'in the middle of the optimization process via Ctrl+C.') if n_jobs == -1: self.n_jobs = cpu_count() else: self.n_jobs = n_jobs self._setup_pset() self._setup_toolbox()
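# A minimal usage sketch for the constructor above, assuming the TPOTClassifier
# subclass exported by the tpot package and scikit-learn's digits data set; the
# parameter values are illustrative, not recommendations.
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from tpot import TPOTClassifier

digits = load_digits()
X_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target, random_state=42)

# 'TPOT light' selects the reduced operator configuration described in the docstring
tpot = TPOTClassifier(generations=5, population_size=20, cv=5, config_dict='TPOT light',
                      random_state=42, verbosity=2, disable_update_check=True)
tpot.fit(X_train, y_train)
print(tpot.score(X_test, y_test))
tpot.export('tpot_digits_pipeline.py')  # write the best pipeline out as a Python script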
def main(): """Provide the entry point to the modutils command.""" mod_choices = ("banned", "contributors", "moderators") mod_choices_dsp = ", ".join(["`%s`" % x for x in mod_choices]) msg = { "add": ("Add users to one of the following categories: %s" % mod_choices_dsp), "clear": "Remove users who have no flair set.", "css": "Ignore the CSS field when synchronizing flair.", "edit": "When adding flair templates, mark them as editable.", "file": "The file containing contents for --message", "flair": "List flair for the subreddit.", "flair_stats": "Display the number of users with each flair.", "json": "Output the results as json. Applies to --flair", "limit": ( "The minimum number of users that must have the specified " "flair in order to add as a template. default: %default" ), "list": ("List the users in one of the following categories: " "%s. May be specified more than once.") % mod_choices_dsp, "msg": ( "Send message to users of one of the following categories: " "%s. Message subject provided via --subject, content provided " "via --file or STDIN." ) % mod_choices_dsp, "sort": ( "The order to add flair templates. Available options are " "`alpha` to add alphabetically, and `size` to first add " "flair that is shared by the most number of users. " "default: %default" ), "static": ( "Add this template when syncing flair templates. When " "syncing text and css use a comma to separate the two." ), "subject": "The subject of the message to send for --message.", "sync": "Synchronize flair templates with current user flair.", "text": "Ignore the text field when synchronizing flair.", } usage = "Usage: %prog [options] SUBREDDIT" parser = arg_parser(usage=usage) parser.add_option("-a", "--add", help=msg["add"]) parser.add_option( "-l", "--list", action="append", help=msg["list"], choices=mod_choices, metavar="CATEGORY", default=[] ) parser.add_option("-c", "--clear-empty", action="store_true", help=msg["clear"]) parser.add_option("-F", "--file", help=msg["file"]) parser.add_option("-f", "--flair", action="store_true", help=msg["flair"]) parser.add_option("", "--flair-stats", action="store_true", help=msg["flair_stats"]) parser.add_option("-m", "--message", choices=mod_choices, help=msg["msg"]) parser.add_option("", "--subject", help=msg["subject"]) group = OptionGroup(parser, "Format options") group.add_option("-j", "--json", action="store_true", help=msg["json"]) parser.add_option_group(group) group = OptionGroup(parser, "Sync options") group.add_option("", "--sync", action="store_true", help=msg["sync"]) group.add_option("-s", "--static", action="append", help=msg["static"]) group.add_option("", "--editable", action="store_true", help=msg["edit"]) group.add_option("", "--ignore-css", action="store_true", default=False, help=msg["css"]) group.add_option("", "--ignore-text", action="store_true", default=False, help=msg["text"]) group.add_option("", "--limit", type="int", help=msg["limit"], default=2) group.add_option("", "--sort", action="store", choices=("alpha", "size"), default="alpha", help=msg["sort"]) parser.add_option_group(group) options, args = parser.parse_args() if options.pswd and not options.user: parser.error("Must provide --user when providing --pswd.") if len(args) == 0: parser.error("Must provide subreddit name.") if options.message and not options.subject: parser.error("Must provide --subject when providing --message.") subreddit = args[0] if not options.disable_update_check: # Check for updates update_check("prawtools", __version__) modutils = ModUtils(subreddit, 
options.site, options.user, options.pswd, options.verbose) if options.add: modutils.add_users(options.add) if options.clear_empty: modutils.clear_empty() for category in options.list: modutils.output_list(category) if options.flair: modutils.output_current_flair(as_json=options.json) if options.flair_stats: modutils.output_flair_stats() if options.sync: modutils.flair_template_sync( editable=options.editable, limit=options.limit, static=options.static, sort=options.sort, use_css=not options.ignore_css, use_text=not options.ignore_text, ) if options.message: modutils.message(options.message, options.subject, options.file)
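# A minimal sketch of the update-check gating used in main() above, assuming the
# update_checker package; the parser here is a stand-in, since prawtools'
# arg_parser presumably also adds --user, --pswd, --site and --verbose.
from optparse import OptionParser
from update_checker import update_check

parser = OptionParser(usage='Usage: %prog [options] SUBREDDIT')
parser.add_option('--disable-update-check', action='store_true', default=False,
                  help='Skip the prawtools version check.')
options, args = parser.parse_args(['redditdev'])  # 'redditdev' is a placeholder subreddit
if not options.disable_update_check:
    update_check('prawtools', '0.0.0')  # placeholder version string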
def main(): # parse the command-line options and arguments user, target, options = parse_cmd_line() # Check for package updates update_check(__name__, __version__) # open connection to Reddit r = praw.Reddit(user_agent="bot by /u/{0}".format(user), disable_update_check=True) r.config.decode_html_entities = True # run analysis sys.stderr.write("Analyzing {0}\n".format(target)) sys.stderr.flush() target = target[3:] if options.is_subreddit: processSubreddit(subreddit=r.get_subreddit(target), period=options.period, limit=options.limit, count_word_freqs=options.count_word_freqs, max_threshold=options.max_threshold) else: processRedditor(redditor=r.get_redditor(target), limit=options.limit, count_word_freqs=options.count_word_freqs, max_threshold=options.max_threshold) # build a string containing all the words for the word cloud software output = "" # open output file to store the output string outFileName = target + ".csv" if options.is_subreddit: outFileName = "subreddit-" + outFileName else: outFileName = "user-" + outFileName outFile = open(outFileName, "w") # combine singular and plural forms of words into single count for word, count in popularWords.items(): # e.g.: "picture" and "pictures" if word.endswith("s"): # if the singular form of the word was used singular = word[:-1] if popularWords[singular] > 0: # combine the count into the most-used form of the word if popularWords[singular] > count: popularWords[singular] += popularWords[word] del popularWords[word] else: popularWords[word] += popularWords[singular] del popularWords[singular] # e.g.: "furry" and "furries" if word.endswith("ies"): # if the singular form of the word was used singular = word[:-3] + "y" if popularWords[singular] > 0: # combine the count into the most-used form of the word if popularWords[singular] > count: popularWords[singular] += popularWords[word] del popularWords[word] else: popularWords[word] += popularWords[singular] del popularWords[singular] for word in sorted(popularWords, key=popularWords.get, reverse=True): # tweak this number depending on the subreddit # some subreddits end up having TONS of words and it seems to overflow # the Python string buffer if popularWords[word] > 5: pri = True # don't print the word if it's just a number if word.isdigit(): pri = False # add as many copies of the word as it was mentioned in the # subreddit if pri: txt = word + ":" + str(popularWords[word]) + "\n" txt = txt.encode("UTF-8") output += txt outFile.write(txt) outFile.close() # print the series of words for the word cloud software # place this text into wordle.net if options.verbose: print(output) # save the raw word counts to a file if not options.no_raw_data: outFile = open("raw-" + outFileName, "w") for word in sorted(allWords, key=allWords.get, reverse=True): txt = word + ":" + str(allWords[word]) + "\n" txt = txt.encode("UTF-8") outFile.write(txt) outFile.close()
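# Standalone sketch of the singular/plural merging above, assuming the word
# counts live in a collections.Counter (so a missing singular form reads as 0).
from collections import Counter

popular_words = Counter({'picture': 7, 'pictures': 3, 'furries': 4, 'furry': 1})
for word, count in list(popular_words.items()):
    if word.endswith('ies'):          # e.g. "furries" -> "furry"
        singular = word[:-3] + 'y'
    elif word.endswith('s'):          # e.g. "pictures" -> "picture"
        singular = word[:-1]
    else:
        continue
    if popular_words[singular] > 0:
        # fold the counts into whichever form is more common
        if popular_words[singular] > count:
            popular_words[singular] += popular_words.pop(word)
        else:
            popular_words[word] += popular_words.pop(singular)
print(popular_words)  # Counter({'picture': 10, 'furries': 5})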
def autoclean(input_dataframe, drop_nans=False, copy=False, encoder=None, encoder_kwargs=None, ignore_update_check=False): """Performs a series of automated data cleaning transformations on the provided data set Parameters ---------- input_dataframe: pandas.DataFrame Data set to clean drop_nans: bool Drop all rows that have a NaN in any column (default: False) copy: bool Make a copy of the data set (default: False) encoder: category_encoders transformer A valid category_encoders transformer which is passed an inferred cols list. Default (None: LabelEncoder) encoder_kwargs: dict Keyword arguments to pass to the encoder when it is instantiated. Default (None) ignore_update_check: bool Do not check for the latest version of datacleaner Returns ---------- output_dataframe: pandas.DataFrame Cleaned data set """ global update_checked if ignore_update_check: update_checked = True if not update_checked: update_check('datacleaner', __version__) update_checked = True if copy: input_dataframe = input_dataframe.copy() if drop_nans: input_dataframe.dropna(inplace=True) if encoder_kwargs is None: encoder_kwargs = {} print('columns to clean:') for column in input_dataframe.columns.values: print(column) # Replace NaNs with the median or mode of the column depending on the column type try: input_dataframe[column].fillna(input_dataframe[column].median(), inplace=True) print('median fill is used') except TypeError: most_frequent = input_dataframe[column].mode() # If the mode can't be computed, use the nearest valid value # See https://github.com/rhiever/datacleaner/issues/8 if len(most_frequent) > 0: print('mode fill is used') input_dataframe[column].fillna(most_frequent[0], inplace=True) else: print('bfill and ffill are used') input_dataframe[column].fillna(method='bfill', inplace=True) input_dataframe[column].fillna(method='ffill', inplace=True) # Encode all strings with numerical equivalents if str(input_dataframe[column].values.dtype) == 'object': if encoder is not None: print('encoder set by the user is used') column_encoder = encoder(**encoder_kwargs).fit(input_dataframe[column].values) else: print('default encoding method is used') column_encoder = LabelEncoder().fit(input_dataframe[column].values) input_dataframe[column] = column_encoder.transform(input_dataframe[column].values) print('Done!') return input_dataframe
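# Usage sketch for autoclean(), assuming it is importable as
# datacleaner.autoclean; the tiny frame is made up for illustration.
import numpy as np
import pandas as pd
from datacleaner import autoclean

df = pd.DataFrame({'age': [25.0, np.nan, 40.0],
                   'city': ['york', 'paris', np.nan]})
clean = autoclean(df, copy=True, ignore_update_check=True)
# 'age' has its NaN replaced by the column median (32.5); 'city' has its NaN
# replaced by the column mode and is then label-encoded to integers.
print(clean)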
def main(): # parse the command-line options and arguments user, target, options = parse_cmd_line() # Check for package updates update_check(__name__, __version__) # open connection to Reddit handler = None if options.multiprocess: from praw.handlers import MultiprocessHandler handler = MultiprocessHandler() reddit = praw.Reddit( user_agent="/u/{0} reddit analyzer".format(user), handler=handler) reddit.config.decode_html_entities = True # run analysis sys.stderr.write("Analyzing {0}\n".format(target)) sys.stderr.flush() target = target[3:] if options.is_subreddit: process_subreddit(subreddit=reddit.get_subreddit(target), period=options.period, limit=options.limit, count_word_freqs=options.count_word_freqs, max_threshold=options.max_threshold) else: process_redditor(redditor=reddit.get_redditor(target), limit=options.limit, count_word_freqs=options.count_word_freqs, max_threshold=options.max_threshold) # build a string containing all the words for the word cloud software output = "" # open output file to store the output string out_file_name = "{0}.csv".format(target) if options.is_subreddit: out_file_name = "subreddit-{0}".format(out_file_name) else: out_file_name = "user-{0}".format(out_file_name) out_file = open(out_file_name, "w") # combine singular and plural forms of words into single count for word in list(popular_words.keys()): count = popular_words[word] # e.g.: "picture" and "pictures" if word.endswith("s"): # if the singular form of the word was used singular = word[:-1] if popular_words[singular] > 0: # combine the count into the most-used form of the word if popular_words[singular] > count: popular_words[singular] += popular_words[word] del popular_words[word] else: popular_words[word] += popular_words[singular] del popular_words[singular] # e.g.: "furry" and "furries" if word.endswith("ies"): # if the singular form of the word was used singular = "{0}y".format(word[:-3]) if popular_words[singular] > 0: # combine the count into the most-used form of the word if popular_words[singular] > count: popular_words[singular] += popular_words[word] del popular_words[word] else: popular_words[word] += popular_words[singular] del popular_words[singular] for word in sorted(popular_words, key=popular_words.get, reverse=True): # tweak this number depending on the subreddit # some subreddits end up having TONS of words and it seems to overflow # the Python string buffer if popular_words[word] > 5: pri = True # don't print the word if it's just a number if word.isdigit(): pri = False # add as many copies of the word as it was mentioned in the # subreddit if pri: out_text = str("{0}:{1}\n".format(word, popular_words[word])) output += out_text out_file.write(out_text) out_file.close() # print the series of words for the word cloud software # place this text into wordle.net if options.verbose: print(output) # save the raw word counts to a file if not options.no_raw_data: out_file = open("raw-{0}".format(out_file_name), "w") for word in sorted(all_words, key=all_words.get, reverse=True): out_text = str("{0}:{1}\n".format(word, all_words[word])) out_file.write(out_text) out_file.close()
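# Sketch of parsing the "word:count" lines written above back into a dict,
# e.g. to feed a word-cloud tool; the sample stands in for the output file.
import io

sample = io.StringIO('python:42\nreddit:17\n')
word_counts = {}
for line in sample:
    word, _, count = line.rstrip('\n').rpartition(':')
    word_counts[word] = int(count)
print(word_counts)  # {'python': 42, 'reddit': 17}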
def main(): """Provide the entry point to the modutils command.""" mod_choices = ('banned', 'contributors', 'moderators') mod_choices_dsp = ', '.join(['`%s`' % x for x in mod_choices]) msg = { 'add': ('Add users to one of the following categories: %s' % mod_choices_dsp), 'clear': 'Remove users who have no flair set.', 'css': 'Ignore the CSS field when synchronizing flair.', 'edit': 'When adding flair templates, mark them as editable.', 'file': 'The file containing contents for --message', 'flair': 'List flair for the subreddit.', 'flair_stats': 'Display the number of users with each flair.', 'json': 'Output the results as json. Applies to --flair', 'limit': ('The minimum number of users that must have the specified ' 'flair in order to add as a template. default: %default'), 'list': ('List the users in one of the following categories: ' '%s. May be specified more than once.') % mod_choices_dsp, 'msg': ('Send message to users of one of the following categories: ' '%s. Message subject provided via --subject, content provided ' 'via --file or STDIN.') % mod_choices_dsp, 'sort': ('The order to add flair templates. Available options are ' '`alpha` to add alphabetically, and `size` to first add ' 'flair that is shared by the most number of users. ' 'default: %default'), 'static': ('Add this template when syncing flair templates. When ' 'syncing text and css use a comma to separate the two.'), 'subject': 'The subject of the message to send for --message.', 'sync': 'Synchronize flair templates with current user flair.', 'text': 'Ignore the text field when synchronizing flair.'} usage = 'Usage: %prog [options] SUBREDDIT' parser = arg_parser(usage=usage) parser.add_option('-a', '--add', help=msg['add']) parser.add_option('-l', '--list', action='append', help=msg['list'], choices=mod_choices, metavar='CATEGORY', default=[]) parser.add_option('-c', '--clear-empty', action='store_true', help=msg['clear']) parser.add_option('-F', '--file', help=msg['file']) parser.add_option('-f', '--flair', action='store_true', help=msg['flair']) parser.add_option('', '--flair-stats', action='store_true', help=msg['flair_stats']) parser.add_option('-m', '--message', choices=mod_choices, help=msg['msg']) parser.add_option('', '--subject', help=msg['subject']) group = OptionGroup(parser, 'Format options') group.add_option('-j', '--json', action='store_true', help=msg['json']) parser.add_option_group(group) group = OptionGroup(parser, 'Sync options') group.add_option('', '--sync', action='store_true', help=msg['sync']) group.add_option('-s', '--static', action='append', help=msg['static']) group.add_option('', '--editable', action='store_true', help=msg['edit']) group.add_option('', '--ignore-css', action='store_true', default=False, help=msg['css']) group.add_option('', '--ignore-text', action='store_true', default=False, help=msg['text']) group.add_option('', '--limit', type='int', help=msg['limit'], default=2) group.add_option('', '--sort', action='store', choices=('alpha', 'size'), default='alpha', help=msg['sort']) parser.add_option_group(group) options, args = parser.parse_args() if options.pswd and not options.user: parser.error('Must provide --user when providing --pswd.') if len(args) == 0: parser.error('Must provide subreddit name.') if options.message and not options.subject: parser.error('Must provide --subject when providing --message.') subreddit = args[0] if not options.disable_update_check: # Check for updates update_check('prawtools', __version__) modutils = ModUtils(subreddit, options.site, 
options.user, options.pswd, options.verbose) if options.add: modutils.add_users(options.add) if options.clear_empty: modutils.clear_empty() for category in options.list: modutils.output_list(category) if options.flair: modutils.output_current_flair(as_json=options.json) if options.flair_stats: modutils.output_flair_stats() if options.sync: modutils.flair_template_sync(editable=options.editable, limit=options.limit, static=options.static, sort=options.sort, use_css=not options.ignore_css, use_text=not options.ignore_text) if options.message: modutils.message(options.message, options.subject, options.file)
def xrff2csv(input_filename, output_filename=None, sep='\t', ignore_update_check=False): """Converts the provided XRFF file to CSV format If `output_filename` is not specified, the function will print the result Parameters ---------- input_filename: str Name of the XRFF file to convert output_filename: str Name of the CSV file to output to (default: None) sep: str String to use as the separator in the CSV file (default: \t) ignore_update_check: bool Do not check for the latest version of xrff2csv Returns ---------- None """ global update_checked if ignore_update_check: update_checked = True if not update_checked: update_check('xrff2csv', __version__) update_checked = True with open(input_filename, 'r') as in_file: headers = [] values = [] for line in in_file: if 'attribute name=' in line: headers.append(line.split('"')[1]) elif '<body>' in line: # Beginning of data entries has been encountered, so output the headers headers = sep.join(headers) if output_filename is not None: with open(output_filename, 'w') as out_file: out_file.write(headers) else: sys.stdout.write(headers) # No need to store the headers in memory any more del headers elif '<value>' in line: values.append(line.split('>')[1].split('<')[0]) elif '</instance>' in line: # End of data instance reached, so output it values = os.linesep + sep.join(values) if output_filename is not None: with open(output_filename, 'a') as out_file: out_file.write(values) else: sys.stdout.write(values) # No need to store this data instance in memory any more values = [] if output_filename is not None: with open(output_filename, 'a') as out_file: out_file.write(os.linesep) else: sys.stdout.write(os.linesep)
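# Usage sketch for xrff2csv(), assuming the function above is importable from
# the xrff2csv package; the XRFF fragment is made up and kept to one tag per
# line, which is what the line-by-line parsing above expects.
from xrff2csv import xrff2csv

fragment = '\n'.join([
    '<dataset name="toy">',
    '<header>',
    '<attributes>',
    '<attribute name="height" type="numeric"/>',
    '<attribute name="class" type="nominal"/>',
    '</attributes>',
    '</header>',
    '<body>',
    '<instance>',
    '<value>1.7</value>',
    '<value>yes</value>',
    '</instance>',
    '<instance>',
    '<value>1.5</value>',
    '<value>no</value>',
    '</instance>',
    '</body>',
    '</dataset>',
])
with open('toy.xrff', 'w') as handle:
    handle.write(fragment)

xrff2csv('toy.xrff', 'toy.csv', sep=',', ignore_update_check=True)
# toy.csv now holds the rows: height,class / 1.7,yes / 1.5,no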
def autoclean_cv(training_dataframe, testing_dataframe, drop_nans=False, copy=False, encoder=None, encoder_kwargs=None, ignore_update_check=False): """Performs a series of automated data cleaning transformations on the provided training and testing data sets Unlike `autoclean()`, this function takes cross-validation into account by learning the data transformations from only the training set, then applying those transformations to both the training and testing set. By doing so, this function will prevent information leak from the training set into the testing set. Parameters ---------- training_dataframe: pandas.DataFrame Training data set testing_dataframe: pandas.DataFrame Testing data set drop_nans: bool Drop all rows that have a NaN in any column (default: False) copy: bool Make a copy of the data set (default: False) encoder: category_encoders transformer A valid category_encoders transformer which is passed an inferred cols list. Default (None: LabelEncoder) encoder_kwargs: dict Keyword arguments to pass to the encoder when it is instantiated. Default (None) ignore_update_check: bool Do not check for the latest version of datacleaner Returns ---------- output_training_dataframe: pandas.DataFrame Cleaned training data set output_testing_dataframe: pandas.DataFrame Cleaned testing data set """ global update_checked if ignore_update_check: update_checked = True if not update_checked: update_check('datacleaner', __version__) update_checked = True if set(training_dataframe.columns.values) != set(testing_dataframe.columns.values): raise ValueError('The training and testing DataFrames do not have the same columns. ' 'Make sure that you are providing the same columns.') if copy: training_dataframe = training_dataframe.copy() testing_dataframe = testing_dataframe.copy() if drop_nans: training_dataframe.dropna(inplace=True) testing_dataframe.dropna(inplace=True) if encoder_kwargs is None: encoder_kwargs = {} for column in training_dataframe.columns.values: # Replace NaNs with the median or mode of the column depending on the column type try: column_median = training_dataframe[column].median() training_dataframe[column].fillna(column_median, inplace=True) testing_dataframe[column].fillna(column_median, inplace=True) except TypeError: column_mode = training_dataframe[column].mode()[0] training_dataframe[column].fillna(column_mode, inplace=True) testing_dataframe[column].fillna(column_mode, inplace=True) # Encode all strings with numerical equivalents if str(training_dataframe[column].values.dtype) == 'object': if encoder is not None: column_encoder = encoder(**encoder_kwargs).fit(training_dataframe[column].values) else: column_encoder = LabelEncoder().fit(training_dataframe[column].values) training_dataframe[column] = column_encoder.transform(training_dataframe[column].values) testing_dataframe[column] = column_encoder.transform(testing_dataframe[column].values) return training_dataframe, testing_dataframe
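# Usage sketch for autoclean_cv(), assuming it is importable from datacleaner;
# the frames are made up. Medians, modes, and the encoder are learned from the
# training frame only and then applied to the testing frame.
import numpy as np
import pandas as pd
from datacleaner import autoclean_cv

train = pd.DataFrame({'age': [25.0, np.nan, 40.0, 31.0],
                      'city': ['york', 'paris', 'york', np.nan]})
test = pd.DataFrame({'age': [np.nan, 52.0],
                     'city': ['paris', 'york']})  # only labels seen in training
clean_train, clean_test = autoclean_cv(train, test, copy=True, ignore_update_check=True)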
def main(): """Provide the entry point to the subreddit_stats command. :returns: 0 on success, 1 otherwise """ parser = arg_parser(usage='usage: %prog [options] [SUBREDDIT]') parser.add_option('-s', '--submitters', type='int', default=5, help='Number of top submitters to display ' '[default %default]') parser.add_option('-c', '--commenters', type='int', default=10, help='Number of top commenters to display ' '[default %default]') parser.add_option('-a', '--after', help='Submission ID to fetch after') parser.add_option('-d', '--days', type='int', default=32, help=('Number of previous days to include submissions ' 'from. Use 0 for unlimited. Default: %default')) parser.add_option('-D', '--debug', action='store_true', help='Enable debugging mode. Does not post stats.') parser.add_option('-R', '--submission-reddit', help=('Subreddit to submit to. If not present, ' 'submits to the subreddit processed')) parser.add_option('-t', '--top', help=('Run on top submissions either by day, week, ' 'month, year, or all')) parser.add_option('', '--distinguished', action='store_true', help=('Include distinguished submissions and ' 'comments (default: False). Note that regular ' 'comments of distinguished submissions will still ' 'be included.')) parser.add_option('', '--no-self', action='store_true', help=('Do not include self posts (and their comments) in' ' the calculation.')) parser.add_option('', '--no-link', action='store_true', help=('Only include self posts (and their comments) in ' 'the calculation.')) parser.add_option('', '--prev', help='Statically provide the URL of previous SRS page.') parser.add_option('', '--include-prev', action='store_true', help='Don\'t try to avoid overlap with a previous SRS.') parser.add_option('-o', '--output', help='Save result csv to named file.') options, args = parser.parse_args() if len(args) != 1: sys.stdout.write('Enter subreddit name: ') sys.stdout.flush() subject_reddit = sys.stdin.readline().strip() if not subject_reddit: parser.error('No subreddit name entered') else: subject_reddit = args[0] if not options.disable_update_check: # Check for updates update_check('prawtools', __version__) print('You chose to analyze this subreddit: {0}'.format(subject_reddit)) if options.no_link and options.no_self: parser.error('You are choosing to exclude self posts but also only ' 'include self posts. Consider checking your arguments.') if options.submission_reddit: submission_reddit = options.submission_reddit else: submission_reddit = subject_reddit srs = SubRedditStats(subject_reddit, options.site, options.verbose, options.distinguished) srs.login(options.user, options.pswd) if options.prev: srs.prev_stat(options.prev) if options.top: found = srs.fetch_top_submissions(options.top, options.no_self, options.no_link) else: since_last = not options.include_prev found = srs.fetch_recent_submissions(max_duration=options.days, after=options.after, exclude_self=options.no_self, exclude_link=options.no_link, since_last=since_last) if not found: print('No submissions were found.') return 1 srs.process_submitters() if options.commenters > 0: srs.process_commenters() if options.output: srs.save_csv(options.output) srs.publish_results(submission_reddit, options.submitters, options.commenters, 5, 5, options.top, options.debug)
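# Programmatic sketch of the flow in main() above; the constructor and method
# arguments mirror how main() calls SubRedditStats, but the import path, the
# subreddit name and the literal values are assumptions.
from prawtools.stats import SubRedditStats  # assumed module path

srs = SubRedditStats('redditdev', None, 1, False)  # subreddit, site, verbose, distinguished
srs.login(None, None)  # main() passes the --user/--pswd values here (None when not supplied)
found = srs.fetch_recent_submissions(max_duration=32, after=None, exclude_self=False,
                                     exclude_link=False, since_last=True)
if found:
    srs.process_submitters()
    srs.process_commenters()
    srs.publish_results('redditdev', 5, 10, 5, 5, None, True)  # debug=True: do not post stats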