Example #1
    def __init__(self, user_agent, site_name=None, disable_update_check=False):
        """
        Initialize our connection with a reddit.

        The user_agent is how your application identifies itself. Read the
        official API guidelines for user_agents
        https://github.com/reddit/reddit/wiki/API.  Applications using default
        user_agents such as "Python/urllib" are drastically limited.

        site_name allows you to specify which reddit you want to connect to.
        The installation default is reddit.com; if you only need to connect
        to reddit.com, you can safely ignore this. If you want to connect
        to another reddit, set site_name to the name of that reddit. This must
        match with an entry in praw.ini. If site_name is None, then the site
        name will be looked for in the environment variable REDDIT_SITE. If it
        is not found there, the default site name reddit matching reddit.com
        will be used.

        disable_update_check allows you to prevent an update check from
        occurring in spite of the check_for_updates setting in praw.ini.
        """
        if not user_agent or not isinstance(user_agent, six.string_types):
            raise TypeError('User agent must be a non-empty string.')

        self.DEFAULT_HEADERS['User-agent'] = UA_STRING % user_agent
        self.config = Config(site_name or os.getenv('REDDIT_SITE') or 'reddit')
        self.http = requests.session()
        self.modhash = self.user = None

        # Check for updates if permitted and this is the first Reddit instance
        if not disable_update_check and not self.update_checked \
                and self.config.check_for_updates:
            update_check(__name__, __version__)
            self.update_checked = True
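
The site-name fallback described in the docstring above boils down to the single expression on the `self.config = Config(...)` line. A minimal sketch of that resolution order (explicit argument, then the REDDIT_SITE environment variable, then the default 'reddit'); the helper name is hypothetical:

import os

# Hypothetical helper mirroring the fallback used above:
# explicit site_name -> REDDIT_SITE environment variable -> default 'reddit'.
def resolve_site_name(site_name=None):
    return site_name or os.getenv('REDDIT_SITE') or 'reddit'

os.environ['REDDIT_SITE'] = 'staging'
print(resolve_site_name('production'))  # 'production' (explicit argument wins)
print(resolve_site_name())              # 'staging'    (environment variable)
del os.environ['REDDIT_SITE']
print(resolve_site_name())              # 'reddit'     (default)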
Example #2
def autoclean(input_dataframe, drop_nans=False, copy=False, encoder=None,
              encoder_kwargs=None, ignore_update_check=False):
    """Performs a series of automated data cleaning transformations on the provided data set

    Parameters
    ----------
    input_dataframe: pandas.DataFrame
        Data set to clean
    drop_nans: bool
        Drop all rows that have a NaN in any column (default: False)
    copy: bool
        Make a copy of the data set (default: False)
    encoder: category_encoders transformer
        A valid category_encoders transformer, which is passed an inferred list of categorical columns (default: None, which uses LabelEncoder)
    encoder_kwargs: dict
        Keyword arguments passed to the encoder when it is instantiated (default: None)
    ignore_update_check: bool
        Do not check for the latest version of datacleaner

    Returns
    ----------
    output_dataframe: pandas.DataFrame
        Cleaned data set

    """
    global update_checked
    if ignore_update_check:
        update_checked = True

    if not update_checked:
        update_check('datacleaner', __version__)
        update_checked = True

    if copy:
        input_dataframe = input_dataframe.copy()

    if drop_nans:
        input_dataframe.dropna(inplace=True)

    if encoder_kwargs is None:
        encoder_kwargs = {}

    for column in input_dataframe.columns.values:
        # Replace NaNs with the median or mode of the column depending on the column type
        try:
            input_dataframe[column].fillna(input_dataframe[column].median(), inplace=True)
        except TypeError:
            input_dataframe[column].fillna(input_dataframe[column].mode()[0], inplace=True)

        # Encode all strings with numerical equivalents
        if str(input_dataframe[column].values.dtype) == 'object':
            if encoder is not None:
                column_encoder = encoder(**encoder_kwargs).fit(input_dataframe[column].values)
            else:
                column_encoder = LabelEncoder().fit(input_dataframe[column].values)

            input_dataframe[column] = column_encoder.transform(input_dataframe[column].values)

    return input_dataframe
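
A usage sketch for the autoclean() defined above; the DataFrame contents are made up, and ignore_update_check=True simply skips the version check this page documents:

import numpy as np
import pandas as pd

# Hypothetical input: a numeric column with a missing value and a string column.
df = pd.DataFrame({
    'age': [23.0, np.nan, 41.0, 35.0],
    'color': ['red', 'blue', 'red', 'green'],
})

cleaned = autoclean(df, copy=True, ignore_update_check=True)
# The NaN in 'age' is replaced with the column median (35.0) and
# 'color' is label-encoded to integers by the default LabelEncoder.
print(cleaned)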
Example #3
 def test_update_check__successful(self):
     prev_stdout = sys.stdout
     sys.stdout = StringIO()
     try:
         update_check(self.TRACKED_PACKAGE, '0.0.1', bypass_cache=True)
     finally:
         result = sys.stdout
         sys.stdout = prev_stdout
     self.assertTrue(len(result.getvalue()) > 0)
Example #4
 def test_update_check_failed(self):
     prev_stdout = sys.stdout
     sys.stdout = StringIO()
     try:
         update_check('update_checker_slkdflj', '0.0.1')
     finally:
         result = sys.stdout
         sys.stdout = prev_stdout
     self.assertTrue(len(result.getvalue()) == 0)
Example #5
 def test_update_check_successful(self):
     prev_stdout = sys.stdout
     sys.stdout = StringIO()
     try:
         update_check('update_checker', '0.0.1')
     finally:
         result = sys.stdout
         sys.stdout = prev_stdout
     self.assertTrue(len(result.getvalue()) > 0)
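
The three unittest examples above capture stdout by swapping sys.stdout by hand. A sketch of the same check written with contextlib.redirect_stdout; it assumes, as those tests do, that this version of update_checker prints its notice to stdout rather than stderr:

from contextlib import redirect_stdout
from io import StringIO

from update_checker import update_check

buffer = StringIO()
with redirect_stdout(buffer):
    update_check('update_checker', '0.0.1')
# Under the stdout assumption above, an outdated-version notice ends up in the buffer.
print('captured:', repr(buffer.getvalue()))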
Example #6
def autoclean(input_dataframe, drop_nans=False, copy=False, encoder=None, encoder_kwargs=None):
    """Performs a series of automated data cleaning transformations on the provided data set

    Parameters
    ----------
    input_dataframe: pandas.DataFrame
        Data set to clean

    drop_nans: bool
        Drop all rows that have a NaN in any column (default: False)

    copy: bool
        Make a copy of the data set (default: False)

    encoder: category_encoders transformer
        A valid category_encoders transformer, which is passed an inferred list of categorical columns (default: None, which uses LabelEncoder)

    encoder_kwargs: dict
        Keyword arguments passed to the encoder when it is instantiated (default: None)

    Returns
    ----------
    output_dataframe: pandas.DataFrame
        Cleaned data set

    """
    global update_checked
    if not update_checked:
        update_check('datacleaner', __version__)
        update_checked = True

    if encoder_kwargs is None:
        encoder_kwargs = {}

    if encoder is None:
        encoder = LabelEncoder

    if copy:
        input_dataframe = input_dataframe.copy()
    
    if drop_nans:
        input_dataframe.dropna(inplace=True)

    for column in input_dataframe.columns.values:
        # Replace NaNs with the median or mode of the column depending on the column type
        # If there are very many levels in the column, then it is probably continuous
        if len(input_dataframe[column].unique()) > 0.2 * len(input_dataframe):
            input_dataframe[column].fillna(input_dataframe[column].median(), inplace=True)
        else:
            input_dataframe[column].fillna(input_dataframe[column].mode()[0], inplace=True)

        # Encode all strings with numerical equivalents
        if str(input_dataframe[column].values.dtype) == 'object':
            input_dataframe[column] = encoder(**encoder_kwargs).fit_transform(input_dataframe[column].values)

    return input_dataframe
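
This variant replaces the try/except from Example #2 with a heuristic: a column is treated as continuous (median fill) when its number of unique values exceeds 20% of the row count, and as categorical (mode fill) otherwise. A small illustration with made-up data:

import pandas as pd

df = pd.DataFrame({
    'price': [1.0, 2.5, 3.7, 4.2, 5.9, 6.1, 7.3, 8.8, 9.4, 10.0],  # 10 unique values
    'grade': ['a', 'b', 'a', 'a', 'b', 'b', 'a', 'b', 'a', 'a'],   # 2 unique values
})

threshold = 0.2 * len(df)  # 2.0 for 10 rows
print(df['price'].nunique() > threshold)  # True  -> continuous, NaNs filled with the median
print(df['grade'].nunique() > threshold)  # False -> categorical, NaNs filled with the mode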
Example #7
    def __init__(self, config_section, plugin_dir, enable_logging):
        if not self.update_checked:
            update_check(__name__, __version__)
            self.update_checked = True

        self.start_time = datetime.utcnow()

        if plugin_dir:
            if os.path.isdir(plugin_dir):
                sys.path.append(plugin_dir)
            else:
                print ("`{0}` is not a directory.".format(plugin_dir))

        config = self._get_config(config_section)
        self._delayed_events = []
        self._loaded_plugins = {}
        self.api = Bot(config["auth_id"], config["user_id"], rate_limit=0.575)
        self.api.debug = enable_logging
        self.api.on("add_dj", self.handle_add_dj)
        self.api.on("booted_user", self.handle_booted_user)
        self.api.on("deregistered", self.handle_user_leave)
        self.api.on("new_moderator", self.handle_add_moderator)
        self.api.on("post_message", self.run_delayed_events)
        self.api.on("pmmed", self.handle_pm)
        self.api.on("ready", self.handle_ready)
        self.api.on("registered", self.handle_user_join)
        self.api.on("rem_dj", self.handle_remove_dj)
        self.api.on("rem_moderator", self.handle_remove_moderator)
        self.api.on("roomChanged", self.handle_room_change)
        self.api.on("speak", self.handle_room_message)
        self.bot_id = config["user_id"]
        self.commands = {
            "/about": self.cmd_about,
            "/commands": self.cmd_commands,
            "/help": self.cmd_help,
            "/join": self.cmd_join,
            "/leave": self.cmd_leave,
            "/pgload": self.cmd_plugin_load,
            "/pgreload": self.cmd_plugin_reload,
            "/pgunload": self.cmd_plugin_unload,
            "/plugins": self.cmd_plugins,
            "/uptime": self.cmd_uptime,
        }
        self.config = config
        self.dj_ids = set()
        self.listener_ids = set()
        self.max_djs = None
        self.moderator_ids = set()
        self.username = None

        # Load plugins after everything has been initialized
        for plugin in config["plugins"].split("\n"):
            self.load_plugin(plugin)

        self.api.connect(config["room_id"])
        self.api.ws.on_error = handle_error
Example #8
 def test_update_check__untracked_package(self):
     prev_stdout = sys.stdout
     sys.stdout = StringIO()
     try:
         update_check(self.UNTRACKED_PACKAGE, '0.0.1', bypass_cache=True)
     finally:
         result = sys.stdout
         sys.stdout = prev_stdout
     self.assertEqual("update_checker does not support 'requests'\n",
                      result.getvalue())
Example #9
 def test_update_check__unsuccessful(self):
     prev_stdout = sys.stdout
     sys.stdout = StringIO()
     try:
         update_check(self.TRACKED_PACKAGE, '0.0.1', bypass_cache=True,
                      url='http://sdlkjsldfkjsdlkfj.com')
     finally:
         result = sys.stdout
         sys.stdout = prev_stdout
     self.assertTrue(len(result.getvalue()) == 0)
Example #10
 def test_update_check__untracked_package(self):
     prev_stdout = sys.stdout
     sys.stdout = StringIO()
     try:
         update_check(self.UNTRACKED_PACKAGE, '0.0.1', bypass_cache=True)
     finally:
         result = sys.stdout
         sys.stdout = prev_stdout
     self.assertEqual("update_checker does not support 'requests'\n",
                      result.getvalue())
Example #11
    def __init__(self, config_section, plugin_dir, enable_logging):
        if not self.update_checked:
            update_check(__name__, __version__)
            self.update_checked = True

        self.start_time = datetime.utcnow()

        if plugin_dir:
            if os.path.isdir(plugin_dir):
                sys.path.append(plugin_dir)
            else:
                print('`{0}` is not a directory.'.format(plugin_dir))

        config = self._get_config(config_section)
        self._delayed_events = []
        self._loaded_plugins = {}
        self.api = Bot(config['auth_id'], config['user_id'], rate_limit=0.575)
        self.api.debug = enable_logging
        self.api.on('add_dj', self.handle_add_dj)
        self.api.on('booted_user', self.handle_booted_user)
        self.api.on('deregistered', self.handle_user_leave)
        self.api.on('new_moderator', self.handle_add_moderator)
        self.api.on('post_message', self.run_delayed_events)
        self.api.on('pmmed', self.handle_pm)
        self.api.on('ready', self.handle_ready)
        self.api.on('registered', self.handle_user_join)
        self.api.on('rem_dj', self.handle_remove_dj)
        self.api.on('rem_moderator', self.handle_remove_moderator)
        self.api.on('roomChanged', self.handle_room_change)
        self.api.on('speak', self.handle_room_message)
        self.bot_id = config['user_id']
        self.commands = {'/about': self.cmd_about,
                         '/commands': self.cmd_commands,
                         '/help': self.cmd_help,
                         '/join': self.cmd_join,
                         '/leave': self.cmd_leave,
                         '/pgload': self.cmd_plugin_load,
                         '/pgreload': self.cmd_plugin_reload,
                         '/pgunload': self.cmd_plugin_unload,
                         '/plugins': self.cmd_plugins,
                         '/uptime': self.cmd_uptime}
        self.config = config
        self.dj_ids = set()
        self.listener_ids = set()
        self.max_djs = None
        self.moderator_ids = set()
        self.username = None

        # Load plugins after everything has been initialized
        for plugin in config['plugins'].split('\n'):
            self.load_plugin(plugin)

        self.api.connect(config['room_id'])
        self.api.ws.on_error = handle_error
Example #12
 def test_update_check__unsuccessful(self):
     prev_stdout = sys.stdout
     sys.stdout = StringIO()
     try:
         update_check(self.TRACKED_PACKAGE,
                      '0.0.1',
                      bypass_cache=True,
                      url='http://sdlkjsldfkjsdlkfj.com')
     finally:
         result = sys.stdout
         sys.stdout = prev_stdout
     self.assertTrue(len(result.getvalue()) == 0)
Example #13
def main():
    """Provide the entry point to the hackday_bot command."""
    args = docopt(__doc__, version='hackday_bot v{}'.format(__version__))
    logger = prepare_logger('DEBUG' if args['--debug'] else 'INFO')
    update_check(__package__, __version__)

    reddit = praw.Reddit(args['SITE'],
                         check_for_updates=False,
                         user_agent='hackday_bot/{}'.format(__version__))
    subreddit = reddit.subreddit(args['SUBREDDIT'])
    try:
        subreddit.name
    except PrawcoreException:
        logger.error('Invalid subreddit: {}'.format(args['SUBREDDIT']))
        return 1
    return Bot(subreddit).run()
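
main() pulls SITE, SUBREDDIT, and --debug out of docopt(__doc__, ...), so the module docstring has to declare them. A hypothetical docstring that would satisfy that call (the exact wording and options are assumptions):

"""hackday_bot

Usage:
  hackday_bot [--debug] SITE SUBREDDIT

Options:
  --debug  Emit debug-level log output.
"""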
Example #14
def autoclean(input_dataframe, drop_nans=False, copy=False, encoder=None,
              encoder_kwargs=None, ignore_update_check=False, **kwargs):
    """Performs a series of automated data cleaning transformations on the provided data set

    Parameters
    ----------
    input_dataframe: pandas.DataFrame
        Data set to clean
    drop_nans: bool
        Drop all rows that have a NaN in any column (default: False)
    copy: bool
        Make a copy of the data set (default: False)
    encoder: category_encoders transformer
        A valid category_encoders transformer, which is passed an inferred list of categorical columns (default: None, which uses LabelEncoder)
    encoder_kwargs: dict
        Keyword arguments passed to the encoder when it is instantiated (default: None)
    ignore_update_check: bool
        Do not check for the latest version of datacleaner

    fill_func: function, method, or string naming an entry in 'full_func_list'
        Function used to compute the fill value for NaNs in numeric columns (default: 'median')

    Returns
    ----------
    output_dataframe: pandas.DataFrame
        Cleaned data set

    """
    global update_checked
    if ignore_update_check:
        update_checked = True

    if not update_checked:
        update_check('datacleaner', __version__)
        update_checked = True

    if copy:
        input_dataframe = input_dataframe.copy()

    if drop_nans:
        input_dataframe.dropna(inplace=True)

    if encoder_kwargs is None:
        encoder_kwargs = {}

    fill_func = kwargs.pop('fill_func', 'median')

    import inspect
    assert inspect.isfunction(fill_func) or inspect.ismethod(fill_func) or type(fill_func) == str

    full_func_list = [
        'sum', 'max', 'min', 'argmax', 'argmin', 'mean',
        'median','prod'
    ]

    if type(fill_func) == str and fill_func in full_func_list:
        fill_func = "nan{func}".format(func=fill_func)
        mod = __import__("numpy.lib.nanfunctions", fromlist=[fill_func])
        fill_func = getattr(mod, fill_func)

    for column in input_dataframe.columns.values:
        # Replace NaNs with the median or mode of the column depending on the column type
        try:
            input_dataframe[column].fillna(fill_func(input_dataframe[column]), inplace=True)
        except TypeError:
            most_frequent = input_dataframe[column].mode()
            # If the mode can't be computed, use the nearest valid value
            # See https://github.com/rhiever/datacleaner/issues/8
            if len(most_frequent) > 0:
                input_dataframe[column].fillna(most_frequent[0], inplace=True)
            else:
                input_dataframe[column].fillna(method='bfill', inplace=True)
                input_dataframe[column].fillna(method='ffill', inplace=True)


        # Encode all strings with numerical equivalents
        if str(input_dataframe[column].values.dtype) == 'object':
            if encoder is not None:
                column_encoder = encoder(**encoder_kwargs).fit(input_dataframe[column].values)
            else:
                column_encoder = LabelEncoder().fit(input_dataframe[column].values)

            input_dataframe[column] = column_encoder.transform(input_dataframe[column].values)

    return input_dataframe
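
A usage sketch for the fill_func keyword this variant adds; it assumes the autoclean() above is in scope and shows both a named numpy reducer and a custom callable (the data is made up):

import numpy as np
import pandas as pd

df = pd.DataFrame({'score': [1.0, np.nan, 3.0, 5.0]})

# Named reducer: 'mean' is resolved to numpy.lib.nanfunctions.nanmean by the code above.
cleaned_mean = autoclean(df, copy=True, ignore_update_check=True, fill_func='mean')

# Custom callable: any function mapping a column to a scalar fill value is accepted.
def column_minimum(column):
    return column.min()

cleaned_min = autoclean(df, copy=True, ignore_update_check=True, fill_func=column_minimum)
print(cleaned_mean['score'].tolist())  # [1.0, 3.0, 3.0, 5.0]
print(cleaned_min['score'].tolist())   # [1.0, 1.0, 3.0, 5.0]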
Example #15
def autoclean_cv(training_dataframe, testing_dataframe, drop_nans=False, copy=False, encoder=None, encoder_kwargs=None):
    """Performs a series of automated data cleaning transformations on the provided training and testing data sets

    Unlike `autoclean()`, this function takes cross-validation into account by learning the data transformations
    from only the training set, then applying those transformations to both the training and testing set.
    By doing so, this function will prevent information leak from the training set into the testing set.

    Parameters
    ----------
    training_dataframe: pandas.DataFrame
        Training data set

    testing_dataframe: pandas.DataFrame
        Testing data set

    drop_nans: bool
        Drop all rows that have a NaN in any column (default: False)

    copy: bool
        Make a copy of the data set (default: False)

    encoder: category_encoders transformer
        A valid category_encoders transformer, which is passed an inferred list of categorical columns (default: None, which uses LabelEncoder)

    encoder_kwargs: dict
        Keyword arguments passed to the encoder when it is instantiated (default: None)

    Returns
    ----------
    output_training_dataframe: pandas.DataFrame
        Cleaned training data set

    output_testing_dataframe: pandas.DataFrame
        Cleaned testing data set

    """
    global update_checked
    if not update_checked:
        update_check('datacleaner', __version__)
        update_checked = True

    if set(training_dataframe.columns.values) != set(testing_dataframe.columns.values):
        raise ValueError('The training and testing DataFrames do not have the same columns. '
                         'Make sure that you are providing the same columns.')

    if encoder_kwargs is None:
        encoder_kwargs = {}

    if encoder is None:
        encoder = LabelEncoder

    if copy:
        training_dataframe = training_dataframe.copy()
        testing_dataframe = testing_dataframe.copy()
    
    if drop_nans:
        training_dataframe.dropna(inplace=True)
        testing_dataframe.dropna(inplace=True)

    for column in training_dataframe.columns.values:
        # Replace NaNs with the median or mode of the column depending on the column type
        # If there are very many levels in the column, then it is probably continuous
        if len(training_dataframe[column].unique()) > 0.2 * len(training_dataframe):
            column_median = training_dataframe[column].median()
            training_dataframe[column].fillna(column_median, inplace=True)
            testing_dataframe[column].fillna(column_median, inplace=True)
        else:
            column_mode = training_dataframe[column].mode()[0]
            training_dataframe[column].fillna(column_mode, inplace=True)
            testing_dataframe[column].fillna(column_mode, inplace=True)

        # Encode all strings with numerical equivalents
        if str(training_dataframe[column].values.dtype) == 'object':
            column_label_encoder = encoder(**encoder_kwargs).fit(training_dataframe[column].values)
            training_dataframe[column] = column_label_encoder.transform(training_dataframe[column].values)
            testing_dataframe[column] = column_label_encoder.transform(testing_dataframe[column].values)

    return training_dataframe, testing_dataframe
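
A sketch of the intended workflow for autoclean_cv() above: split first, then let the function learn fill values and encoders from the training frame only. The column names and split are illustrative:

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

data = pd.DataFrame({
    'feature': [1.0, 2.0, np.nan, 4.0, 5.0, np.nan, 7.0, 8.0,
                9.0, 10.0, 11.0, 12.0, np.nan, 14.0, 15.0, 16.0],
    'label': ['x', 'y'] * 8,
})

train, test = train_test_split(data, test_size=0.25, random_state=0)

# Medians, modes, and the label encoder are learned on `train` only and then
# applied to both frames, which is the leak protection described in the docstring.
clean_train, clean_test = autoclean_cv(train, test, copy=True)
print(clean_train.head())
print(clean_test.head())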
Example #16
def main():
    """Provide the entry point into the reddit_alert program."""
    usage = 'Usage: %prog [options] KEYWORD...'
    parser = arg_parser(usage=usage)
    parser.add_option('-s', '--subreddit', action='append',
                      help=('When at least one `-s` option is provided '
                            '(multiple can be) only alert for comments in the '
                            'indicated subreddit(s).'))
    parser.add_option('-I', '--ignore-user', action='append', metavar='USER',
                      help=('Ignore comments from the provided user. Can be '
                            'supplied multiple times.'))
    parser.add_option('-m', '--message', metavar='USER',
                      help=('When set, send a reddit message to USER with the '
                            'alert. Requires the alert script to login.'))
    options, args = parser.parse_args()
    if not args:
        parser.error('At least one KEYWORD must be provided.')

    # Create the reddit session, and login if necessary
    session = praw.Reddit('reddit_alert (prawtools {0})'.format(__version__),
                          site_name=options.site, disable_update_check=True)
    if options.message:
        session.login(options.user, options.pswd)
        msg_to = session.get_redditor(options.message)

    # Check for updates
    if not options.disable_update_check:
        update_check('prawtools', __version__)

    # Build regex
    args = [x.lower() for x in args]
    reg_prefix = r'(?:^|[^a-z])'  # Any character (or start) can precede
    reg_suffix = r'(?:$|[^a-z])'  # Any character (or end) can follow
    regex = re.compile(r'{0}({1}){2}'.format(reg_prefix, '|'.join(args),
                                             reg_suffix), re.IGNORECASE)

    # Determine subreddit or multireddit
    if options.subreddit:
        subreddit = '+'.join(sorted(options.subreddit))
    else:
        subreddit = 'all'

    print('Alerting on:')
    for item in sorted(args):
        print(' * {0}'.format(item))
    print('using the comment stream: http://www.reddit.com/r/{0}/comments'
          .format(subreddit))

    # Build ignore set
    if options.ignore_user:
        ignore_users = set(x.lower() for x in options.ignore_user)
    else:
        ignore_users = set()

    try:
        for comment in praw.helpers.comment_stream(session, subreddit,
                                                   verbosity=options.verbose):
            if comment.author and comment.author.name.lower() in ignore_users:
                continue
            match = regex.search(comment.body)
            if match:
                keyword = match.group(1).lower()
                url = quick_url(comment)
                print('{0}: {1}'.format(keyword, url))
                if options.message:
                    msg_to.send_message(
                        'Reddit Alert: {0}'.format(keyword),
                        '{0}\n\nby /u/{1}\n\n---\n\n{2}'.format(
                            url, comment.author, comment.body))
    except KeyboardInterrupt:
        sys.stderr.write('\n')
        print('Goodbye!\n')
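
The regex assembled above only fires when a keyword is bounded by non-letters, so substrings of longer words are ignored. A small demonstration with a made-up keyword list:

import re

keywords = ['praw', 'bot']
reg_prefix = r'(?:^|[^a-z])'  # any non-letter (or start of string) may precede
reg_suffix = r'(?:$|[^a-z])'  # any non-letter (or end of string) may follow
regex = re.compile(r'{0}({1}){2}'.format(reg_prefix, '|'.join(keywords), reg_suffix),
                   re.IGNORECASE)

print(regex.search('I love PRAW!').group(1))  # 'PRAW' -> matched case-insensitively
print(regex.search('prawtools is great'))     # None  -> 'praw' inside a longer word is skipped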
Example #17
# -*- coding: utf-8 -*-
'''
Copyright 2016 Randal S. Olson

This file is part of the TPOT library.

The TPOT library is free software: you can redistribute it and/or
modify it under the terms of the GNU General Public License as published by the
Free Software Foundation, either version 3 of the License, or (at your option)
any later version.

The TPOT library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
You should have received a copy of the GNU General Public License along with
the TPOT library. If not, see http://www.gnu.org/licenses/.
'''

from ._version import __version__
from .tpot import TPOT, main
from update_checker import update_check

# Prompt the user if their version is out of date
update_check('tpot', __version__)
Example #18
 def _check_for_update(self):
     if UPDATE_CHECKER_MISSING:
         return
     if not Reddit.update_checked and self.config.check_for_updates:
         update_check(__package__, __version__)
         Reddit.update_checked = True
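
A self-contained sketch of the pattern shown here (and in Examples #1, #7, and #11): treat update_checker as an optional dependency and use a class-level flag so the check runs at most once per process. The class name, package name, and version string are placeholders, and the config check from the original is omitted for brevity:

try:
    from update_checker import update_check
    UPDATE_CHECKER_MISSING = False
except ImportError:  # update_checker is an optional dependency
    UPDATE_CHECKER_MISSING = True


class Client(object):
    update_checked = False  # shared across all instances of the class

    def _check_for_update(self):
        if UPDATE_CHECKER_MISSING:
            return
        if not Client.update_checked:
            update_check('example_package', '1.0.0')  # placeholder package name and version
            Client.update_checked = True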
Example #19
    def _fit_init(self):
        # initialization for fit function
        if not self.warm_start or not hasattr(self, '_pareto_front'):
            self._pop = []
            self._pareto_front = None
            self._last_optimized_pareto_front = None
            self._last_optimized_pareto_front_n_gens = 0

        self._optimized_pipeline = None
        self._optimized_pipeline_score = None
        self._exported_pipeline_text = ""
        self.fitted_pipeline_ = None
        self._fitted_imputer = None
        self._imputed = False
        self._memory = None # initial Memory setting for sklearn pipeline

        # don't save periodic pipelines more often than this
        self._output_best_pipeline_period_seconds = 30

        # Try crossover and mutation at most this many times for
        # any one given individual (or pair of individuals)
        self._max_mut_loops = 50

        self._setup_config(self.config_dict)

        self.operators = []
        self.arguments = []
        for key in sorted(self._config_dict.keys()):
            op_class, arg_types = TPOTOperatorClassFactory(
                key,
                self._config_dict[key],
                BaseClass=Operator,
                ArgBaseClass=ARGType
            )
            if op_class:
                self.operators.append(op_class)
                self.arguments += arg_types

        # Schedule TPOT to run for many generations if the user specifies a
        # run-time limit; TPOT will automatically interrupt itself when the
        # timer runs out.
        if self.max_time_mins is not None:
            self.generations = 1000000

        # Prompt the user if their version is out of date
        if not self.disable_update_check:
            update_check('tpot', __version__)

        if self.mutation_rate + self.crossover_rate > 1:
            raise ValueError(
                'The sum of the crossover and mutation probabilities must be <= 1.0.'
            )

        self.operators_context = {
            'make_pipeline': make_pipeline,
            'make_union': make_union,
            'StackingEstimator': StackingEstimator,
            'FunctionTransformer': FunctionTransformer,
            'copy': copy
        }

        self._pbar = None
        # Specifies where to output the progress messages (default: sys.stdout).
        # This may be exposed as a public option in a future version of TPOT (io.TextIOWrapper or io.StringIO).
        self._file = sys.stdout

        # Dictionary of individuals that have already been evaluated in previous
        # generations
        self.evaluated_individuals_ = {}

        self._setup_scoring_function(self.scoring)

        if self.subsample <= 0.0 or self.subsample > 1.0:
            raise ValueError(
                'The subsample ratio of the training instance must be in the range (0.0, 1.0].'
            )

        if self.n_jobs == -1:
            self._n_jobs = cpu_count()
        else:
            self._n_jobs = self.n_jobs

        self._setup_pset()
        self._setup_toolbox()

        ## Additions to _fit_init
        # Initialise list to save the predictions and pipelines analysed by TPOT
        self.predictions = []
        self.pipelines = []
        self._exported_pipeline_text = []
        # Save training sample on the TPOT Object
        self.features = None
        self.target = None
        self.evaluated_individuals = {}
        self.curr_generations = 0
        self.log = {}

        # Add the Gaussian kernels so that they can be used by TPOT
        self.operators_context['RBF'] = RBF
        self.operators_context['Matern'] = Matern
        self.operators_context['RationalQuadratic'] = RationalQuadratic
        self.operators_context['ExpSineSquared'] = ExpSineSquared
        self.operators_context['DotProduct'] = DotProduct
        self.operators_context['ConstantKernel'] = ConstantKernel
Example #20
    def __init__(self, population_size=100, generations=100,
                 mutation_rate=0.9, crossover_rate=0.05,
                 scoring=None, num_cv_folds=3, max_time_mins=None, max_eval_time_mins=5,
                 random_state=None, verbosity=0,
                 disable_update_check=False):
        """Sets up the genetic programming algorithm for pipeline optimization.

        Parameters
        ----------
        population_size: int (default: 100)
            The number of pipelines in the genetic algorithm population. Must
            be > 0. The more pipelines in the population, the slower TPOT will
            run, but it's also more likely to find better pipelines.
        generations: int (default: 100)
            The number of generations to run pipeline optimization for. Must
            be > 0. The more generations you give TPOT to run, the longer it
            takes, but it's also more likely to find better pipelines.
        mutation_rate: float (default: 0.9)
            The mutation rate for the genetic programming algorithm in the range
            [0.0, 1.0]. This tells the genetic programming algorithm how many
            pipelines to apply random changes to every generation. We don't
            recommend that you tweak this parameter unless you know what you're
            doing.
        crossover_rate: float (default: 0.05)
            The crossover rate for the genetic programming algorithm in the
            range [0.0, 1.0]. This tells the genetic programming algorithm how
            many pipelines to "breed" every generation. We don't recommend that
            you tweak this parameter unless you know what you're doing.
        scoring: function or str
            Function used to evaluate the quality of a given pipeline for the
            problem. By default, balanced class accuracy is used for
            classification problems, mean squared error for regression problems.
            TPOT assumes that this scoring function should be maximized, i.e.,
            higher is better.

            Offers the same options as sklearn.model_selection.cross_val_score:

            ['accuracy', 'adjusted_rand_score', 'average_precision', 'f1',
            'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted',
            'precision', 'precision_macro', 'precision_micro', 'precision_samples',
            'precision_weighted', 'r2', 'recall', 'recall_macro', 'recall_micro',
            'recall_samples', 'recall_weighted', 'roc_auc']
        num_cv_folds: int (default: 3)
            The number of folds to evaluate each pipeline over in k-fold
            cross-validation during the TPOT pipeline optimization process
        max_time_mins: int (default: None)
            How many minutes TPOT has to optimize the pipeline. If not None,
            this setting will override the `generations` parameter.
        max_eval_time_mins: int (default: 5)
            How many minutes TPOT has to optimize a single pipeline.
            Setting this parameter to higher values will allow TPOT to explore more complex
            pipelines but will also allow TPOT to run longer.
        random_state: int (default: 0)
            The random number generator seed for TPOT. Use this to make sure
            that TPOT will give you the same results each time you run it
            against the same data set with that seed.
        verbosity: int (default: 0)
            How much information TPOT communicates while it's running.
            0 = none, 1 = minimal, 2 = all
        disable_update_check: bool (default: False)
            Flag indicating whether the TPOT version checker should be disabled.

        Returns
        -------
        None

        """
        if self.__class__.__name__ == 'TPOTBase':
            raise RuntimeError('Do not instantiate the TPOTBase class directly; '
                               'use TPOTRegressor or TPOTClassifier instead.')

        # Prompt the user if their version is out of date
        self.disable_update_check = disable_update_check
        if not self.disable_update_check:
            update_check('tpot', __version__)

        self._hof = None
        self._optimized_pipeline = None
        self._fitted_pipeline = None
        self.population_size = population_size
        self.generations = generations
        self.max_time_mins = max_time_mins
        self.max_eval_time_mins = max_eval_time_mins

        # Schedule TPOT to run for a very long time if the user specifies a
        # run-time limit; TPOT will automatically interrupt itself when the
        # timer runs out.
        if max_time_mins is not None:
            self.generations = 1000000

        self.mutation_rate = mutation_rate
        self.crossover_rate = crossover_rate
        self.verbosity = verbosity
        self.operators_context = {
            'make_pipeline': make_pipeline,
            'make_union': make_union,
            'VotingClassifier': VotingClassifier,
            'FunctionTransformer': FunctionTransformer
        }

        self._pbar = None
        self._gp_generation = 0

        self.random_state = random_state

        # If the user passed a custom scoring function, store it in the sklearn SCORERS dictionary
        if scoring:
            if hasattr(scoring, '__call__'):
                scoring_name = scoring.__name__

                if 'loss' in scoring_name or 'error' in scoring_name:
                    greater_is_better = False
                else:
                    greater_is_better = True

                SCORERS[scoring_name] = make_scorer(scoring, greater_is_better=greater_is_better)
                self.scoring_function = scoring_name
            else:
                self.scoring_function = scoring

        self.num_cv_folds = num_cv_folds

        self._setup_pset()
        self._setup_toolbox()
Example #21
def main():
    """Provide the entry point to the subreddit_stats command.

    :returns: 0 on success, 1 otherwise

    """
    parser = arg_parser(usage='usage: %prog [options] [SUBREDDIT]')
    parser.add_option('-s',
                      '--submitters',
                      type='int',
                      default=5,
                      help='Number of top submitters to display '
                      '[default %default]')
    parser.add_option('-c',
                      '--commenters',
                      type='int',
                      default=10,
                      help='Number of top commenters to display '
                      '[default %default]')
    parser.add_option('-a', '--after', help='Submission ID to fetch after')
    parser.add_option('-d',
                      '--days',
                      type='int',
                      default=32,
                      help=('Number of previous days to include submissions '
                            'from. Use 0 for unlimited. Default: %default'))
    parser.add_option('-D',
                      '--debug',
                      action='store_true',
                      help='Enable debugging mode. Does not post stats.')
    parser.add_option('-R',
                      '--submission-reddit',
                      help=('Subreddit to submit to. If not present, '
                            'submits to the subreddit processed'))
    parser.add_option('-t',
                      '--top',
                      help=('Run on top submissions either by day, week, '
                            'month, year, or all'))
    parser.add_option('',
                      '--distinguished',
                      action='store_true',
                      help=('Include distinguished submissions and '
                            'comments (default: False). Note that regular '
                            'comments of distinguished submissions will still '
                            'be included.'))
    parser.add_option('',
                      '--no-self',
                      action='store_true',
                      help=('Do not include self posts (and their comments) in'
                            ' the calculation.'))
    parser.add_option('',
                      '--no-link',
                      action='store_true',
                      help=('Only include self posts (and their comments) in '
                            'the calculation.'))
    parser.add_option('',
                      '--prev',
                      help='Provide the submission id of previous SRS page.')
    parser.add_option('',
                      '--include-prev',
                      action='store_true',
                      help='Don\'t try to avoid overlap with a previous SRS.')
    parser.add_option('-o', '--output', help='Save result csv to named file.')

    options, args = parser.parse_args()
    if len(args) != 1:
        sys.stdout.write('Enter subreddit name: ')
        sys.stdout.flush()
        subject_reddit = sys.stdin.readline().strip()
        if not subject_reddit:
            parser.error('No subreddit name entered')
    else:
        subject_reddit = args[0]

    if not options.disable_update_check:  # Check for updates
        update_check('prawtools', __version__)

    print('You chose to analyze this subreddit: {}'.format(subject_reddit))

    if options.no_link and options.no_self:
        parser.error('You are choosing to exclude self posts but also only '
                     'include self posts. Consider checking your arguments.')

    if options.submission_reddit:
        submission_reddit = options.submission_reddit
    else:
        submission_reddit = subject_reddit

    srs = SubRedditStats(subject_reddit, options.site, options.verbose,
                         options.distinguished)
    if options.prev:
        srs.prev_stat(options.prev)
    if options.top:
        found = srs.fetch_top_submissions(options.top, options.no_self,
                                          options.no_link)
    else:
        since_last = not options.include_prev
        found = srs.fetch_recent_submissions(max_duration=options.days,
                                             after=options.after,
                                             exclude_self=options.no_self,
                                             exclude_link=options.no_link,
                                             since_last=since_last)
    if not found:
        print('No submissions were found.')
        return 1
    srs.process_submitters()
    if options.commenters > 0:
        srs.process_commenters()
    if options.output:
        srs.save_csv(options.output)
    srs.publish_results(submission_reddit, options.submitters,
                        options.commenters, 5, 5, options.top, options.debug)
Example #22
def check_for_updates(options):
    """Check for package updates."""
    if not options.disable_update_check:  # Check for updates
        update_check('prawtools', __version__)
Example #23
def test_update_check__unsuccessful(mock_get, capsys):
    mock_get.side_effect = requests.exceptions.RequestException
    update_check(PACKAGE, "0.0.1", bypass_cache=True)
    assert "" == capsys.readouterr().err
Example #24
def test_update_check__successful__has_update(mock_get, capsys):
    mock_response(mock_get.return_value)
    update_check(PACKAGE, "0.0.1", bypass_cache=True)
    assert ("Version 0.0.1 of praw is outdated. Version 5.0.0 is available.\n"
            == capsys.readouterr().err)
Example #25
def test_update_check__successful__has_no_update(mock_get, capsys):
    mock_response(mock_get.return_value, "0.0.2")
    update_check(PACKAGE, "0.0.2", bypass_cache=True)
    assert "" == capsys.readouterr().err
Example #26
    def __init__(self, generations=100, population_size=100, offspring_size=None,
                 mutation_rate=0.9, crossover_rate=0.1,
                 scoring=None, cv=5, subsample=1.0, n_jobs=1,
                 max_time_mins=None, max_eval_time_mins=5,
                 random_state=None, config_dict=None, warm_start=False,
                 verbosity=0, disable_update_check=False):
        """Set up the genetic programming algorithm for pipeline optimization.

        Parameters
        ----------
        generations: int, optional (default: 100)
            Number of iterations to run the pipeline optimization process.
            Generally, TPOT will work better when you give it more generations (and
            therefore time) to optimize the pipeline. TPOT will evaluate
            POPULATION_SIZE + GENERATIONS x OFFSPRING_SIZE pipelines in total.
        population_size: int, optional (default: 100)
            Number of individuals to retain in the GP population every generation.
            Generally, TPOT will work better when you give it more individuals
            (and therefore time) to optimize the pipeline. TPOT will evaluate
            POPULATION_SIZE + GENERATIONS x OFFSPRING_SIZE pipelines in total.
        offspring_size: int, optional (default: None)
            Number of offspring to produce in each GP generation.
            By default, offspring_size = population_size.
        mutation_rate: float, optional (default: 0.9)
            Mutation rate for the genetic programming algorithm in the range [0.0, 1.0].
            This parameter tells the GP algorithm how many pipelines to apply random
            changes to every generation. We recommend using the default parameter unless
            you understand how the mutation rate affects GP algorithms.
        crossover_rate: float, optional (default: 0.1)
            Crossover rate for the genetic programming algorithm in the range [0.0, 1.0].
            This parameter tells the genetic programming algorithm how many pipelines to
            "breed" every generation. We recommend using the default parameter unless you
            understand how the crossover rate affects GP algorithms.
        scoring: string or callable, optional
            Function used to evaluate the quality of a given pipeline for the
            problem. By default, accuracy is used for classification problems and
            mean squared error (MSE) for regression problems.

            Offers the same options as sklearn.model_selection.cross_val_score as well as
            a built-in score 'balanced_accuracy'. Classification metrics:

            ['accuracy', 'adjusted_rand_score', 'average_precision', 'balanced_accuracy',
            'f1', 'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted',
            'precision', 'precision_macro', 'precision_micro', 'precision_samples',
            'precision_weighted', 'recall', 'recall_macro', 'recall_micro',
            'recall_samples', 'recall_weighted', 'roc_auc']

            Regression metrics:

            ['neg_median_absolute_error', 'neg_mean_absolute_error',
            'neg_mean_squared_error', 'r2']

            If you would like to use a custom scoring function, you can pass a callable
            function to this parameter with the signature scorer(y_true, y_pred).
            See the section on scoring functions in the documentation for more details.

            TPOT assumes that any custom scoring function with "error" or "loss" in the
            name is meant to be minimized, whereas any other functions will be maximized.
        cv: int or cross-validation generator, optional (default: 5)
            If cv is a number, then it is the number of folds to evaluate each
            pipeline over in k-fold cross-validation during the TPOT optimization
            process. If it is an object, then it is used directly as a
            cross-validation generator.
        subsample: float, optional (default: 1.0)
            Subsample ratio of the training instance. Setting it to 0.5 means that TPOT
            randomly collects half of training samples for pipeline optimization process.
        n_jobs: int, optional (default: 1)
            Number of CPUs for evaluating pipelines in parallel during the TPOT
            optimization process. Assigning this to -1 will use as many cores as available
            on the computer.
        max_time_mins: int, optional (default: None)
            How many minutes TPOT has to optimize the pipeline.
            If provided, this setting will override the "generations" parameter and allow
            TPOT to run until it runs out of time.
        max_eval_time_mins: int, optional (default: 5)
            How many minutes TPOT has to optimize a single pipeline.
            Setting this parameter to higher values will allow TPOT to explore more
            complex pipelines, but will also allow TPOT to run longer.
        random_state: int, optional (default: None)
            Random number generator seed for TPOT. Use this parameter to make sure
            that TPOT will give you the same results each time you run it against the
            same data set with that seed.
        config_dict: a Python dictionary or string, optional (default: None)
            Python dictionary:
                A dictionary customizing the operators and parameters that
                TPOT uses in the optimization process.
                For examples, see config_regressor.py and config_classifier.py
            Path for configuration file:
                A path to a configuration file for customizing the operators and parameters that
                TPOT uses in the optimization process.
                For examples, see config_regressor.py and config_classifier.py
            String 'TPOT light':
                TPOT uses a light version of operator configuration dictionary instead of
                the default one.
            String 'TPOT MDR':
                TPOT uses a list of TPOT-MDR operator configuration dictionary instead of
                the default one.
        warm_start: bool, optional (default: False)
            Flag indicating whether the TPOT instance will reuse the population from
            previous calls to fit().
        verbosity: int, optional (default: 0)
            How much information TPOT communicates while it's running.
            0 = none, 1 = minimal, 2 = high, 3 = all.
            A setting of 2 or higher will add a progress bar during the optimization procedure.
        disable_update_check: bool, optional (default: False)
            Flag indicating whether the TPOT version checker should be disabled.

        Returns
        -------
        None

        """
        if self.__class__.__name__ == 'TPOTBase':
            raise RuntimeError('Do not instantiate the TPOTBase class directly; use TPOTRegressor or TPOTClassifier instead.')

        # Prompt the user if their version is out of date
        self.disable_update_check = disable_update_check
        if not self.disable_update_check:
            update_check('tpot', __version__)

        self._pareto_front = None
        self._optimized_pipeline = None
        self.fitted_pipeline_ = None
        self._fitted_imputer = None
        self._pop = None
        self.warm_start = warm_start
        self.population_size = population_size
        self.generations = generations
        self.max_time_mins = max_time_mins
        self.max_eval_time_mins = max_eval_time_mins

        # Set offspring_size equal to population_size by default
        if offspring_size:
            self.offspring_size = offspring_size
        else:
            self.offspring_size = population_size

        self._setup_config(config_dict)

        self.operators = []
        self.arguments = []
        for key in sorted(self.config_dict.keys()):
            op_class, arg_types = TPOTOperatorClassFactory(
                key,
                self.config_dict[key],
                BaseClass=Operator,
                ArgBaseClass=ARGType
            )
            if op_class:
                self.operators.append(op_class)
                self.arguments += arg_types

        # Schedule TPOT to run for many generations if the user specifies a
        # run-time limit; TPOT will automatically interrupt itself when the
        # timer runs out.
        if max_time_mins is not None:
            self.generations = 1000000

        self.mutation_rate = mutation_rate
        self.crossover_rate = crossover_rate

        if self.mutation_rate + self.crossover_rate > 1:
            raise ValueError(
                'The sum of the crossover and mutation probabilities must be <= 1.0.'
            )

        self.verbosity = verbosity
        self.operators_context = {
            'make_pipeline': make_pipeline,
            'make_union': make_union,
            'StackingEstimator': StackingEstimator,
            'FunctionTransformer': FunctionTransformer,
            'copy': copy
        }
        self._pbar = None

        # Dictionary of individuals that have already been evaluated in previous
        # generations
        self.evaluated_individuals_ = {}
        self.random_state = random_state

        # If the user passed a custom scoring function, store it in the sklearn
        # SCORERS dictionary
        if scoring:
            if hasattr(scoring, '__call__'):
                scoring_name = scoring.__name__
                greater_is_better = 'loss' not in scoring_name and 'error' not in scoring_name
                SCORERS[scoring_name] = make_scorer(scoring, greater_is_better=greater_is_better)
                self.scoring_function = scoring_name
            else:
                if scoring not in SCORERS:
                    raise ValueError(
                        'The scoring function {} is not available. Please '
                        'choose a valid scoring function from the TPOT '
                        'documentation.'.format(scoring)
                    )
                self.scoring_function = scoring

        self.cv = cv
        self.subsample = subsample
        if self.subsample <= 0.0 or self.subsample > 1.0:
            raise ValueError(
                'The subsample ratio of the training instance must be in the range (0.0, 1.0].'
            )
        # On Windows, warn about parallel runs: Ctrl+C cannot cleanly interrupt the optimization when n_jobs != 1
        if sys.platform.startswith('win') and n_jobs != 1:
            print(
                'Warning: Although parallelization is currently supported in '
                'TPOT for Windows, pressing Ctrl+C will freeze the optimization '
                'process without saving the best pipeline! Thus, please DO NOT '
                'press Ctrl+C during the optimization process if n_jobs is not '
                'equal to 1. For a quick test on Windows, please set n_jobs to 1 '
                'so that the best pipeline found so far can be saved if the '
                'optimization is interrupted via Ctrl+C.'
            )
        if n_jobs == -1:
            self.n_jobs = cpu_count()
        else:
            self.n_jobs = n_jobs

        self._setup_pset()
        self._setup_toolbox()
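
The constructor above is shared by TPOTClassifier and TPOTRegressor. A usage sketch on a small built-in dataset, passing disable_update_check=True to skip the version check that this page documents (the parameter values are only illustrative):

from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from tpot import TPOTClassifier

digits = load_digits()
X_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target,
                                                     random_state=42)

tpot = TPOTClassifier(generations=2, population_size=10, verbosity=2,
                      random_state=42, disable_update_check=True)
tpot.fit(X_train, y_train)
print(tpot.score(X_test, y_test))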
Example #27
    def __init__(self, population_size=100, generations=100,
                 mutation_rate=0.9, crossover_rate=0.05,
                 random_state=None, verbosity=0,
                 scoring_function=None, num_cv_folds=3,
                 disable_update_check=False):
        """Sets up the genetic programming algorithm for pipeline optimization.

        Parameters
        ----------
        population_size: int (default: 100)
            The number of pipelines in the genetic algorithm population. Must
            be > 0. The more pipelines in the population, the slower TPOT will
            run, but it's also more likely to find better pipelines.
        generations: int (default: 100)
            The number of generations to run pipeline optimization for. Must
            be > 0. The more generations you give TPOT to run, the longer it
            takes, but it's also more likely to find better pipelines.
        mutation_rate: float (default: 0.9)
            The mutation rate for the genetic programming algorithm in the range
            [0.0, 1.0]. This tells the genetic programming algorithm how many
            pipelines to apply random changes to every generation. We don't
            recommend that you tweak this parameter unless you know what you're
            doing.
        crossover_rate: float (default: 0.05)
            The crossover rate for the genetic programming algorithm in the
            range [0.0, 1.0]. This tells the genetic programming algorithm how
            many pipelines to "breed" every generation. We don't recommend that
            you tweak this parameter unless you know what you're doing.
        random_state: int (default: 0)
            The random number generator seed for TPOT. Use this to make sure
            that TPOT will give you the same results each time you run it
            against the same data set with that seed.
        verbosity: int (default: 0)
            How much information TPOT communicates while it's running.
            0 = none, 1 = minimal, 2 = all
        scoring_function: str (default: balanced accuracy)
            Function used to evaluate the goodness of a given pipeline for the
            classification problem. By default, balanced class accuracy is used.
            TPOT assumes that this scoring function should be maximized, i.e.,
            higher is better.

            Offers the same options as sklearn.cross_validation.cross_val_score:

            ['accuracy', 'adjusted_rand_score', 'average_precision', 'f1', 'f1_macro',
            'f1_micro', 'f1_samples', 'f1_weighted', 'log_loss', 'precision', 'precision_macro',
            'precision_micro', 'precision_samples', 'precision_weighted', 'r2', 'recall',
            'recall_macro', 'recall_micro', 'recall_samples', 'recall_weighted', 'roc_auc']
        num_cv_folds: int (default: 3)
            The number of folds to evaluate each pipeline over in k-fold cross-validation
            during the TPOT pipeline optimization process
        disable_update_check: bool (default: False)
            Flag indicating whether the TPOT version checker should be disabled.

        Returns
        -------
        None

        """
        # Save params to be recalled later by get_params()
        self.params = locals()  # Must be before any local variable definitions
        self.params.pop('self')

        # Prompt the user if their version is out of date
        if not disable_update_check:
            update_check('tpot', __version__)

        self.hof = None
        self._optimized_pipeline = None
        self._fitted_pipeline = None
        self.population_size = population_size
        self.generations = generations
        self.mutation_rate = mutation_rate
        self.crossover_rate = crossover_rate
        self.verbosity = verbosity
        self.operators_context = {
            'make_pipeline': make_pipeline,
            'make_union': make_union,
            'VotingClassifier': VotingClassifier,
            'FunctionTransformer': FunctionTransformer
        }

        self.pbar = None
        self.gp_generation = 0
        self.random_state = random_state

        if scoring_function is None:
            self.scoring_function = self._balanced_accuracy
        else:
            self.scoring_function = scoring_function

        self.num_cv_folds = num_cv_folds

        self._setup_pset()
        self._setup_toolbox()
Example #28
def main():
    """Provide the entry point into the reddit_alert program."""
    usage = 'Usage: %prog [options] KEYWORD...'
    parser = arg_parser(usage=usage)
    parser.add_option('-s',
                      '--subreddit',
                      action='append',
                      help=('When at least one `-s` option is provided '
                            '(multiple can be) only alert for comments in the '
                            'indicated subreddit(s).'))
    parser.add_option('-I',
                      '--ignore-user',
                      action='append',
                      metavar='USER',
                      help=('Ignore comments from the provided user. Can be '
                            'supplied multiple times.'))
    parser.add_option('-m',
                      '--message',
                      metavar='USER',
                      help=('When set, send a reddit message to USER with the '
                            'alert. Requires the alert script to login.'))
    options, args = parser.parse_args()
    if not args:
        parser.error('At least one KEYWORD must be provided.')

    # Create the reddit session, and login if necessary
    session = praw.Reddit('reddit_alert (prawtools {0})'.format(__version__),
                          site_name=options.site,
                          disable_update_check=True)
    if options.message:
        session.login(options.user, options.pswd)
        msg_to = session.get_redditor(options.message)

    # Check for updates
    if not options.disable_update_check:
        update_check('prawtools', __version__)

    # Build regex
    args = [x.lower() for x in args]
    reg_prefix = r'(?:^|[^a-z])'  # A non-letter (or the start of the string) must precede
    reg_suffix = r'(?:$|[^a-z])'  # A non-letter (or the end of the string) must follow
    regex = re.compile(
        r'{0}({1}){2}'.format(reg_prefix, '|'.join(args), reg_suffix),
        re.IGNORECASE)

    # Determine subreddit or multireddit
    if options.subreddit:
        subreddit = '+'.join(sorted(options.subreddit))
    else:
        subreddit = 'all'

    print('Alerting on:')
    for item in sorted(args):
        print(' * {0}'.format(item))
    print('using the comment stream: http://www.reddit.com/r/{0}/comments'.
          format(subreddit))

    # Build ignore set
    if options.ignore_user:
        ignore_users = set(x.lower() for x in options.ignore_user)
    else:
        ignore_users = set()

    try:
        for comment in praw.helpers.comment_stream(session,
                                                   subreddit,
                                                   verbosity=options.verbose):
            if comment.author and comment.author.name.lower() in ignore_users:
                continue
            match = regex.search(comment.body)
            if match:
                keyword = match.group(1).lower()
                url = quick_url(comment)
                print('{0}: {1}'.format(keyword, url))
                if options.message:
                    msg_to.send_message(
                        'Reddit Alert: {0}'.format(keyword),
                        '{0}\n\nby /u/{1}\n\n---\n\n{2}'.format(
                            url, comment.author, comment.body))
    except KeyboardInterrupt:
        sys.stderr.write('\n')
        print('Goodbye!\n')
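
As a standalone sketch of the keyword regex built above, the following shows that keywords only match when they are not embedded inside a longer word; the keyword list and sample strings are illustrative.

import re

keywords = ['python', 'praw']
reg_prefix = r'(?:^|[^a-z])'  # start of string or a non-letter before the keyword
reg_suffix = r'(?:$|[^a-z])'  # end of string or a non-letter after the keyword
regex = re.compile(r'{0}({1}){2}'.format(reg_prefix, '|'.join(keywords), reg_suffix),
                   re.IGNORECASE)

print(regex.search('I love Python!').group(1))   # 'Python'
print(regex.search('pythonic code') is None)     # True: embedded matches are rejected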
Example #29
# -*- coding: utf-8 -*-

'''
Copyright 2016 Randal S. Olson

This file is part of the TPOT library.

The TPOT library is free software: you can redistribute it and/or
modify it under the terms of the GNU General Public License as published by the
Free Software Foundation, either version 3 of the License, or (at your option)
any later version.

The TPOT library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
You should have received a copy of the GNU General Public License along with
the TPOT library. If not, see http://www.gnu.org/licenses/.
'''

from ._version import __version__
from .tpot import TPOT, main
from update_checker import update_check

# Prompt the user if their version is out of date
update_check('tpot', __version__)
Example #30
    def __init__(self,
                 population_size=100,
                 generations=100,
                 mutation_rate=0.9,
                 crossover_rate=0.05,
                 scoring=None,
                 num_cv_folds=3,
                 max_time_mins=None,
                 max_eval_time_mins=5,
                 random_state=None,
                 verbosity=0,
                 disable_update_check=False):
        """Sets up the genetic programming algorithm for pipeline optimization.

        Parameters
        ----------
        population_size: int (default: 100)
            The number of pipelines in the genetic algorithm population. Must
            be > 0. The more pipelines in the population, the slower TPOT will
            run, but it's also more likely to find better pipelines.
        generations: int (default: 100)
            The number of generations to run pipeline optimization for. Must
            be > 0. The more generations you give TPOT to run, the longer it
            takes, but it's also more likely to find better pipelines.
        mutation_rate: float (default: 0.9)
            The mutation rate for the genetic programming algorithm in the range
            [0.0, 1.0]. This tells the genetic programming algorithm how many
            pipelines to apply random changes to every generation. We don't
            recommend that you tweak this parameter unless you know what you're
            doing.
        crossover_rate: float (default: 0.05)
            The crossover rate for the genetic programming algorithm in the
            range [0.0, 1.0]. This tells the genetic programming algorithm how
            many pipelines to "breed" every generation. We don't recommend that
            you tweak this parameter unless you know what you're doing.
        scoring: function or str
            Function used to evaluate the quality of a given pipeline for the
            problem. By default, balanced class accuracy is used for
            classification problems, mean squared error for regression problems.
            TPOT assumes that this scoring function should be maximized, i.e.,
            higher is better.

            Offers the same options as sklearn.model_selection.cross_val_score:

            ['accuracy', 'adjusted_rand_score', 'average_precision', 'f1',
            'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted',
            'precision', 'precision_macro', 'precision_micro', 'precision_samples',
            'precision_weighted', 'r2', 'recall', 'recall_macro', 'recall_micro',
            'recall_samples', 'recall_weighted', 'roc_auc']
        num_cv_folds: int (default: 3)
            The number of folds to evaluate each pipeline over in k-fold
            cross-validation during the TPOT pipeline optimization process
        max_time_mins: int (default: None)
            How many minutes TPOT has to optimize the pipeline. If not None,
            this setting will override the `generations` parameter.
        max_eval_time_mins: int (default: 5)
            How many minutes TPOT has to optimize a single pipeline.
            Setting this parameter to higher values will allow TPOT to explore more complex
            pipelines but will also allow TPOT to run longer.
        random_state: int (default: None)
            The random number generator seed for TPOT. Use this to make sure
            that TPOT will give you the same results each time you run it
            against the same data set with that seed.
        verbosity: int (default: 0)
            How much information TPOT communicates while it's running.
            0 = none, 1 = minimal, 2 = all
        disable_update_check: bool (default: False)
            Flag indicating whether the TPOT version checker should be disabled.

        Returns
        -------
        None

        """
        if self.__class__.__name__ == 'TPOTBase':
            raise RuntimeError(
                'Do not instantiate the TPOTBase class directly; '
                'use TPOTRegressor or TPOTClassifier instead.')

        # Prompt the user if their version is out of date
        self.disable_update_check = disable_update_check
        if not self.disable_update_check:
            update_check('tpot', __version__)

        self._hof = None
        self._optimized_pipeline = None
        self._fitted_pipeline = None
        self.population_size = population_size
        self.generations = generations
        self.max_time_mins = max_time_mins
        self.max_eval_time_mins = max_eval_time_mins

        # Schedule TPOT to run for a very long time if the user specifies a run-time
        # limit; TPOT will automatically interrupt itself when the timer runs out
        if max_time_mins is not None:
            self.generations = 1000000

        self.mutation_rate = mutation_rate
        self.crossover_rate = crossover_rate
        self.verbosity = verbosity
        self.operators_context = {
            'make_pipeline': make_pipeline,
            'make_union': make_union,
            'VotingClassifier': VotingClassifier,
            'FunctionTransformer': FunctionTransformer
        }

        self._pbar = None
        self._gp_generation = 0

        self.random_state = random_state

        # If the user passed a custom scoring function, store it in the sklearn SCORERS dictionary
        if scoring:
            if hasattr(scoring, '__call__'):
                scoring_name = scoring.__name__

                if 'loss' in scoring_name or 'error' in scoring_name:
                    greater_is_better = False
                else:
                    greater_is_better = True

                SCORERS[scoring_name] = make_scorer(
                    scoring, greater_is_better=greater_is_better)
                self.scoring_function = scoring_name
            else:
                self.scoring_function = scoring

        self.num_cv_folds = num_cv_folds

        self._setup_pset()
        self._setup_toolbox()
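
A small sketch of the custom-scoring path above, mirroring what the constructor does internally when it receives a callable. It assumes the same SCORERS dictionary and make_scorer import that the snippet relies on (older scikit-learn exposed both under sklearn.metrics); the metric itself is a hypothetical example.

from sklearn.metrics import SCORERS, make_scorer

def my_absolute_error(y_true, y_pred):
    """Hypothetical metric; 'error' in the name marks it as lower-is-better."""
    return sum(abs(t - p) for t, p in zip(y_true, y_pred)) / len(y_true)

scoring_name = my_absolute_error.__name__
greater_is_better = not ('loss' in scoring_name or 'error' in scoring_name)
SCORERS[scoring_name] = make_scorer(my_absolute_error,
                                    greater_is_better=greater_is_better)
# The TPOT instance would then be constructed with scoring=my_absolute_error.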
Example #31
    def __init__(self,
                 generations=100,
                 population_size=100,
                 offspring_size=None,
                 mutation_rate=0.9,
                 crossover_rate=0.1,
                 scoring=None,
                 cv=5,
                 n_jobs=1,
                 max_time_mins=None,
                 max_eval_time_mins=5,
                 random_state=None,
                 config_dict=None,
                 warm_start=False,
                 verbosity=0,
                 disable_update_check=False):
        """Sets up the genetic programming algorithm for pipeline optimization.

        Parameters
        ----------
        generations: int (default: 100)
            Number of iterations to run the pipeline optimization process.
            Generally, TPOT will work better when you give it more generations (and
            therefore time) to optimize the pipeline. TPOT will evaluate
            POPULATION_SIZE + GENERATIONS x OFFSPRING_SIZE pipelines in total.
        population_size: int (default: 100)
            Number of individuals to retain in the GP population every generation.
            Generally, TPOT will work better when you give it more individuals
            (and therefore time) to optimize the pipeline. TPOT will evaluate
            POPULATION_SIZE + GENERATIONS x OFFSPRING_SIZE pipelines in total.
        offspring_size: int (default: None)
            Number of offspring to produce in each GP generation.
            By default, offspring_size = population_size.
        mutation_rate: float (default: 0.9)
            Mutation rate for the genetic programming algorithm in the range [0.0, 1.0].
            This parameter tells the GP algorithm how many pipelines to apply random
            changes to every generation. We recommend using the default parameter unless
            you understand how the mutation rate affects GP algorithms.
        crossover_rate: float (default: 0.1)
            Crossover rate for the genetic programming algorithm in the range [0.0, 1.0].
            This parameter tells the genetic programming algorithm how many pipelines to
            "breed" every generation. We recommend using the default parameter unless you
            understand how the crossover rate affects GP algorithms.
        scoring: function or str
            Function used to evaluate the quality of a given pipeline for the
            problem. By default, accuracy is used for classification problems and
            mean squared error (mse) for regression problems.
            TPOT assumes that this scoring function should be maximized, i.e.,
            higher is better.

            Offers the same options as sklearn.model_selection.cross_val_score as well as
            a built-in score "balanced_accuracy":

            ['accuracy', 'adjusted_rand_score', 'average_precision', 'balanced_accuracy',
            'f1', 'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted',
            'precision', 'precision_macro', 'precision_micro', 'precision_samples',
            'precision_weighted', 'r2', 'recall', 'recall_macro', 'recall_micro',
            'recall_samples', 'recall_weighted', 'roc_auc']
        cv: int (default: 5)
            Number of folds to evaluate each pipeline over in k-fold cross-validation
            during the TPOT optimization process.
        n_jobs: int (default: 1)
            Number of CPUs for evaluating pipelines in parallel during the TPOT
            optimization process. Assigning this to -1 will use as many cores as available
            on the computer.
        max_time_mins: int (default: None)
            How many minutes TPOT has to optimize the pipeline.
            If provided, this setting will override the "generations" parameter and allow
            TPOT to run until it runs out of time.
        max_eval_time_mins: int (default: 5)
            How many minutes TPOT has to optimize a single pipeline.
            Setting this parameter to higher values will allow TPOT to explore more
            complex pipelines, but will also allow TPOT to run longer.
        random_state: int (default: None)
            Random number generator seed for TPOT. Use this to make sure
            that TPOT will give you the same results each time you run it
            against the same data set with that seed.
        config_dict: string (default: None)
            Path for configuration file:
                A path to a configuration file for customizing the operators and parameters that
                TPOT uses in the optimization process.
                For examples, see config_regressor.py and config_classifier.py
            String 'TPOT light':
                TPOT uses a light version of the operator configuration dictionary
                instead of the default one.
            String 'TPOT MDR':
                TPOT uses the TPOT-MDR operator configuration dictionary instead of
                the default one.
        warm_start: bool (default: False)
            Flag indicating whether the TPOT instance will reuse the population from
            previous calls to fit().
        verbosity: int (default: 0)
            How much information TPOT communicates while it's running.
            0 = none, 1 = minimal, 2 = high, 3 = all.
            A setting of 2 or higher will add a progress bar during the optimization procedure.
        disable_update_check: bool (default: False)
            Flag indicating whether the TPOT version checker should be disabled.

        Returns
        -------
        None

        """
        if self.__class__.__name__ == 'TPOTBase':
            raise RuntimeError(
                'Do not instantiate the TPOTBase class directly; use TPOTRegressor or TPOTClassifier instead.'
            )

        # Prompt the user if their version is out of date
        self.disable_update_check = disable_update_check
        if not self.disable_update_check:
            update_check('tpot', __version__)

        self._pareto_front = None
        self._optimized_pipeline = None
        self._fitted_pipeline = None
        self._pop = None
        self.warm_start = warm_start
        self.population_size = population_size
        self.generations = generations
        self.max_time_mins = max_time_mins
        self.max_eval_time_mins = max_eval_time_mins

        # Set offspring_size equal to population_size by default
        if offspring_size:
            self.offspring_size = offspring_size
        else:
            self.offspring_size = population_size

        if config_dict:
            if config_dict == 'TPOT light':
                if self.classification:
                    self.config_dict = classifier_config_dict_light
                else:
                    self.config_dict = regressor_config_dict_light
            elif config_dict == 'TPOT MDR':
                if self.classification:
                    self.config_dict = tpot_mdr_classifier_config_dict
                else:
                    raise TypeError(
                        'The TPOT MDR operator configuration file does not currently '
                        'work with TPOTRegressor. Please use TPOTClassifier instead.'
                    )
            else:
                try:
                    with open(config_dict, 'r') as input_file:
                        file_string = input_file.read()
                    # Evaluate only the dictionary literal found between the
                    # first '{' and the last '}' in the configuration file
                    operator_dict = eval(file_string[file_string.find('{'):(
                        file_string.rfind('}') + 1)])
                    self.config_dict = operator_dict
                except Exception:
                    raise TypeError(
                        'The operator configuration file is in a bad format or not available. '
                        'Please check the configuration file before running TPOT.'
                    )
        else:
            self.config_dict = self.default_config_dict

        self.operators = []
        self.arguments = []
        for key in sorted(self.config_dict.keys()):
            op_class, arg_types = TPOTOperatorClassFactory(
                key,
                self.config_dict[key],
                BaseClass=Operator,
                ArgBaseClass=ARGType)
            if op_class:
                self.operators.append(op_class)
                self.arguments += arg_types

        # Schedule TPOT to run for many generations if the user specifies a run-time limit
        # TPOT will automatically interrupt itself when the timer runs out
        if max_time_mins is not None:
            self.generations = 1000000

        self.mutation_rate = mutation_rate
        self.crossover_rate = crossover_rate

        if self.mutation_rate + self.crossover_rate > 1:
            raise ValueError(
                'The sum of the crossover and mutation probabilities must be <= 1.0.'
            )

        self.verbosity = verbosity
        self.operators_context = {
            'make_pipeline': make_pipeline,
            'make_union': make_union,
            'VotingClassifier': VotingClassifier,
            'FunctionTransformer': FunctionTransformer,
            'copy': copy
        }

        self._pbar = None

        # Dictionary of individuals that have already been evaluated in previous generations
        self._evaluated_individuals = {}

        self.random_state = random_state

        # If the user passed a custom scoring function, store it in the sklearn SCORERS dictionary
        if scoring:
            if hasattr(scoring, '__call__'):
                scoring_name = scoring.__name__

                if 'loss' in scoring_name or 'error' in scoring_name:
                    greater_is_better = False
                else:
                    greater_is_better = True

                SCORERS[scoring_name] = make_scorer(
                    scoring, greater_is_better=greater_is_better)
                self.scoring_function = scoring_name
            else:
                if scoring not in SCORERS:
                    raise ValueError(
                        'The scoring function {} is not available. '
                        'Please choose a valid scoring function from the TPOT '
                        'documentation.'.format(scoring))
                self.scoring_function = scoring

        self.cv = cv
        # Warn Windows users that Ctrl+C will not save the best pipeline when
        # pipelines are being evaluated in parallel (n_jobs != 1)
        if sys.platform.startswith('win') and n_jobs != 1:
            print(
                'Warning: Although parallelization is currently supported in TPOT for Windows, '
                'pressing Ctrl+C will freeze the optimization process without saving the best pipeline! '
                'Thus, please DO NOT press Ctrl+C during the optimization process if n_jobs is not equal to 1. '
                'For a quick test on Windows, please set n_jobs to 1 so that the best pipeline can be saved '
                'in the middle of the optimization process via Ctrl+C.')
        if n_jobs == -1:
            self.n_jobs = cpu_count()
        else:
            self.n_jobs = n_jobs

        self._setup_pset()
        self._setup_toolbox()
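
For the file-path branch of config_dict above, here is a sketch of what such a configuration file might contain; the constructor only evaluates the text between the first '{' and the last '}', so any file holding a single dictionary literal of operator names mapped to parameter grids should work. The file name, operator names, and parameter grids below are illustrative, not part of the original.

# custom_config.py -- an illustrative operator configuration file; the constructor
# above only evaluates the dictionary literal between the first '{' and the last '}'.
custom_operator_dict = {
    'sklearn.naive_bayes.GaussianNB': {
    },
    'sklearn.tree.DecisionTreeClassifier': {
        'criterion': ['gini', 'entropy'],
        'max_depth': range(1, 11),
        'min_samples_split': range(2, 21),
    },
}

With such a file on disk, config_dict='custom_config.py' could then be passed to TPOTClassifier or TPOTRegressor.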
Example #32
def main():
    """Provide the entry point in the the modutils command."""
    mod_choices = ("banned", "contributors", "moderators")
    mod_choices_dsp = ", ".join(["`%s`" % x for x in mod_choices])
    msg = {
        "add": ("Add users to one of the following categories: %s" % mod_choices_dsp),
        "clear": "Remove users who have no flair set.",
        "css": "Ignore the CSS field when synchronizing flair.",
        "edit": "When adding flair templates, mark them as editable.",
        "file": "The file containing contents for --message",
        "flair": "List flair for the subreddit.",
        "flair_stats": "Display the number of users with each flair.",
        "json": "Output the results as json. Applies to --flair",
        "limit": (
            "The minimum number of users that must have the specified "
            "flair in order to add as a template. default: %default"
        ),
        "list": ("List the users in one of the following categories: " "%s. May be specified more than once.")
        % mod_choices_dsp,
        "msg": (
            "Send message to users of one of the following categories: "
            "%s. Message subject provided via --subject, content provided "
            "via --file or STDIN."
        )
        % mod_choices_dsp,
        "sort": (
            "The order to add flair templates. Available options are "
            "`alpha` to add alphabetically, and `size` to first add "
            "flair that is shared by the most number of users. "
            "default: %default"
        ),
        "static": (
            "Add this template when syncing flair templates. When "
            "syncing text and css use a comma to separate the two."
        ),
        "subject": "The subject of the message to send for --message.",
        "sync": "Synchronize flair templates with current user flair.",
        "text": "Ignore the text field when synchronizing flair.",
    }

    usage = "Usage: %prog [options] SUBREDDIT"
    parser = arg_parser(usage=usage)
    parser.add_option("-a", "--add", help=msg["add"])
    parser.add_option(
        "-l", "--list", action="append", help=msg["list"], choices=mod_choices, metavar="CATEGORY", default=[]
    )
    parser.add_option("-c", "--clear-empty", action="store_true", help=msg["clear"])
    parser.add_option("-F", "--file", help=msg["file"])
    parser.add_option("-f", "--flair", action="store_true", help=msg["flair"])
    parser.add_option("", "--flair-stats", action="store_true", help=msg["flair_stats"])
    parser.add_option("-m", "--message", choices=mod_choices, help=msg["msg"])
    parser.add_option("", "--subject", help=msg["subject"])

    group = OptionGroup(parser, "Format options")
    group.add_option("-j", "--json", action="store_true", help=msg["json"])
    parser.add_option_group(group)

    group = OptionGroup(parser, "Sync options")
    group.add_option("", "--sync", action="store_true", help=msg["sync"])
    group.add_option("-s", "--static", action="append", help=msg["static"])
    group.add_option("", "--editable", action="store_true", help=msg["edit"])
    group.add_option("", "--ignore-css", action="store_true", default=False, help=msg["css"])
    group.add_option("", "--ignore-text", action="store_true", default=False, help=msg["text"])
    group.add_option("", "--limit", type="int", help=msg["limit"], default=2)
    group.add_option("", "--sort", action="store", choices=("alpha", "size"), default="alpha", help=msg["sort"])
    parser.add_option_group(group)

    options, args = parser.parse_args()
    if options.pswd and not options.user:
        parser.error("Must provide --user when providing --pswd.")
    if len(args) == 0:
        parser.error("Must provide subreddit name.")
    if options.message and not options.subject:
        parser.error("Must provide --subject when providing --message.")
    subreddit = args[0]

    if not options.disable_update_check:  # Check for updates
        update_check("prawtools", __version__)

    modutils = ModUtils(subreddit, options.site, options.user, options.pswd, options.verbose)

    if options.add:
        modutils.add_users(options.add)
    if options.clear_empty:
        modutils.clear_empty()
    for category in options.list:
        modutils.output_list(category)
    if options.flair:
        modutils.output_current_flair(as_json=options.json)
    if options.flair_stats:
        modutils.output_flair_stats()
    if options.sync:
        modutils.flair_template_sync(
            editable=options.editable,
            limit=options.limit,
            static=options.static,
            sort=options.sort,
            use_css=not options.ignore_css,
            use_text=not options.ignore_text,
        )
    if options.message:
        modutils.message(options.message, options.subject, options.file)
Example #33
def main():
    # parse the command-line options and arguments
    user, target, options = parse_cmd_line()

    # Check for package updates
    update_check(__name__, __version__)

    # open connection to Reddit
    r = praw.Reddit(user_agent="bot by /u/{0}".format(user),
                    disable_update_check=True)
    r.config.decode_html_entities = True

    # run analysis
    sys.stderr.write("Analyzing {0}\n".format(target))
    sys.stderr.flush()

    target = target[3:]

    if options.is_subreddit:
        processSubreddit(subreddit=r.get_subreddit(target),
                         period=options.period, limit=options.limit,
                         count_word_freqs=options.count_word_freqs,
                         max_threshold=options.max_threshold)
    else:
        processRedditor(redditor=r.get_redditor(target), limit=options.limit,
                        count_word_freqs=options.count_word_freqs,
                        max_threshold=options.max_threshold)

    # build a string containing all the words for the word cloud software
    output = ""

    # open output file to store the output string
    outFileName = target + ".csv"

    if options.is_subreddit:
        outFileName = "subreddit-" + outFileName
    else:
        outFileName = "user-" + outFileName

    outFile = open(outFileName, "w")

    # combine singular and plural forms of words into single count
    for word, count in popularWords.items():
        # e.g.: "picture" and "pictures"
        if word.endswith("s"):
            # if the singular form of the word was used
            singular = word[:-1]
            if popularWords[singular] > 0:

                # combine the count into the most-used form of the word
                if popularWords[singular] > count:
                    popularWords[singular] += popularWords[word]
                    del popularWords[word]
                else:
                    popularWords[word] += popularWords[singular]
                    del popularWords[singular]

        # e.g.: "furry" and "furries"
        if word.endswith("ies"):
            # if the singular form of the word was used
            singular = word[:-3] + "y"
            if popularWords[singular] > 0:
                # combine the count into the most-used form of the word
                if popularWords[singular] > count:
                    popularWords[singular] += popularWords[word]
                    del popularWords[word]
                else:
                    popularWords[word] += popularWords[singular]
                    del popularWords[singular]

    for word in sorted(popularWords, key=popularWords.get, reverse=True):
        # tweak this number depending on the subreddit
        # some subreddits end up having TONS of words and it seems to overflow
        # the Python string buffer
        if popularWords[word] > 5:
            pri = True

            # don't print the word if it's just a number
            if word.isdigit():
                pri = False

            # add as many copies of the word as it was mentioned in the
            # subreddit
            if pri:
                txt = word + ":" + str(popularWords[word]) + "\n"
                txt = txt.encode("UTF-8")
                output += txt
                outFile.write(txt)

    outFile.close()

    # print the series of words for the word cloud software
    # place this text into wordle.net
    if options.verbose:
        print(output)

    # save the raw word counts to a file
    if not options.no_raw_data:
        outFile = open("raw-" + outFileName, "w")
        for word in sorted(allWords, key=allWords.get, reverse=True):
            txt = word + ":" + str(allWords[word]) + "\n"
            txt = txt.encode("UTF-8")
            outFile.write(txt)
        outFile.close()
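
A minimal standalone sketch of the singular/plural merging step above, operating on a plain dict; merge_plurals is a hypothetical helper written for illustration, not part of the original script.

def merge_plurals(word_counts):
    """Fold trailing-'s' and '-ies' plurals into whichever form is more common."""
    counts = dict(word_counts)
    for word in list(counts):
        if word not in counts:
            continue  # already merged away
        if word.endswith('ies'):
            singular = word[:-3] + 'y'
        elif word.endswith('s'):
            singular = word[:-1]
        else:
            continue
        if counts.get(singular, 0) > 0:
            # Combine the count into the most-used form of the word
            keep, drop = ((singular, word) if counts[singular] > counts[word]
                          else (word, singular))
            counts[keep] += counts[drop]
            del counts[drop]
    return counts

print(merge_plurals({'picture': 3, 'pictures': 1, 'furry': 1, 'furries': 4}))
# -> {'picture': 4, 'furries': 5}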
Example #34
def autoclean(input_dataframe, drop_nans=False, copy=False, encoder=None,
              encoder_kwargs=None, ignore_update_check=False):
    """Performs a series of automated data cleaning transformations on the provided data set

    Parameters
    ----------
    input_dataframe: pandas.DataFrame
        Data set to clean
    drop_nans: bool
        Drop all rows that have a NaN in any column (default: False)
    copy: bool
        Make a copy of the data set (default: False)
    encoder: category_encoders transformer
        A valid category_encoders transformer, which is passed an inferred list of categorical columns (default: None, which falls back to LabelEncoder)
    encoder_kwargs: dict
        Keyword arguments to pass to the encoder (default: None)
    ignore_update_check: bool
        Do not check for the latest version of datacleaner

    Returns
    ----------
    output_dataframe: pandas.DataFrame
        Cleaned data set

    """
    global update_checked
    if ignore_update_check:
        update_checked = True

    if not update_checked:
        update_check('datacleaner', __version__)
        update_checked = True

    if copy:
        input_dataframe = input_dataframe.copy()

    if drop_nans:
        input_dataframe.dropna(inplace=True)

    if encoder_kwargs is None:
        encoder_kwargs = {}

    print('columns to clean:')

    for column in input_dataframe.columns.values:
        print(column)
        # Replace NaNs with the median or mode of the column depending on the column type
        try:
            input_dataframe[column].fillna(input_dataframe[column].median(), inplace=True)
            print('median fill is used')
        except TypeError:
            most_frequent = input_dataframe[column].mode()
            # If the mode can't be computed, use the nearest valid value
            # See https://github.com/rhiever/datacleaner/issues/8
            if len(most_frequent) > 0:
                print('mode fill is used')
                input_dataframe[column].fillna(input_dataframe[column].mode()[0], inplace=True)
            else:
                print('bfill and ffill is used')
                input_dataframe[column].fillna(method='bfill', inplace=True)
                input_dataframe[column].fillna(method='ffill', inplace=True)

        # Encode all strings with numerical equivalents
        if str(input_dataframe[column].values.dtype) == 'object':
            if encoder is not None:
                print('encoder set by the user is used')
                column_encoder = encoder(**encoder_kwargs).fit(input_dataframe[column].values)
            else:
                print('default encoding method is used')
                column_encoder = LabelEncoder().fit(input_dataframe[column].values)

            input_dataframe[column] = column_encoder.transform(input_dataframe[column].values)
    print('Done!')
    return input_dataframe
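
A quick usage sketch for the function above, assuming it is importable as datacleaner.autoclean; the toy DataFrame is illustrative.

import pandas as pd
from datacleaner import autoclean  # assumed package-level import

df = pd.DataFrame({'age': [25, None, 40],
                   'city': ['Berlin', 'Paris', None]})

cleaned = autoclean(df, copy=True, ignore_update_check=True)
print(cleaned)
# 'age' NaN is filled with the column median, 'city' NaN with the column mode,
# and 'city' is then label-encoded to integers.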
Example #35
def main():
    # parse the command-line options and arguments
    user, target, options = parse_cmd_line()

    # Check for package updates
    update_check(__name__, __version__)

    # open connection to Reddit
    handler = None

    if options.multiprocess:
        from praw.handlers import MultiprocessHandler
        handler = MultiprocessHandler()

    reddit = praw.Reddit(
        user_agent="/u/{0} reddit analyzer".format(user), handler=handler)

    reddit.config.decode_html_entities = True

    # run analysis
    sys.stderr.write("Analyzing {0}\n".format(target))
    sys.stderr.flush()

    target = target[3:]

    if options.is_subreddit:
        process_subreddit(subreddit=reddit.get_subreddit(target),
                          period=options.period, limit=options.limit,
                          count_word_freqs=options.count_word_freqs,
                          max_threshold=options.max_threshold)
    else:
        process_redditor(redditor=reddit.get_redditor(target), limit=options.limit,
                         count_word_freqs=options.count_word_freqs,
                         max_threshold=options.max_threshold)

    # build a string containing all the words for the word cloud software
    output = ""

    # open output file to store the output string
    out_file_name = "{0}.csv".format(target)

    if options.is_subreddit:
        out_file_name = "subreddit-{0}".format(out_file_name)
    else:
        out_file_name = "user-{0}".format(out_file_name)

    out_file = open(out_file_name, "w")

    # combine singular and plural forms of words into single count
    for word in list(popular_words.keys()):
        count = popular_words[word]

        # e.g.: "picture" and "pictures"
        if word.endswith("s"):
            # if the singular form of the word was used
            singular = word[:-1]
            if popular_words[singular] > 0:

                # combine the count into the most-used form of the word
                if popular_words[singular] > count:
                    popular_words[singular] += popular_words[word]
                    del popular_words[word]
                else:
                    popular_words[word] += popular_words[singular]
                    del popular_words[singular]

        # e.g.: "furry" and "furries"
        if word.endswith("ies"):
            # if the singular form of the word was used
            singular = "{0}y".format(word[:-3])
            if popular_words[singular] > 0:
                # combine the count into the most-used form of the word
                if popular_words[singular] > count:
                    popular_words[singular] += popular_words[word]
                    del popular_words[word]
                else:
                    popular_words[word] += popular_words[singular]
                    del popular_words[singular]

    for word in sorted(popular_words, key=popular_words.get, reverse=True):
        # tweak this number depending on the subreddit
        # some subreddits end up having TONS of words and it seems to overflow
        # the Python string buffer
        if popular_words[word] > 5:
            pri = True

            # don't print the word if it's just a number
            if word.isdigit():
                pri = False

            # add as many copies of the word as it was mentioned in the
            # subreddit
            if pri:
                out_text = str("{0}:{1}\n".format(word, popular_words[word]))
                output += out_text
                out_file.write(out_text)

    out_file.close()

    # print the series of words for the word cloud software
    # place this text into wordle.net
    if options.verbose:
        print(output)

    # save the raw word counts to a file
    if not options.no_raw_data:
        out_file = open("raw-{0}".format(out_file_name), "w")
        for word in sorted(all_words, key=all_words.get, reverse=True):
            out_text = str("{0}:{1}\n".format(word, all_words[word]))
            out_file.write(out_text)
        out_file.close()
Example #36
def main():
    """Provide the entry point in the the modutils command."""
    mod_choices = ('banned', 'contributors', 'moderators')
    mod_choices_dsp = ', '.join(['`%s`' % x for x in mod_choices])
    msg = {
        'add': ('Add users to one of the following categories: %s' %
                mod_choices_dsp),
        'clear': 'Remove users who have no flair set.',
        'css': 'Ignore the CSS field when synchronizing flair.',
        'edit': 'When adding flair templates, mark them as editable.',
        'file': 'The file containing contents for --message',
        'flair': 'List flair for the subreddit.',
        'flair_stats': 'Display the number of users with each flair.',
        'json': 'Output the results as json. Applies to --flair',
        'limit': ('The minimum number of users that must have the specified '
                  'flair in order to add as a template. default: %default'),
        'list': ('List the users in one of the following categories: '
                 '%s. May be specified more than once.') % mod_choices_dsp,
        'msg': ('Send message to users of one of the following categories: '
                '%s. Message subject provided via --subject, content provided '
                'via --file or STDIN.') % mod_choices_dsp,
        'sort': ('The order to add flair templates. Available options are '
                 '`alpha` to add alphabetically, and `size` to first add '
                 'flair that is shared by the most number of users. '
                 'default: %default'),
        'static': ('Add this template when syncing flair templates. When '
                   'syncing text and css use a comma to separate the two.'),
        'subject': 'The subject of the message to send for --message.',
        'sync': 'Synchronize flair templates with current user flair.',
        'text': 'Ignore the text field when synchronizing flair.'}

    usage = 'Usage: %prog [options] SUBREDDIT'
    parser = arg_parser(usage=usage)
    parser.add_option('-a', '--add', help=msg['add'])
    parser.add_option('-l', '--list', action='append', help=msg['list'],
                      choices=mod_choices, metavar='CATEGORY', default=[])
    parser.add_option('-c', '--clear-empty', action='store_true',
                      help=msg['clear'])
    parser.add_option('-F', '--file', help=msg['file'])
    parser.add_option('-f', '--flair', action='store_true', help=msg['flair'])
    parser.add_option('', '--flair-stats', action='store_true',
                      help=msg['flair_stats'])
    parser.add_option('-m', '--message', choices=mod_choices, help=msg['msg'])
    parser.add_option('', '--subject', help=msg['subject'])

    group = OptionGroup(parser, 'Format options')
    group.add_option('-j', '--json', action='store_true', help=msg['json'])
    parser.add_option_group(group)

    group = OptionGroup(parser, 'Sync options')
    group.add_option('', '--sync', action='store_true', help=msg['sync'])
    group.add_option('-s', '--static', action='append', help=msg['static'])
    group.add_option('', '--editable', action='store_true', help=msg['edit'])
    group.add_option('', '--ignore-css', action='store_true',
                     default=False, help=msg['css'])
    group.add_option('', '--ignore-text', action='store_true',
                     default=False, help=msg['text'])
    group.add_option('', '--limit', type='int', help=msg['limit'], default=2)
    group.add_option('', '--sort', action='store', choices=('alpha', 'size'),
                     default='alpha', help=msg['sort'])
    parser.add_option_group(group)

    options, args = parser.parse_args()
    if options.pswd and not options.user:
        parser.error('Must provide --user when providing --pswd.')
    if len(args) == 0:
        parser.error('Must provide subreddit name.')
    if options.message and not options.subject:
        parser.error('Must provide --subject when providing --message.')
    subreddit = args[0]

    if not options.disable_update_check:  # Check for updates
        update_check('prawtools', __version__)

    modutils = ModUtils(subreddit, options.site, options.user, options.pswd,
                        options.verbose)

    if options.add:
        modutils.add_users(options.add)
    if options.clear_empty:
        modutils.clear_empty()
    for category in options.list:
        modutils.output_list(category)
    if options.flair:
        modutils.output_current_flair(as_json=options.json)
    if options.flair_stats:
        modutils.output_flair_stats()
    if options.sync:
        modutils.flair_template_sync(editable=options.editable,
                                     limit=options.limit,
                                     static=options.static, sort=options.sort,
                                     use_css=not options.ignore_css,
                                     use_text=not options.ignore_text)
    if options.message:
        modutils.message(options.message, options.subject, options.file)
Example #37
def xrff2csv(input_filename,
             output_filename=None,
             sep='\t',
             ignore_update_check=False):
    """Converts the provided XRFF file to CSV format

    If `output_filename` is not specified, the function will print the result

    Parameters
    ----------
    input_filename: str
        Name of the XRFF file to convert
    output_filename: str
        Name of the CSV file to output to (default: None)
    sep: str
        String to use as the separator in the CSV file (default: \t)
    ignore_update_check: bool
        Do not check for the latest version of xrff2csv

    Returns
    ----------
    None

    """
    global update_checked
    if ignore_update_check:
        update_checked = True

    if not update_checked:
        update_check('xrff2csv', __version__)
        update_checked = True
    with open(input_filename, 'r') as in_file:
        headers = []
        values = []
        for line in in_file:
            if 'attribute name=' in line:
                headers.append(line.split('"')[1])
            elif '<body>' in line:
                # Beginning of data entries has been encountered, so output the headers
                headers = sep.join(headers)
                if output_filename is not None:
                    with open(output_filename, 'w') as out_file:
                        out_file.write(headers)
                else:
                    sys.stdout.write(headers)
                # No need to store the headers in memory any more
                del headers
            elif '<value>' in line:
                values.append(line.split('>')[1].split('<')[0])
            elif '</instance>' in line:
                # End of data instance reached, so output it
                values = os.linesep + sep.join(values)
                if output_filename is not None:
                    with open(output_filename, 'a') as out_file:
                        out_file.write(values)
                else:
                    sys.stdout.write(values)
                # No need to store this data instance in memory any more
                values = []

    if output_filename is not None:
        with open(output_filename, 'a') as out_file:
            out_file.write(os.linesep)
    else:
        sys.stdout.write(os.linesep)
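
An end-to-end sketch for the converter above. The XRFF fragment is illustrative and only exercises the tags the parser looks for (attribute name=, <body>, <value>, </instance>); file names are hypothetical.

xrff_snippet = '''<dataset>
  <header>
    <attributes>
      <attribute name="sepal_length" type="numeric"/>
      <attribute name="class" type="nominal"/>
    </attributes>
  </header>
  <body>
    <instances>
      <instance>
        <value>5.1</value>
        <value>setosa</value>
      </instance>
    </instances>
  </body>
</dataset>'''

with open('example.xrff', 'w') as f:
    f.write(xrff_snippet)

xrff2csv('example.xrff', output_filename='example.csv', sep=',',
         ignore_update_check=True)
# example.csv now contains:
# sepal_length,class
# 5.1,setosa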
Example #38
def autoclean_cv(training_dataframe, testing_dataframe, drop_nans=False, copy=False,
                 encoder=None, encoder_kwargs=None, ignore_update_check=False):
    """Performs a series of automated data cleaning transformations on the provided training and testing data sets

    Unlike `autoclean()`, this function takes cross-validation into account by learning the data transformations
    from only the training set, then applying those transformations to both the training and testing set.
    By doing so, this function will prevent information leak from the training set into the testing set.

    Parameters
    ----------
    training_dataframe: pandas.DataFrame
        Training data set
    testing_dataframe: pandas.DataFrame
        Testing data set
    drop_nans: bool
        Drop all rows that have a NaN in any column (default: False)
    copy: bool
        Make a copy of the data set (default: False)
    encoder: category_encoders transformer
        The a valid category_encoders transformer which is passed an inferred cols list. Default (None: LabelEncoder)
    encoder_kwargs: category_encoders
        The a valid sklearn transformer to encode categorical features. Default (None)
    ignore_update_check: bool
        Do not check for the latest version of datacleaner

    Returns
    ----------
    output_training_dataframe: pandas.DataFrame
        Cleaned training data set
    output_testing_dataframe: pandas.DataFrame
        Cleaned testing data set

    """
    global update_checked
    if ignore_update_check:
        update_checked = True

    if not update_checked:
        update_check('datacleaner', __version__)
        update_checked = True

    if set(training_dataframe.columns.values) != set(testing_dataframe.columns.values):
        raise ValueError('The training and testing DataFrames do not have the same columns. '
                         'Make sure that you are providing the same columns.')

    if copy:
        training_dataframe = training_dataframe.copy()
        testing_dataframe = testing_dataframe.copy()
    
    if drop_nans:
        training_dataframe.dropna(inplace=True)
        testing_dataframe.dropna(inplace=True)

    if encoder_kwargs is None:
        encoder_kwargs = {}

    for column in training_dataframe.columns.values:
        # Replace NaNs with the median or mode of the column depending on the column type
        try:
            column_median = training_dataframe[column].median()
            training_dataframe[column].fillna(column_median, inplace=True)
            testing_dataframe[column].fillna(column_median, inplace=True)
        except TypeError:
            column_mode = training_dataframe[column].mode()[0]
            training_dataframe[column].fillna(column_mode, inplace=True)
            testing_dataframe[column].fillna(column_mode, inplace=True)

        # Encode all strings with numerical equivalents
        if str(training_dataframe[column].values.dtype) == 'object':
            if encoder is not None:
                column_encoder = encoder(**encoder_kwargs).fit(training_dataframe[column].values)
            else:
                column_encoder = LabelEncoder().fit(training_dataframe[column].values)

            training_dataframe[column] = column_encoder.transform(training_dataframe[column].values)
            testing_dataframe[column] = column_encoder.transform(testing_dataframe[column].values)

    return training_dataframe, testing_dataframe
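
A quick usage sketch for the function above, assuming it is importable as datacleaner.autoclean_cv; the toy DataFrames are illustrative.

import pandas as pd
from datacleaner import autoclean_cv  # assumed package-level import

train = pd.DataFrame({'age': [25, None, 40],
                      'city': ['Berlin', 'Paris', 'Berlin']})
test = pd.DataFrame({'age': [None, 33],
                     'city': ['Paris', None]})

clean_train, clean_test = autoclean_cv(train, test, copy=True,
                                       ignore_update_check=True)
# Medians, modes, and the label encoding are learned from `train` only and
# then applied to both frames, so nothing leaks from `test` into the cleaning.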
Example #39
def check_for_updates(options):
    """Check for package updates."""
    if not options.disable_update_check:  # Check for updates
        update_check('prawtools', __version__)
Example #40
    def __init__(self,
                 population_size=100,
                 generations=100,
                 mutation_rate=0.9,
                 crossover_rate=0.05,
                 random_state=None,
                 verbosity=0,
                 scoring_function=None,
                 num_cv_folds=3,
                 disable_update_check=False):
        """Sets up the genetic programming algorithm for pipeline optimization.

        Parameters
        ----------
        population_size: int (default: 100)
            The number of pipelines in the genetic algorithm population. Must
            be > 0. The more pipelines in the population, the slower TPOT will
            run, but it's also more likely to find better pipelines.
        generations: int (default: 100)
            The number of generations to run pipeline optimization for. Must
            be > 0. The more generations you give TPOT to run, the longer it
            takes, but it's also more likely to find better pipelines.
        mutation_rate: float (default: 0.9)
            The mutation rate for the genetic programming algorithm in the range
            [0.0, 1.0]. This tells the genetic programming algorithm how many
            pipelines to apply random changes to every generation. We don't
            recommend that you tweak this parameter unless you know what you're
            doing.
        crossover_rate: float (default: 0.05)
            The crossover rate for the genetic programming algorithm in the
            range [0.0, 1.0]. This tells the genetic programming algorithm how
            many pipelines to "breed" every generation. We don't recommend that
            you tweak this parameter unless you know what you're doing.
        random_state: int (default: None)
            The random number generator seed for TPOT. Use this to make sure
            that TPOT will give you the same results each time you run it
            against the same data set with that seed.
        verbosity: int (default: 0)
            How much information TPOT communicates while it's running.
            0 = none, 1 = minimal, 2 = all
        scoring_function: str (default: balanced accuracy)
            Function used to evaluate the goodness of a given pipeline for the
            classification problem. By default, balanced class accuracy is used.
            TPOT assumes that this scoring function should be maximized, i.e.,
            higher is better.

            Offers the same options as sklearn.cross_validation.cross_val_score:

            ['accuracy', 'adjusted_rand_score', 'average_precision', 'f1', 'f1_macro',
            'f1_micro', 'f1_samples', 'f1_weighted', 'log_loss', 'precision', 'precision_macro',
            'precision_micro', 'precision_samples', 'precision_weighted', 'r2', 'recall',
            'recall_macro', 'recall_micro', 'recall_samples', 'recall_weighted', 'roc_auc']
        num_cv_folds: int (default: 3)
            The number of folds to evaluate each pipeline over in k-fold cross-validation
            during the TPOT pipeline optimization process
        disable_update_check: bool (default: False)
            Flag indicating whether the TPOT version checker should be disabled.

        Returns
        -------
        None

        """
        # Save params to be recalled later by get_params()
        self.params = locals()  # Must be before any local variable definitions
        self.params.pop('self')

        # Prompt the user if their version is out of date
        if not disable_update_check:
            update_check('tpot', __version__)

        self.hof = None
        self._optimized_pipeline = None
        self._fitted_pipeline = None
        self.population_size = population_size
        self.generations = generations
        self.mutation_rate = mutation_rate
        self.crossover_rate = crossover_rate
        self.verbosity = verbosity
        self.operators_context = {
            'make_pipeline': make_pipeline,
            'make_union': make_union,
            'VotingClassifier': VotingClassifier,
            'FunctionTransformer': FunctionTransformer
        }

        self.pbar = None
        self.gp_generation = 0
        self.random_state = random_state

        if scoring_function is None:
            self.scoring_function = self._balanced_accuracy
        else:
            self.scoring_function = scoring_function

        self.num_cv_folds = num_cv_folds

        self._setup_pset()
        self._setup_toolbox()
Example #41
    def _check_for_update(self):
        if UPDATE_CHECKER_MISSING:
            return
        if not Reddit.update_checked and self.config.check_for_updates:
            update_check(__package__, __version__)
            Reddit.update_checked = True
Example #42
def xrff2csv(input_filename, output_filename=None, sep='\t', ignore_update_check=False):
    """Converts the provided XRFF file to CSV format

    If `output_filename` is not specified, the function will print the result

    Parameters
    ----------
    input_filename: str
        Name of the XRFF file to convert
    output_filename: str
        Name of the CSV file to output to (default: None)
    sep: str
        String to use as the separator in the CSV file (default: \t)
    ignore_update_check: bool
        Do not check for the latest version of xrff2csv

    Returns
    ----------
    None

    """
    global update_checked
    if ignore_update_check:
        update_checked = True

    if not update_checked:
        update_check('xrff2csv', __version__)
        update_checked = True
    with open(input_filename, 'r') as in_file:
        headers = []
        values = []
        for line in in_file:
            if 'attribute name=' in line:
                headers.append(line.split('"')[1])
            elif '<body>' in line:
                # Beginning of data entries has been encountered, so output the headers
                headers = sep.join(headers)
                if output_filename is not None:
                    with open(output_filename, 'w') as out_file:
                        out_file.write(headers)
                else:
                    sys.stdout.write(headers)
                # No need to store the headers in memory any more
                del headers
            elif '<value>' in line:
                values.append(line.split('>')[1].split('<')[0])
            elif '</instance>' in line:
                # End of data instance reached, so output it
                values = os.linesep + sep.join(values)
                if output_filename is not None:
                    with open(output_filename, 'a') as out_file:
                        out_file.write(values)
                else:
                    sys.stdout.write(values)
                # No need to store this data instance in memory any more
                values = []

    if output_filename is not None:
        with open(output_filename, 'a') as out_file:
            out_file.write(os.linesep)
    else:
        sys.stdout.write(os.linesep)
Example #43
def main():
    """Provide the entry point to the subreddit_stats command.

    :returns: 0 on success, 1 otherwise

    """
    parser = arg_parser(usage='usage: %prog [options] [SUBREDDIT]')
    parser.add_option('-s', '--submitters', type='int', default=5,
                      help='Number of top submitters to display '
                      '[default %default]')
    parser.add_option('-c', '--commenters', type='int', default=10,
                      help='Number of top commenters to display '
                      '[default %default]')
    parser.add_option('-a', '--after',
                      help='Submission ID to fetch after')
    parser.add_option('-d', '--days', type='int', default=32,
                      help=('Number of previous days to include submissions '
                            'from. Use 0 for unlimited. Default: %default'))
    parser.add_option('-D', '--debug', action='store_true',
                      help='Enable debugging mode. Does not post stats.')
    parser.add_option('-R', '--submission-reddit',
                      help=('Subreddit to submit to. If not present, '
                            'submits to the subreddit processed'))
    parser.add_option('-t', '--top',
                      help=('Run on top submissions either by day, week, '
                            'month, year, or all'))
    parser.add_option('', '--distinguished', action='store_true',
                      help=('Include distinguished submissions and '
                            'comments (default: False). Note that regular '
                            'comments of distinguished submissions will still '
                            'be included.'))
    parser.add_option('', '--no-self', action='store_true',
                      help=('Do not include self posts (and their comments) in'
                            ' the calculation.'))
    parser.add_option('', '--no-link', action='store_true',
                      help=('Only include self posts (and their comments) in '
                            'the calculation.'))
    parser.add_option('', '--prev',
                      help='Statically provide the URL of previous SRS page.')
    parser.add_option('', '--include-prev', action='store_true',
                      help='Don\'t try to avoid overlap with a previous SRS.')
    parser.add_option('-o', '--output',
                      help='Save result csv to named file.')

    options, args = parser.parse_args()
    if len(args) != 1:
        sys.stdout.write('Enter subreddit name: ')
        sys.stdout.flush()
        subject_reddit = sys.stdin.readline().strip()
        if not subject_reddit:
            parser.error('No subreddit name entered')
    else:
        subject_reddit = args[0]

    if not options.disable_update_check:  # Check for updates
        update_check('prawtools', __version__)

    print('You chose to analyze this subreddit: {0}'.format(subject_reddit))

    if options.no_link and options.no_self:
        parser.error('You are choosing to exclude self posts but also only '
                     'include self posts. Consider checking your arguments.')

    if options.submission_reddit:
        submission_reddit = options.submission_reddit
    else:
        submission_reddit = subject_reddit

    srs = SubRedditStats(subject_reddit, options.site, options.verbose,
                         options.distinguished)
    srs.login(options.user, options.pswd)
    if options.prev:
        srs.prev_stat(options.prev)
    if options.top:
        found = srs.fetch_top_submissions(options.top, options.no_self,
                                          options.no_link)
    else:
        since_last = not options.include_prev
        found = srs.fetch_recent_submissions(max_duration=options.days,
                                             after=options.after,
                                             exclude_self=options.no_self,
                                             exclude_link=options.no_link,
                                             since_last=since_last)
    if not found:
        print('No submissions were found.')
        return 1
    srs.process_submitters()
    if options.commenters > 0:
        srs.process_commenters()
    if options.output:
        srs.save_csv(options.output)
    srs.publish_results(submission_reddit, options.submitters,
                        options.commenters, 5, 5, options.top, options.debug)
    return 0  # success, per the 0/1 convention in the docstring
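
Since the docstring promises a 0/1 return code, a console wrapper for this main() only needs to pass that code to the shell. A minimal sketch, assuming the function lives in prawtools.stats (the import path is an assumption, not confirmed by the snippet):

import sys

from prawtools.stats import main  # assumed location of the main() above

if __name__ == '__main__':
    sys.exit(main())  # exits 0 on success, 1 when no submissions are found
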
Example #44
def main():
    """Provide the entry point in the the modutils command."""
    mod_choices = ('banned', 'contributors', 'moderators')
    mod_choices_dsp = ', '.join(['`%s`' % x for x in mod_choices])
    msg = {
        'add': ('Add users to one of the following categories: %s'
                % mod_choices_dsp),
        'clear': 'Remove users who have no flair set.',
        'css': 'Ignore the CSS field when synchronizing flair.',
        'edit': 'When adding flair templates, mark them as editable.',
        'file': 'The file containing contents for --message',
        'flair': 'List flair for the subreddit.',
        'flair_stats': 'Display the number of users with each flair.',
        'json': 'Output the results as json. Applies to --flair',
        'limit': ('The minimum number of users that must have the specified '
                  'flair in order to add as a template. default: %default'),
        'list': ('List the users in one of the following categories: '
                 '%s. May be specified more than once.') % mod_choices_dsp,
        'msg': ('Send message to users of one of the following categories: '
                '%s. Message subject provided via --subject, content provided '
                'via --file or STDIN.') % mod_choices_dsp,
        'sort': ('The order to add flair templates. Available options are '
                 '`alpha` to add alphabetically, and `size` to first add '
                 'flair that is shared by the most number of users. '
                 'default: %default'),
        'static': ('Add this template when syncing flair templates. When '
                   'syncing text and css use a comma to separate the two.'),
        'subject': 'The subject of the message to send for --message.',
        'sync': 'Synchronize flair templates with current user flair.',
        'text': 'Ignore the text field when synchronizing flair.'
    }

    usage = 'Usage: %prog [options] SUBREDDIT'
    parser = arg_parser(usage=usage)
    parser.add_option('-a', '--add', help=msg['add'])
    parser.add_option('-l',
                      '--list',
                      action='append',
                      help=msg['list'],
                      choices=mod_choices,
                      metavar='CATEGORY',
                      default=[])
    parser.add_option('-c',
                      '--clear-empty',
                      action='store_true',
                      help=msg['clear'])
    parser.add_option('-F', '--file', help=msg['file'])
    parser.add_option('-f', '--flair', action='store_true', help=msg['flair'])
    parser.add_option('',
                      '--flair-stats',
                      action='store_true',
                      help=msg['flair_stats'])
    parser.add_option('-m', '--message', choices=mod_choices, help=msg['msg'])
    parser.add_option('', '--subject', help=msg['subject'])

    group = OptionGroup(parser, 'Format options')
    group.add_option('-j', '--json', action='store_true', help=msg['json'])
    parser.add_option_group(group)

    group = OptionGroup(parser, 'Sync options')
    group.add_option('', '--sync', action='store_true', help=msg['sync'])
    group.add_option('-s', '--static', action='append', help=msg['static'])
    group.add_option('', '--editable', action='store_true', help=msg['edit'])
    group.add_option('',
                     '--ignore-css',
                     action='store_true',
                     default=False,
                     help=msg['css'])
    group.add_option('',
                     '--ignore-text',
                     action='store_true',
                     default=False,
                     help=msg['text'])
    group.add_option('', '--limit', type='int', help=msg['limit'], default=2)
    group.add_option('',
                     '--sort',
                     action='store',
                     choices=('alpha', 'size'),
                     default='alpha',
                     help=msg['sort'])
    parser.add_option_group(group)

    options, args = parser.parse_args()
    if options.pswd and not options.user:
        parser.error('Must provide --user when providing --pswd.')
    if len(args) == 0:
        parser.error('Must provide subreddit name.')
    if options.message and not options.subject:
        parser.error('Must provide --subject when providing --message.')
    subreddit = args[0]

    if not options.disable_update_check:  # Check for updates
        update_check('prawtools', __version__)

    modutils = ModUtils(subreddit, options.site, options.user, options.pswd,
                        options.verbose)

    if options.add:
        modutils.add_users(options.add)
    if options.clear_empty:
        modutils.clear_empty()
    for category in options.list:
        modutils.output_list(category)
    if options.flair:
        modutils.output_current_flair(as_json=options.json)
    if options.flair_stats:
        modutils.output_flair_stats()
    if options.sync:
        modutils.flair_template_sync(editable=options.editable,
                                     limit=options.limit,
                                     static=options.static,
                                     sort=options.sort,
                                     use_css=not options.ignore_css,
                                     use_text=not options.ignore_text)
    if options.message:
        modutils.message(options.message, options.subject, options.file)
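
The option handling above maps directly onto ModUtils calls, so the same moderation tasks can be scripted without going through optparse. A minimal sketch that reuses only the constructor and methods visible in this example; the import path, subreddit name, site, and credentials are placeholders:

from prawtools.mod import ModUtils  # assumed module path for the class used above

# Positional arguments mirror the call in main(): subreddit, site, user, password, verbose.
modutils = ModUtils('mysubreddit', None, 'example_user', 'example_password', False)
modutils.output_flair_stats()
modutils.flair_template_sync(editable=False, limit=2, static=None, sort='alpha',
                             use_css=True, use_text=True)
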