Пример #1
0
    def __init__(self, screen_name, corpus=None, **kwargs):
        '''
        Set up the bot: API client, logger, corpora, Markov models,
        word blacklist, and the last-tweet marker.

        :screen_name User name to post as
        :corpus Text file (or iterable of text files) to read to generate text.
        :api tweepy.API object
        :dry_run boolean If set, TwitterMarkov won't actually post tweets.
        '''
        if 'api' in kwargs:
            self.api = kwargs.pop('api')
        else:
            self.api = tbu.API(screen_name=screen_name, **kwargs)

        # Prefer the API client's own logger when it provides one.
        try:
            self.log = self.api.logger
        except AttributeError:
            self.log = logging.getLogger(screen_name)

        self.screen_name = screen_name
        self.config = self.api.config

        self.dry_run = kwargs.pop('dry_run', False)

        try:
            corpus = corpus or self.config.get('corpus')

            # Accept a single file name or any iterable of file names.
            # 'basestring' only exists on Python 2; 'str' works on Python 3
            # (sibling code uses six.string_types for the same check).
            if isinstance(corpus, str):
                corpora = [corpus]

            elif isinstance(corpus, Iterable):
                corpora = corpus

            else:
                raise RuntimeError('Unable to find any corpora!')

            self.corpora = [b for b in corpora if b is not None]

            self.log.debug('%s, %s', screen_name, self.corpora)

            state_size = kwargs.get('state_size', self.config.get('state_size'))

            self.models = self._setup_models(self.corpora, state_size)

        except RuntimeError as e:
            self.log.error(e)
            # Bare raise preserves the original traceback.
            raise

        self.log.debug('models: %s', list(self.models.keys()))

        blacklist = kwargs.get('blacklist') or self.config.get('blacklist', [])
        self.wordfilter = Wordfilter()
        self.wordfilter.add_words(blacklist)

        # Remember the most recent tweet's id so learning can resume there.
        self.last_tweet = self.api.user_timeline(count=1)[0].id

        if kwargs.get('learn', True):
            self.learn_parent()
Пример #2
0
    def __init__(self, screen_name, corpus=None, **kwargs):
        '''
        Set up the bot: API client, logger, corpora, Markov models and
        the word blacklist.

        :screen_name User name to post as
        :corpus Text file (or iterable of text files) to read to generate text
        :api tweepy.API object
        :dry_run boolean If set, the bot won't actually post tweets
        '''
        if 'api' in kwargs:
            self.api = kwargs.pop('api')
        else:
            self.api = tbu.API(screen_name=screen_name, **kwargs)

        # Prefer the API client's own logger when it provides one.
        try:
            self.log = self.api.logger
        except AttributeError:
            self.log = logging.getLogger(screen_name)

        self.screen_name = screen_name
        self.config = self.api.config
        self.dry_run = kwargs.pop('dry_run', False)

        self.log.debug('screen name: %s', screen_name)
        self.log.debug("dry run: %s", self.dry_run)

        try:
            corpus = corpus or self.config.get('corpus')

            # Accept a single file name or any iterable of file names.
            if isinstance(corpus, six.string_types):
                corpora = [corpus]

            elif isinstance(corpus, Iterable):
                corpora = corpus

            else:
                raise RuntimeError('Unable to find any corpora!')

            self.corpora = [b for b in corpora if b is not None]

            state_size = kwargs.get('state_size',
                                    self.config.get('state_size'))

            self.models = self._setup_models(self.corpora, state_size)

        except RuntimeError as e:
            self.log.error(e)
            # Bare raise preserves the original traceback.
            raise

        self.log.debug('models: %s', list(self.models.keys()))

        blacklist = kwargs.get('blacklist') or self.config.get('blacklist', [])
        self.wordfilter = Wordfilter()
        self.wordfilter.add_words(blacklist)

        self.log.debug('blacklist: %s terms', len(self.wordfilter.blacklist))

        if kwargs.get('learn', True):
            self.log.debug('learning...')
            self.learn_parent()
Пример #3
0
    def __init__(self, channels, nickname, server, port, owner, usessl,
                 password, engine_host, engine_port, api_key, respond):
        """Initialize the bot's attributes and connect to the IRC server.

        :channels: iterable of channel names to join on connect
        :nickname: IRC nick (and realname) for the bot
        :usessl: whether to wrap the connection in SSL/TLS
        :engine_host/:engine_port: location of the conversation engine
        """
        # Initialize the class' attributes.
        for i in channels:
            self.joined_channels[i] = 1
        # BUG FIX: the original read an undefined name `nick`; the
        # constructor parameter is `nickname` (class attributes are not
        # visible as bare names inside a method).
        self.canonical_name = nickname
        self.nick = nickname
        self.owner = owner
        self.server = server
        self.port = port
        self.password = password
        self.authenticated = False
        self.usessl = usessl
        # NOTE(review): assumes engine_port is already a string — confirm
        # against the caller.
        self.engine = 'http://' + engine_host + ':' + engine_port
        self.api_key = api_key
        self.wordfilter = Wordfilter()
        self.respond = respond
        self.ghost = False

        # Connection factory object handle.
        factory = ""

        # If SSL/TLS support is requested, pass the ssl.wrap_socket() method
        # as a keyword argument.
        if self.usessl:
            logger.debug("Constructing SSL/TLS server connector.")
            factory = irc.connection.Factory(wrapper=ssl.wrap_socket)
        else:
            logger.debug("Constructing plaintext server connector.")
            factory = irc.connection.Factory()

        # Initialize an instance of this class by running the parent class'
        # default initializer method.
        #
        # [(server, port)] can be a list of one or more (server, port) tuples
        # because it can connect to more than one at once.
        # The other two arguments are the bot's nickname and realname.
        logger.debug("Instantiating SingleServerIRCBot superclass.")
        irc.bot.SingleServerIRCBot.__init__(self, [(self.server, self.port)],
                                            self.nick,
                                            self.nick,
                                            connect_factory=factory)
        logger.debug("Channels configured for this bot:")
        logger.debug("  " + str(self.joined_channels))
Пример #4
0
    def __init__(
            self,
            bible: biblemunger.Bible,
            favdict,  #: list[dict],
            apptitle: str,
            appsubtitle: str,
            dbpath: str,
            wordfilter: bool):
        """Configure the app: store settings, optionally enable the word
        filter, read deployment info, and ensure the database is initialized.

        :bible: the Bible object to munge
        :favdict: mapping of search term -> replacement for favorites
        :dbpath: path to the sqlite database file
        :wordfilter: when true, enable the profanity word filter
        """
        self.bible = bible
        self.apptitle = apptitle
        self.appsubtitle = appsubtitle
        self.dbpath = dbpath

        if wordfilter:
            from wordfilter import Wordfilter
            self.wordfilter = Wordfilter()
            self.wordfilter.add_words(['QwertyStringUsedForTestingZxcvb'])
        else:
            self.wordfilter = False

        deploymentinfofile = os.path.join(scriptdir, 'deploymentinfo.txt')
        if os.path.exists(deploymentinfofile):
            with open(deploymentinfofile) as df:
                self.deploymentinfo = df.read()
        else:
            self.deploymentinfo = "development version"

        # TODO: refactor this, just use a dictionary directly elsewhere
        self.favorite_searches = [
            {'search': search, 'replace': replace}
            for search, replace in favdict.items()
        ]

        # Close the connection even if the query raises; the original
        # leaked it (a `with` block on sqlite3 only manages transactions,
        # not the connection itself).
        conn = sqlite3.connect(self.dbpath)
        try:
            c = conn.cursor()
            c.execute(
                "select name from sqlite_master where type='table' and name='recent_searches'"
            )
            if not c.fetchone():
                self.initialize_database()
        finally:
            conn.close()
Пример #5
0
    def clean_description(self):
        """Validate the submitted description against spam and profanity
        checks; raise ValidationError when either check trips."""
        desc = self.cleaned_data['description']

        # Skip the (network-bound) Akismet call while under test.
        if settings.TESTING:
            is_spam = False
        else:
            akismet = Akismet(settings.AKISMET_KEY, blog="CC Search")
            is_spam = akismet.check(
                self.request.get_host(),
                user_agent=self.request.META.get('user-agent'),
                comment_author=self.request.user.username,
                comment_content=desc)

        has_bad_words = Wordfilter().blacklisted(desc)

        if is_spam or has_bad_words:
            raise forms.ValidationError(
                "This description failed our spam or profanity check; the description has not been updated."
            )

        return desc
Пример #6
0
def botechre(times=1):
    """Yield up to `times` generated titles that pass the word filter.

    At most times * 10 candidate titles are tried, so a long run of
    blacklisted titles cannot loop forever.
    """
    corpus = {}
    try:
        with codecs.open(CORPUS_FILENAME, encoding='utf-8') as fp:
            corpus = json.load(fp)
    except IOError:
        sys.stderr.write('File not found: %s\n' % CORPUS_FILENAME)
        sys.stderr.write('Run %s first.\n' % 'build.py')
        return

    assembler = BotechreAssembler(corpus)
    wordfilter = Wordfilter()
    max_times = times * 10

    # BUG FIX: the original compared `times` against the loop index (which
    # also counts blacklisted skips) and only broke *after* yielding, so it
    # could yield one title too many.  Count actual yields instead.
    yielded = 0
    for _ in range(max_times):
        title = random_title(assembler)
        if not wordfilter.blacklisted(title):
            yield title
            yielded += 1
            if yielded >= times:
                break
Пример #7
0
    def __init__(self, screen_name, corpus=None, **kwargs):
        '''
        Set up the bot: API client, logger, corpora, Markov models and
        the word blacklist.

        :screen_name User name to post as
        :corpus Text file (or iterable of text files) to read to generate text
        :api tweepy.API object
        :dry_run boolean If set, the bot won't actually post tweets
        '''
        if 'api' in kwargs:
            self.api = kwargs.pop('api')
        else:
            self.api = tbu.API(screen_name=screen_name, **kwargs)

        # Prefer the API client's own logger when it provides one.
        try:
            self.log = self.api.logger
        except AttributeError:
            self.log = logging.getLogger(screen_name)

        self.screen_name = screen_name
        self.config = self.api.config
        self.dry_run = kwargs.pop('dry_run', False)

        self.log.debug('screen name: %s', screen_name)
        self.log.debug("dry run: %s", self.dry_run)

        try:
            corpus = corpus or self.config.get('corpus')

            # Accept a single file name or any iterable of file names.
            if isinstance(corpus, six.string_types):
                corpora = [corpus]

            elif isinstance(corpus, Iterable):
                corpora = corpus

            else:
                raise RuntimeError('Unable to find any corpora!')

            self.corpora = [b for b in corpora if b is not None]

            state_size = kwargs.get('state_size', self.config.get('state_size'))

            self.models = self._setup_models(self.corpora, state_size)

        except RuntimeError as e:
            self.log.error(e)
            # Bare raise preserves the original traceback.
            raise

        self.log.debug('models: %s', list(self.models.keys()))

        blacklist = kwargs.get('blacklist') or self.config.get('blacklist', [])
        self.wordfilter = Wordfilter()
        self.wordfilter.add_words(blacklist)

        self.log.debug('blacklist: %s terms', len(self.wordfilter.blacklist))

        if kwargs.get('learn', True):
            self.log.debug('learning...')
            self.learn_parent()
Пример #8
0
    def __init__(self, screen_name, brains=None, **kwargs):
        '''
        Set up the bot: API client, brains, word blacklist and tweet checker.

        :screen_name User name to post as
        :brains Brain file name, or list of brain file names; falls back to
            the 'brain'/'brains' entries of the config
        '''
        self.screen_name = screen_name

        # BUG FIX: dict.get evaluates its default eagerly, so the original
        # constructed a second API client (with all its side effects) even
        # when one was passed in.  Only build one when none was supplied.
        if 'api' in kwargs:
            self.api = kwargs['api']
        else:
            self.api = tbu.api.API(screen_name, **kwargs)

        self.config = kwargs.get('config', self.api.config)

        self.logger = logging.getLogger(screen_name)

        try:
            if isinstance(brains, str):
                brains = [brains]

            # No explicit brains given: fall back to the config entries.
            if not isinstance(brains, list):
                brain = self.config.get('brain', [])
                brains = brain + self.config.get('brains', [])

            if not brains:
                raise RuntimeError

            self.brains = self._setup_brains(brains)

        except (IOError, IndexError, RuntimeError) as e:
            self.logger.error('Feed me brains: unable to find any brains!')
            # Bare raise preserves the original traceback.
            raise

        self.logger.debug('Brains: {0}'.format(list(self.brains.keys())))

        self.dry_run = kwargs.get('dry_run', False)

        self.wordfilter = Wordfilter()
        self.wordfilter.add_words(self.config.get('blacklist', []))

        self.checker = checking.construct_tweet_checker(
            no_retweets=self.config.get('no_retweets'),
            no_replies=self.config.get('no_replies')
        )

        if kwargs.get('learn', True):
            self.learn_parent()
import theano
import theano.tensor as T
from databases.textproject_reconstruction_database import TextProjectReconstructionDatabase

from nn.containers import Sequential
from nn.rnns import LNLSTM
from nn.layers import OneHot
from nn.utils import Vocabulary
import nn.utils

from lm_vae import Sampler
from lm_vae_sample import LNLSTMStep
from textproject_vae_charlevel import make_model

from wordfilter import Wordfilter
wordfilter = Wordfilter()

# NOTE(review): `time`, `os` and `pickle` are used below but not imported
# here — presumably imported earlier in the full file; verify.
t1 = time.time()

# Name of the training session whose artifacts live under session/<name>/.
session = "sp15_trial"

vocab = Vocabulary()

# Load a previously pickled vocabulary for this session if one exists;
# otherwise fall back to the default character vocabulary.
if os.path.exists("session/%s/vocab.pkl" % session):
    # NOTE(review): file is opened in text mode; pickle.load requires
    # binary mode ("rb") on Python 3 — presumably this targets Python 2.
    with open("session/%s/vocab.pkl" % session) as vocab_file:
        vocab = pickle.load(vocab_file)
        print("Loaded vocab with %i chars:" % len(vocab))
        #print(vocab.index_to_word)
else:
    print("Using default 256-char vocab")
    # old-school
Пример #10
0
import json
import re
import requests
import time
import lxml.html
from . import logger
from io import BytesIO

log = logger.get("common")

MEDIAWIKI_API = "https://commons.wikimedia.org/w/api.php"

HEADERS = {"User-Agent": "picdescbot, http://github.com/elad661/picdescbot"}

# Raw string: '\.' in a plain literal is an invalid escape sequence
# (DeprecationWarning / future SyntaxError on modern Python).
supported_formats = re.compile(r'\.(png|jpe?g|gif)$', re.I)
word_filter = Wordfilter()

# I really don't want the bot to show this kind of imagery!
word_filter.add_words(['nazi', 'hitler', 'reich'])

# I can't trust Microsoft's algorithm to not be racist, so I should probably
# make the bot avoid posting images with the following words in them.
# I'm not using wordfilter here because it would over-filter in some cases.
# also filter "gun" because this is not the kind of content I want the bot to post
# This is matched only against the caption generated by CVAPI.
extra_filter = {'ape', 'apes', 'monkey', 'monkeys', 'gun'}

# Blacklisted phrases (instead of words) to blacklist certain phrases
# in the wikimedia description
blacklisted_phrases = {
    'comic strip', 'logo', 'biblical illustration', 'church',
Пример #11
0
class DixieBot(irc.bot.SingleServerIRCBot):
    """IRC bot that relays messages to a remote conversation engine and
    speaks the engine's responses back into channels and private chats."""

    # Class-level variables which form attributes.  These all refer to aspects
    # of the bot.
    # NOTE(review): __init__ rebinds all of these as instance attributes;
    # the class-level values act only as defaults/documentation.
    joined_channels = IRCDict()
    canonical_name = ""
    nick = ""
    owner = ""

    # Connection information.
    server = ""
    port = 0

    # The bot's owner's authentication password.
    password = ""

    # Is the bot's owner authenticated or not?
    authenticated = ""

    # Whether or not the connection is SSL/TLS encrypted or not.
    usessl = ""

    # Response engine's hostname and port.
    engine = ""

    # Bot's API key to interface with the response engine.
    api_key = ""

    # One instance of wordfilter.Wordfilter() to rule them all...
    wordfilter = None

    # Whether or not to use the conversation engine to respond?
    respond = None

    # Whether or not the bot's owner can speak through the bot by using
    # private messages.  By default, the bot doesn't let you do that.
    ghost = None

    # Methods on the connection object to investigate:
    # connect() - Connect to a server?
    # connected() -
    # disconnect() -
    # get_nickname() -
    # get_server_name() -
    # info() -
    # ircname() -
    # is_connected() - See if the connection is still up?
    # part() - Leave channel?
    # privmsg() - Send privmsg?
    # quit() - Terminate IRC connection?
    # reconnect() - Reconnect to server?
    # send_raw() -
    # stats() -
    # time() -

    def __init__(self, channels, nickname, server, port, owner, usessl,
                 password, engine_host, engine_port, api_key, respond):
        """Initialize the bot's attributes and connect to the IRC server.

        :channels: iterable of channel names to join on connect
        :nickname: IRC nick (and realname) for the bot
        :usessl: whether to wrap the connection in SSL/TLS
        :engine_host/:engine_port: location of the conversation engine
        """
        # Initialize the class' attributes.
        for i in channels:
            self.joined_channels[i] = 1
        # BUG FIX: the original read an undefined name `nick`; the
        # constructor parameter is `nickname` (class attributes are not
        # visible as bare names inside a method).
        self.canonical_name = nickname
        self.nick = nickname
        self.owner = owner
        self.server = server
        self.port = port
        self.password = password
        self.authenticated = False
        self.usessl = usessl
        # NOTE(review): assumes engine_port is already a string — confirm
        # against the caller.
        self.engine = 'http://' + engine_host + ':' + engine_port
        self.api_key = api_key
        self.wordfilter = Wordfilter()
        self.respond = respond
        self.ghost = False

        # Connection factory object handle.
        factory = ""

        # If SSL/TLS support is requested, pass the ssl.wrap_socket() method
        # as a keyword argument.
        if self.usessl:
            logger.debug("Constructing SSL/TLS server connector.")
            factory = irc.connection.Factory(wrapper=ssl.wrap_socket)
        else:
            logger.debug("Constructing plaintext server connector.")
            factory = irc.connection.Factory()

        # Initialize an instance of this class by running the parent class'
        # default initializer method.
        #
        # [(server, port)] can be a list of one or more (server, port) tuples
        # because it can connect to more than one at once.
        # The other two arguments are the bot's nickname and realname.
        logger.debug("Instantiating SingleServerIRCBot superclass.")
        irc.bot.SingleServerIRCBot.__init__(self, [(self.server, self.port)],
                                            self.nick,
                                            self.nick,
                                            connect_factory=factory)
        logger.debug("Channels configured for this bot:")
        logger.debug("  " + str(self.joined_channels))

    # This method fires if the configured nickname is already in use.  If that
    # happens, change the bot's nick slightly.
    # Note that the name of this method is specifically what the irc module
    # looks for.
    def on_nicknameinuse(self, connection, event):
        """Fired when the configured nick is taken; append "_" and retry.

        The method name is exactly what the irc module dispatches on.
        """
        fallback_notice = ("Bot nickname " + self.nick +
                           " is already taken.  Falling back to bot nickname " +
                           self.nick + "_.")
        logger.info(fallback_notice)
        owner_notice = (self.nick +
                        " seems to be taken already.  Falling back to nickname " +
                        self.nick + "_.")
        connection.privmsg(self.owner, owner_notice)
        connection.nick(connection.get_nickname() + "_")

    # This method fires when the server accepts the bot's connection.  It walks
    # through the IRCDict of channels and tries to join each one.
    def on_welcome(self, connection, event):
        """Fired when the server accepts the connection: join every
        configured channel, and with 1-in-10 odds greet each one."""
        logger.debug("Entered DixieBot.on_welcome().")
        for channel in self.joined_channels:
            logger.debug("Trying to join channel " + channel + ".")
            connection.join(channel)
            logger.info("Joined channel " + channel + ".")
            connection.privmsg(self.owner, "Joined " + channel + ".")

            # Just to be silly, roll 1d10.  On a 1, say hello to the channel.
            if random.randint(1, 10) == 1:
                time.sleep(random.randint(1, 10))
                logger.debug("Bot has randomly decided to announce itself.")
                greeting = ("Hey, bro!  I'm " + self.nick +
                            ", the best cowboy who ever punched deck!")
                connection.privmsg(channel, greeting)
        logger.debug("Exiting DixieBot.on_welcome().")

    # This method fires if the bot gets kicked from a channel.  The smart
    # thing to do is sleep for a random period of time (between one and three
    # minutes) before trying to join again.
    def on_kick(self, connection, event):
        """Fired when the bot is kicked: wait 1-3 minutes, then rejoin."""
        delay = random.randint(60, 180)
        kicked_notice = ("Got kicked from " + event.target +
                         ".  Sleeping for " + str(delay) + " seconds.")
        logger.debug(kicked_notice)
        connection.privmsg(self.owner, kicked_notice)
        time.sleep(delay)
        logger.debug("Rejoining channel " + event.target + ".")
        connection.privmsg(self.owner,
                           "Rejoining channel " + event.target + ".")
        connection.join(event.target)
        rejoined_notice = ("Successfully re-joined channel " +
                           event.target + ".")
        logger.info(rejoined_notice)
        connection.privmsg(self.owner, rejoined_notice)
        return

    # This method fires if the bot gets kickbanned.
    def on_bannedfromchan(self, connection, event):
        """Fired when the bot is kickbanned: notify the owner and forget
        the channel so it won't be rejoined."""
        banned_notice = ("Uh-oh - I got kickbanned from " + event.target +
                         ".  I know when I'm not wanted.")
        logger.warn(banned_notice)
        # BUG FIX: the original called self.privmsg(), which doesn't exist
        # on the bot; every other handler sends through the connection.
        connection.privmsg(self.owner, banned_notice)
        # BUG FIX: joined_channels is a mapping (__init__ does
        # joined_channels[i] = 1), so it has no .remove(); drop the key.
        self.joined_channels.pop(event.target, None)
        return

    # This method fires when the server disconnects the bot for some reason.
    # Ideally, the bot should try to connect again after a random number of
    # seconds.
    def on_disconnect(self, connection, event):
        """Fired when the server drops the bot: wait 1-3 minutes, then
        try to reconnect."""
        delay = random.randint(60, 180)
        logger.warn("Connection dropped from server " + self.server +
                    ".  Sleeping for " + str(delay) + " seconds.")
        time.sleep(delay)
        logger.warn("Reconnecting to server " + self.server + " on port " +
                    str(self.port) + ".")
        try:
            irc.bot.SingleServerIRCBot.connect(self,
                                               [(self.server, self.port)],
                                               self.nick, self.nick)
            logger.info("Successfully reconnected to server " + self.server +
                        ".")
        except Exception:
            # Narrowed from a bare `except:`, which would also swallow
            # KeyboardInterrupt and SystemExit.
            logger.warn("Unable to reconnect to " + self.server +
                        ".  Something's really wrong.")

    # This method fires when the bot receives a private message.  For the
    # moment, if it's the bot's owner always learn from the text because this
    # is an ideal way to get more interesting stuff into the bot's brain.
    # It'll make a good place to look for and respond to specific commands,
    # too.
    def on_privmsg(self, connection, line):
        """Fired on a private message: authenticate the owner, dispatch
        owner commands (!help, !quit, !config, !ping, !nick, !join,
        !respond, !ghost...), relay ghost-mode text to channels, and
        train/query the conversation engine on the owner's text.
        """
        # IRC nick that sent a line to the bot in private chat.
        sending_nick = line.source.split("!~")[0]

        # Line of text sent from the channel or private message.
        irc_text = line.arguments[0]

        # String that holds what may or may not be a channel name.
        possible_channel_name = None

        # String that may or may not hold a respond to a channel in ghost mode.
        irc_response = None

        # Handle to an HTTP request object.
        http_connection = ""

        # JSON document containing responses from the conversation engine.
        json_response = {}

        # See if the owner is authenticating to the bot.
        if "!auth " in irc_text:
            self._authenticate(connection, sending_nick, irc_text)
            return

        # Handle messages from the bot's owner (if authenticated).
        if sending_nick == self.owner:
            if not self.authenticated:
                connection.privmsg(sending_nick, "You're not authenticated.")
                return

            # If the owner asks for online help, provide it.
            if irc_text == "!help" or irc_text == "!commands":
                self._help(connection, sending_nick)
                return

            # See if the owner is asking the bot to self-terminate.
            if irc_text == "!quit":
                logger.info("The bot's owner has told it to shut down.")
                connection.privmsg(sending_nick,
                                   "I get the hint.  Shuttin' down.")
                sys.exit(0)

            # See if the owner is asking for the bot's current configuration.
            if irc_text == "!config":
                self._current_config(connection, sending_nick)
                return

            # See if the owner is asking the bot to ping the conversation
            # engine's server.
            if irc_text == "!ping":
                self._ping(connection, sending_nick)
                return

            # See if the owner is asking the bot to change its nick.
            if "!nick" in irc_text:
                self._nick(connection, irc_text, sending_nick)
                return

            # See if the owner is asking the bot to join a channel.
            if "!join " in irc_text:
                self._join(connection, irc_text, sending_nick)
                return

            # See if the owner is flipping the self.respond flag.
            if "!respond" in irc_text:
                self._respond(connection, irc_text, sending_nick)
                return

            # See if the owner is asking for help on ghost mode.
            # NOTE: "!ghosthelp" must be tested before "!ghost" — the
            # substring check for "!ghost" would also match it.
            if "!ghosthelp" in irc_text:
                self._ghost_help(connection, sending_nick)
                return

            # See if the owner is flipping the self.ghost flag.
            if "!ghost" in irc_text:
                self._ghost_mode(connection, sending_nick)
                return

            # If the bot's in ghost mode, determine whether or not the bot's
            # owner has sent text destined for a channel the bot's sitting in.
            # If this is the case, send the channel the text sent by the
            # bot's owner.
            possible_channel_name = irc_text.split()[0]
            logger.debug("Value of possible_channel_name: " +
                         possible_channel_name)
            if self.ghost:
                if "#" in possible_channel_name:

                    # Test to see if the bot is in the channel in question.
                    in_channel = False
                    for channel in self.joined_channels:
                        if channel == possible_channel_name:
                            in_channel = True
                            break
                    if not in_channel:
                        logger.debug("Not in channel " +
                                     possible_channel_name + ".")
                        connection.privmsg(
                            sending_nick, "I'm not in channel " +
                            possible_channel_name + ".")
                        return
                    logger.debug("In channel " + possible_channel_name + ".")

                    # Send the text to the channel.
                    irc_response = " ".join(irc_text.split()[1:])
                    logger.debug("Value of irc_response: " + irc_response)
                    connection.privmsg(possible_channel_name, irc_response)

            # Always learn from private messages from the bot's owner.  Do not
            # respond to them if the bot's in ghost mode.  Determine whether
            # or not a #channelname is at the head of the text and if so
            # elide it by setting the line of text from the IRC channel to
            # the IRC response which already has the #channelname removed.
            if "#" in possible_channel_name:
                irc_text = " ".join(irc_text.split()[1:])
                logger.debug(
                    "Got a possible channel name.  Set value of irc_text to: "
                    + str(irc_text))

            # Train the bot on text sent by the bot's owner.
            json_response = json.loads(self._teach_brain(irc_text))
            if json_response['id'] != 200:
                logger.warn(
                    "DixieBot.on_privmsg(): Conversation engine returned error code "
                    + str(json_response['id']) + ".")

            # Don't get responses when in ghost mode.
            if self.ghost:
                return

            # Get a response for text sent by the bot's owner.
            json_response = json.loads(self._get_response(irc_text))
            if json_response['id'] != 200:
                logger.warn(
                    "DixieBot.on_privmsg(): Conversation engine returned error code "
                    + str(json_response['id']) + ".")
                return

            # Send the response text back to the bot's owner.
            connection.privmsg(sending_nick, json_response['response'])
            return
        else:
            logger.debug(
                "Somebody messaged me.  The content of the message was: " +
                irc_text)

    # Helper method for authenticating the bot's owner.
    def _authenticate(self, connection, nick, text):
        """Handle an !auth attempt.  On success, the sender becomes the
        bot's owner and is marked authenticated."""
        logger.warn("IRC user " + nick +
                    " is attempting to authenticate to the bot.")
        # Guard clause: reject anything that doesn't carry the password.
        if self.password not in text:
            connection.privmsg(nick, "Incorrect.")
            return
        connection.privmsg(nick,
                           "Authentication confirmed.  Welcome back.")
        self.owner = nick
        self.authenticated = True
        return

    # Helper method that implements online help.
    def _help(self, connection, nick):
        """Send the list of supported commands to `nick`, one per message."""
        help_lines = (
            "Here are the commands I support:",
            "!help and !commands - You're reading them right now.",
            "!quit - Shut me down.",
            "!auth - Authenticate your current IRC nick as my admin.",
            "!config - Send my current configuration.",
            "!ping - Ping the conversation engine to make sure I can contact it.",
            "!nick <new nick> - Try to change my IRC nick.",
            "!join <channel> - Join a channel.",
            "!respond - Toggle respond/don't respond to users flag.",
            "!ghosthelp - Get online help for ghost mode.",
            "!ghost - Whether or not the bot's registered owner can remotely interact with a channel the bot's a member of using the bot as a client.",
        )
        for help_line in help_lines:
            connection.privmsg(nick, help_line)
        return

    # Helper method that tells the bot's owner what the bot's current runtime
    # configuration is.
    def _current_config(self, connection, nick):
        """Report the bot's current runtime configuration to `nick`."""
        report = ["Here's my current runtime configuration.",
                  "Channels I'm connected to: "]
        report.extend("  " + channel for channel in self.joined_channels)
        report.append("Current nick: " + self.nick)
        report.append(
            "Canonical name (for interacting with the conversation engine): " +
            self.canonical_name)
        report.append(
            "Server and port: " + self.server + " " + str(self.port) + "/tcp")
        if self.usessl:
            report.append("My connection to the server is encrypted.")
        else:
            report.append("My connection to the server isn't encrypted.")
        if self.respond:
            report.append("I respond to people talking to me.")
        else:
            report.append("I don't respond to people talking to me.")
        if self.ghost:
            report.append("I am monitoring IRC channels in ghost mode.")
        else:
            report.append("I am not in ghost mode.")
        # Deliver the report line by line, in the order built above.
        for report_line in report:
            connection.privmsg(nick, report_line)
        return

    # Helper method that pings the bot's conversation engine.  I realize that
    # doing this is probably a little weird, but seeing as how I'm splitting
    # everything else out into helper methods to make adding functionality
    # later on easier I may as well.
    def _ping(self, connection, nick):
        """Check whether the conversation engine's /ping endpoint answers,
        and report the result to `nick`."""
        connection.privmsg(nick, "Pinging the conversation engine...")
        reply = requests.get(self.engine + "/ping")
        if reply.text != "pong":
            connection.privmsg(
                nick,
                "I don't seem to be able to reach the conversation engine.")
        else:
            connection.privmsg(nick, "I can hit the conversation engine.")
        return

    # Helper method that will allow the bot to change its nick.
    def _nick(self, connection, text, nick):
        """Change the bot's IRC nick to the one given after !nick in `text`."""
        connection.privmsg(nick, "Trying to change my IRC nick...")
        requested_nick = text.split()[1].strip()
        self.nick = requested_nick
        connection.nick(requested_nick)
        logger.debug("New IRC nick: " + requested_nick)
        connection.privmsg(nick, "Done.")
        return

    # Helper method that will allow the bot to join a channel.
    def _join(self, connection, text, nick):
        new_channel = text.split()[1].strip()
        connection.privmsg(nick, "Trying to join channel " + new_channel + ".")
        logger.debug("Trying to join channel " + new_channel + ".")
        connection.join(new_channel)
        self.joined_channels[new_channel] = 1
        connection.privmsg(nick, "Joined " + new_channel + ".")
        return

    # Helper method that flips the bot's mode from "respond when spoken to"
    # to "don't respond when spoken to".
    def _respond(self, connection, text, nick):
        """Toggle the bot's auto-response flag and tell `nick` the new state.

        Args:
            connection: IRC server connection used to privmsg the user.
            text: Raw command text (unused; kept for a uniform helper
                signature alongside the other command helpers).
            nick: Nick of the user to notify.
        """
        # Use the flag's truthiness in a single if/else rather than two
        # separate `== True` / `== False` blocks; the original silently did
        # nothing if the flag ever held a non-bool value.
        if self.respond:
            self.respond = False
            logger.info("Turn off the bot's auto-response mode.")
            connection.privmsg(nick,
                               "I won't respond to people talking to me.")
        else:
            self.respond = True
            logger.info("Turn on the bot's auto-response mode.")
            connection.privmsg(nick, "Now responding to people talking to me.")
        return

    # Send the user online help for ghost mode.
    def _ghost_help(self, connection, nick):
        connection.privmsg(
            nick,
            "Ghost mode lets you interact with any channel I'm sitting in remotely so you don't have to join it."
        )
        connection.privmsg(
            nick,
            "This is ideal if you want to maintain a certain degree of stealth."
        )
        connection.privmsg(
            nick,
            "I can join the channel from one server and interact with everyone like a bot, and you can connect from another server without joining any channels, !auth to me, and communicate through me."
        )
        connection.privmsg(
            nick,
            "If I get rumbled, I get bounced and your disposable server can be banned, and all you have to do is get a copy of my conversation engine to preserve me.  You should be okay."
        )
        connection.privmsg(
            nick,
            "Please note that if you have me join a number of busy channels you may not be able to keep up with all the traffic, so choose the channels I join wisely.  Keep the number small for best results."
        )
        connection.privmsg(
            nick,
            "Put the name of the channel you want me to send text to at the front of a private message, like this:"
        )
        connection.privmsg(nick, "/msg botname")
        connection.privmsg(nick, "#somechannel Hello, world.")
        connection.privmsg(
            nick,
            "I will send activity in the channel back to you via the same privmsg as long as you're authenticated."
        )
        return

    # Flips the ghost mode flag.
    def _ghost_mode(self, connection, nick):
        """Toggle ghost mode and tell `nick` what the new state is.

        When ghost mode turns on, also list the channels that can now be
        driven remotely through the bot.
        """
        # Truthiness test in a single if/else instead of separate `== False`
        # / `== True` blocks; the original fell through silently for any
        # non-bool value of the flag.
        if not self.ghost:
            self.ghost = True
            logger.info("Ghost mode now activated.")
            connection.privmsg(nick, "Ghost mode activated.")
            connection.privmsg(
                nick,
                "You can now interact with the following channels through me: "
            )
            for channel in self.joined_channels:
                connection.privmsg(nick, "  " + channel)
        else:
            self.ghost = False
            logger.info("Ghost mode now deactivated.")
            connection.privmsg(nick, "Ghost mode deactivated.")
        return

    # This method fires every time a public message is posted to an IRC
    # channel.  Technically, 'line' should be 'event' but I'm just now getting
    # this module figured out...
    def on_pubmsg(self, connection, line):
        """Handle a public channel message.

        Owner traffic (when authenticated) is always learned from; the bot
        replies either when addressed directly or, otherwise, 5% of the
        time.  Non-owner traffic is relayed in ghost mode and learned from
        probabilistically, subject to the word filter.
        """
        # JSON document from the conversation engine.
        json_response = {}

        # IRC nick that sent a line to the channel.
        # NOTE(review): assumes hostmasks look like "nick!~user@host"; a
        # source without "!~" leaves the whole string as the nick — confirm
        # against the IRC library in use.
        sending_nick = line.source.split("!~")[0]
        logger.debug("Sending nick: " + sending_nick)

        # Line of text sent from the channel.
        irc_text = line.arguments[0]

        # If the line is from the bot's owner, learn from it and then decide
        # whether to respond or not.  Just in case somebody grabs the nick of
        # the bot's owner, don't respond if they're not authenticated (because
        # that could go real bad, real fast...)
        if sending_nick == self.owner and self.authenticated:

            # If the bot's owner addressed it directly, always respond.  Just
            # make sure to remove the bot's nick from the text to minimize
            # spurious entries in the bot's brain.
            asked_directly = irc_text.split(':')[0].strip()
            if asked_directly == self.nick:
                logger.debug(
                    "The bot's owner addressed the construct directly.  This is a special case."
                )

                # Extract the dialogue from the text in the IRC channel.
                # NOTE(review): only the first ':'-delimited chunk after the
                # nick is kept; a message containing further colons is
                # truncated — confirm this is intended.
                dialogue_text = irc_text.split(':')[1].strip()

                # Send a request to train the conversation engine on the text.
                logger.debug("Training engine on text: " + dialogue_text)
                json_response = json.loads(self._teach_brain(dialogue_text))
                if json_response['id'] != int(200):
                    logger.warn(
                        "DixieBot.on_pubmsg(): Conversation engine returned error code "
                        + str(json_response['id']) + ".")
                    return

                # If the bot is in ghost mode, do not respond.
                if self.ghost:
                    return

                # Get a response to the text from the channel.
                json_response = json.loads(self._get_response(irc_text))
                if json_response['id'] != int(200):
                    logger.warn(
                        "DixieBot.on_pubmsg(): Conversation engine returned error code "
                        + str(json_response['id']) + ".")
                    return

                # Send the reply to the channel.
                connection.privmsg(line.target, json_response['response'])
                return

            # Otherwise, just learn from the bot's owner.
            json_response = json.loads(self._teach_brain(irc_text))
            if json_response['id'] != int(200):
                logger.warn(
                    "DixieBot.on_pubmsg(): Conversation engine returned error code "
                    + str(json_response['id']) + ".")
                return

            # Check the respond/don't respond flag.  If it's set to False,
            # don't say anything.
            if not self.respond:
                return

            # If the respond/don't respond flag it set to True, decide if the
            # bot is going to respond or not.  To be polite to people, only
            # respond 5% of the time.  10% was too much.
            roll = random.randint(1, 100)
            if roll <= 5:
                json_response = json.loads(self._get_response(irc_text))
                if json_response['id'] != int(200):
                    logger.warn(
                        "DixieBot.on_pubmsg(): Conversation engine returned error code "
                        + str(json_response['id']) + ".")
                    return

                # connection.privmsg() can be used to send text to either a
                # channel or a user.
                # Send the response.
                connection.privmsg(line.target, json_response['response'])
            return

        # If the line is not from the bot's owner, and the bot is in ghost
        # mode, relay the line to the bot's owner via privmsg.
        if self.ghost and self.authenticated:
            logger.debug("Relaying a line of text from " + line.target +
                         " to the bot's owner.")
            connection.privmsg(self.owner, line.target + ":: " + irc_text)

        # If the line is not from the bot's owner, decide randomly if the bot
        # should learn from it, or learn from and respond to it.  Respect the
        # respond/don't respond flag.
        # NOTE(review): the roll == 1 and roll == 2 branches duplicate the
        # wordfilter/teach logic; a future cleanup could share a helper.
        roll = random.randint(1, 10)
        if roll == 1:
            logger.debug("Learning from the last line seen in the channel.")
            if self.wordfilter.blacklisted(irc_text):
                logger.warn("Wordfilter: Nope nope nope...")
                return
            json_response = json.loads(self._teach_brain(irc_text))
            if json_response['id'] != int(200):
                logger.warn(
                    "DixieBot.on_pubmsg(): Conversation engine returned error code "
                    + str(json_response['id']) + ".")
            return

        if roll == 2:
            logger.debug(
                "Learning from the last line seen in the channel.  I might respond to it."
            )
            if self.wordfilter.blacklisted(irc_text):
                logger.warn("Wordfilter: Nope nope nope...")
                return
            json_response = json.loads(self._teach_brain(irc_text))
            if json_response['id'] != int(200):
                logger.warn(
                    "DixieBot.on_pubmsg(): Conversation engine returned error code "
                    + str(json_response['id']) + ".")
                return

            # Check the respond/don't respond flag.  If it's set to False,
            # don't say anything.
            if not self.respond:
                return

            # Get and send a response.
            json_response = json.loads(self._get_response(irc_text))
            if json_response['id'] != int(200):
                logger.warn(
                    "DixieBot.on_pubmsg(): Conversation engine returned error code "
                    + str(json_response['id']) + ".")
                return
            connection.privmsg(line.target, json_response['response'])
            return

    # This method should fire when a client in the current channel emits a QUIT
    # event relayed by the server.  It detects the bot's owner disconnecting
    # and deauthenticates them.
    def on_quit(self, connection, event):
        """Deauthenticate the owner when their client disconnects."""
        sending_nick = event.source.split("!~")[0]
        if event.type == "quit" and sending_nick == self.owner and self.authenticated:
            logger.info("The bot's owner has disconnected.  Deauthenticating.")
            self.authenticated = False
            # BUG FIX: the original referenced the undefined name
            # `line.target` here (NameError whenever this branch fired);
            # send the goodbye directly to the departing owner instead.
            connection.privmsg(sending_nick, "Seeya, boss.")
            return

    # Sends text to train the conversation engine on.
    def _teach_brain(self, text):
        """Submit `text` to the conversation engine's /learn endpoint.

        Returns the engine's raw JSON reply as a string; callers (see
        on_pubmsg) run json.loads() on it themselves.
        """
        # Custom headers required by the conversation engine.
        headers = {"Content-Type": "application/json"}

        # JSON document sent to the conversation engine.
        json_request = {
            'botname': self.canonical_name,
            'apikey': self.api_key,
            'stimulus': text,
        }

        # Make an HTTP request to the conversation engine.
        http_request = requests.put(self.engine + "/learn",
                                    headers=headers,
                                    data=json.dumps(json_request))
        # BUG FIX: every caller runs json.loads() on this return value, so
        # return the raw JSON text rather than an already-parsed dict —
        # the original returned json.loads(http_request.content), making
        # the callers' json.loads() raise TypeError.
        return http_request.text

    # Gets a response from the conversation engine.  Return a response.
    def _get_response(self, text):
        """Ask the conversation engine's /response endpoint for a reply
        to `text`.

        Returns the engine's raw JSON reply as a string; callers run
        json.loads() on it themselves.
        """
        # Custom headers required by the conversation engine.
        headers = {"Content-Type": "application/json"}

        # JSON document sent to the conversation engine.
        json_request = {
            'botname': self.canonical_name,
            'apikey': self.api_key,
            'stimulus': text,
        }

        # NOTE(review): a GET with a request body is unusual; the engine
        # apparently expects it — confirm before changing.
        http_request = requests.get(self.engine + "/response",
                                    headers=headers,
                                    data=json.dumps(json_request))
        # BUG FIX: every caller runs json.loads() on this return value, so
        # return the raw JSON text rather than an already-parsed dict
        # (json.loads() on a dict raises TypeError).
        return http_request.text
Пример #12
0
class TwitterMarkov(object):
    """
    Posts markov-generated text to twitter

    Args:
        screen_name (str): Twitter user account
        corpus (str): Text file to read to generate text.
        api (:ref:`tweepy.API <tweepy:tweepy.api>`): API to use to post tweets.
        dry_run (boolean): If set, TwitterMarkov won't actually post tweets.
        blacklist (Sequence): A list of words to avoid generating.
    """

    # Name (dict key in self.models) of the model used when the caller
    # doesn't pick one; set by _setup_models.
    default_model = None
    # Class-level fallback; __init__ gives each instance its own list so
    # instances never share this mutable cache.
    _recently_tweeted = []

    def __init__(self, screen_name, corpus=None, **kwargs):
        if 'api' in kwargs:
            self.api = kwargs.pop('api')
        else:
            self.api = tbu.API(screen_name=screen_name, **kwargs)

        # Prefer the API object's logger if it provides one.
        try:
            self.log = self.api.logger
        except AttributeError:
            self.log = logging.getLogger(screen_name)

        self.screen_name = screen_name
        self.config = self.api.config
        self.dry_run = kwargs.pop('dry_run', False)

        # BUG FIX: per-instance cache so instances don't mutate the shared
        # class attribute (see the `recently_tweeted` property).
        self._recently_tweeted = []

        self.log.debug('screen name: %s', screen_name)
        self.log.debug("dry run: %s", self.dry_run)

        try:
            corpus = corpus or self.config.get('corpus')

            if isinstance(corpus, six.string_types):
                corpora = [corpus]

            elif isinstance(corpus, Iterable):
                corpora = corpus

            else:
                raise RuntimeError('Unable to find any corpora!')

            self.corpora = [b for b in corpora if b is not None]

            state_size = kwargs.get('state_size',
                                    self.config.get('state_size'))

            self.models = self._setup_models(self.corpora, state_size)

        except RuntimeError as e:
            self.log.error(e)
            raise e

        self.log.debug('models: %s', list(self.models.keys()))

        blacklist = kwargs.get('blacklist') or self.config.get('blacklist', [])
        self.wordfilter = Wordfilter()
        self.wordfilter.add_words(blacklist)

        self.log.debug('blacklist: %s terms', len(self.wordfilter.blacklist))

        if kwargs.get('learn', True):
            self.log.debug('learning...')
            self.learn_parent()

    def _setup_models(self, corpora, state_size):
        """
        Given a list of paths to corpus text files or file-like objects,
        set up markovify models for each.
        These models are returned in a dict, (with the basename as key).
        """
        out = dict()
        state_size = state_size or 2
        self.log.debug('setting up models (state_size=%s)', state_size)

        try:
            for pth in corpora:
                if isinstance(pth, six.string_types):
                    corpus_path = os.path.expanduser(pth)
                    name = os.path.basename(corpus_path)
                    m = open(corpus_path)

                else:
                    # A file-like object; derive a model key from its name
                    # attribute when it has one.
                    m = pth
                    try:
                        name = m.name
                    except AttributeError:
                        name = repr(m)

                try:
                    out[name] = markovify.text.NewlineText(
                        m.read(), state_size=state_size)

                finally:
                    m.close()

        except AttributeError as e:
            self.log.error(e)
            self.log.error("Probably couldn't find the model file.")
            raise e

        except IOError as e:
            self.log.error(e)
            # BUG FIX: log the item being processed; the original logged
            # `corpus_path`, which is unbound when the failing entry was a
            # file-like object.
            self.log.error('Error reading %s', pth)
            raise e

        self.default_model = os.path.basename(corpora[0])

        return out

    @property
    def recently_tweeted(self):
        '''Returns recent tweets from ``self.screen_name`` (cached after the
        first fetch).'''
        if not self._recently_tweeted:
            recent_tweets = self.api.user_timeline(self.screen_name,
                                                   count=self.config.get(
                                                       'checkback', 20))
            self._recently_tweeted = [x.text for x in recent_tweets]

        return self._recently_tweeted

    def check_tweet(self, text):
        '''Check if a string contains blacklisted words or is similar to a recent tweet.

        Returns True when the text is safe to post.
        '''
        text = text.strip().lower()

        if not text:
            self.log.info("Rejected (empty)")
            return False

        if self.wordfilter.blacklisted(text):
            self.log.info("Rejected (blacklisted)")
            return False

        if tbu.helpers.length(text) > 280:
            self.log.info("Rejected (too long)")
            return False

        for line in self.recently_tweeted:
            if text in line.strip().lower():
                self.log.info("Rejected (Identical)")
                return False

            # Near-duplicate check on letters/digits only.
            if Levenshtein.ratio(re.sub(r'\W+', '', text),
                                 re.sub(r'\W+', '',
                                        line.lower())) >= LEVENSHTEIN_LIMIT:
                self.log.info("Rejected (Levenshtein.ratio)")
                return False

        return True

    def reply_all(self, model=None, **kwargs):
        '''Reply to all mentions since the last time ``self.screen_name`` sent a reply tweet.'''
        mentions = self.api.mentions_timeline(since_id=self.api.last_reply)
        self.log.info('replying to all...')
        self.log.debug('mentions found: %d', len(mentions))

        if not self.dry_run:
            for status in mentions:
                self.reply(status, model, **kwargs)

    def reply(self, status, model=None, max_len=140, **kwargs):
        '''
        Compose a reply to the given ``tweepy.Status``.

        Args:
            status (tweepy.Status): status to reply to.
            model (str): name of model.
            max_len (int): maximum length of tweet (default: 140)
        '''
        self.log.debug('Replying to a mention')

        if status.user.screen_name == self.screen_name:
            self.log.debug('Not replying to self')
            return

        if self.wordfilter.blacklisted(status.text):
            self.log.debug(
                'Not replying to tweet with a blacklisted word (%d)',
                status.id)
            return

        # Reserve room for "@screen_name " at the front of the reply.
        text = self.compose(model,
                            max_len=max_len - 2 - len(status.user.screen_name),
                            **kwargs)
        reply = '@{} {}'.format(status.user.screen_name, text)

        self.log.info(reply)
        self._update(reply, in_reply=status.id_str)

    def tweet(self, model=None, **kwargs):
        '''
        Post a tweet composed by "model" (or the default model).
        Most of these arguments are passed on to Markovify.

        Args:
            model (str): one of self.models
            max_len (int): maximum length of the output (default: 140).
            init_state (tuple): tuple of words to seed the model
            tries (int): (default: 10)
            max_overlap_ratio (float): Used for testing output (default: 0.7).
            max_overlap_total (int): Used for testing output (default: 15)
        '''
        # BUG FIX: pass the model *name* straight through; compose() does
        # its own lookup.  The original resolved the model object here and
        # then compose()'s dict lookup on that object returned None,
        # crashing with AttributeError.
        text = self.compose(model, **kwargs)
        if text:
            self._update(text)

    def _update(self, tweet, in_reply=None):
        """Post `tweet` (optionally as a reply), unless this is a dry run."""
        if not self.dry_run:
            self.api.update_status(status=tweet,
                                   in_reply_to_status_id=in_reply)

    def compose(self, model=None, max_len=140, **kwargs):
        '''
        Returns a string generated from "model" (or the default model).
        Most of these arguments are passed on to Markovify.

        Args:
            model (str): one of self.models
            max_len (int): maximum length of the output (max: 280, default: 140).
            init_state (tuple): tuple of words to seed the model
            tries (int): (default: 10)
            max_overlap_ratio (float): Used for testing output (default: 0.7).
            max_overlap_total (int): Used for testing output (default: 15)

        Returns:
            str
        '''
        # Resolve the name once; keep it around for the retry below.
        model_name = model or self.default_model
        markov = self.models.get(model_name)
        max_len = min(280, max_len)
        self.log.debug('making sentence, max_len=%s, %s', max_len, kwargs)
        text = markov.make_short_sentence(max_len, **kwargs)

        if text is None:
            self.log.error('model failed to generate a sentence')
            raise RuntimeError('model failed to generate a sentence')

        # convert to unicode in Python 2
        if hasattr(text, 'decode'):
            text = text.decode('utf8')

        # BUG FIX: always run the blacklist/recent-tweet check; the
        # original only checked on the branch where no decode was needed.
        if not self.check_tweet(text):
            # BUG FIX: retry with the model *name* — the original recursed
            # with the resolved model object, which is not a key in
            # self.models.
            text = self.compose(model=model_name, max_len=max_len, **kwargs)

        self.log.debug('TwitterMarkov: %s', text)

        return text

    def learn_parent(self, corpus=None, parent=None):
        '''
        Add recent tweets from the parent account (since the last time ``self.screen_name`` tweeted)
        to the corpus. This is subject to the filters described in ``bots.yaml``.
        '''
        parent = parent or self.config.get('parent')
        corpus = corpus or self.corpora[0]

        if not parent or not self.api.last_tweet:
            self.log.debug('Cannot teach: missing parent or tweets')
            return

        tweets = self.api.user_timeline(parent, since_id=self.api.last_tweet)

        try:
            gen = checking.generator(
                tweets,
                no_mentions=self.config.get('filter_mentions'),
                no_hashtags=self.config.get('filter_hashtags'),
                no_urls=self.config.get('filter_urls'),
                no_media=self.config.get('filter_media'),
                no_symbols=self.config.get('filter_symbols'),
                no_badwords=self.config.get('filter_parent_badwords', True),
                no_retweets=self.config.get('no_retweets'),
                no_replies=self.config.get('no_replies'))

            self.log.debug('%s is learning', corpus)

            with open(corpus, 'a') as f:
                f.writelines(tweet + '\n' for tweet in gen)

        except IOError as e:
            self.log.error('Learning failed for %s', corpus)
            self.log.error(e)
Пример #13
0
    indexFile.write("0")
    indexFile.close()
    indexFile = open(indexDOTtxt, 'r')
    index = int(indexFile.read())
    indexFile.close()

## Open the text file containing our dictionary.
## We are taking the argument from the command line,
## but you can also hardcode your file here by following
## the above procedure for index.txt
## NOTE(review): could use `with open(...)` to guarantee the handle is
## closed even if readlines() raises.
wordFile = open(argFile, 'r')
words = wordFile.readlines()
wordFile.close()

## Set up Wordfilter. This is used to help us avoid auto-tweeting words that are not nice.
wordfilter = Wordfilter()
## I have a few extras that I specifically do not want my bot tweeting, so I add them to the filter here.
wordfilter.add_words(["rape", "rapist", "sex", "molest", "drug"])

##############################################################################
## Here we go.
## For each line in the words file, until we run out of lines, do some things:
for line in words:
    
    ## We check to see if the next word in queue is caught by Wordfilter...
    if wordfilter.blacklisted(str.upper(words[index].rstrip("\r\n"))):
        print("Yikes, " + str.upper(words[index].rstrip("\r\n")) + " might be problematic.\n We'll skip that one.")
        index = index + 1
    else: ## ...and if not, we continue on to tweet it.
        ## Print the word at the current index, make it UPPER CASE, and chomp() the trailing newline off of it.
        ## Remove the .upper method if that's not what you want
Пример #14
0
# Load the set of previously posted items so we never repeat one;
# start empty on the first run.
if os.path.exists(historyFilename):
    history = set(
        [s.strip() for s in codecs.open(historyFilename, 'r', 'utf-8')])
else:
    history = set()

# Log in to the Mastodon instance unless we're in test mode.
# NOTE(review): credentials are read from plain-text files next to the
# script — confirm this is acceptable for the deployment.
if not testing:
    mastodon = Mastodon(client_id='clientcred.txt', api_base_url=mastodonUrl)
    mastodon.log_in(open('email.txt').read().strip(),
                    open('password.txt').read().strip(),
                    scopes=['write'])

# Counter used to pace public (vs. unlisted) statuses.
publicStatusCycle = 0

# Word filter used to screen candidate posts.
wordfilter = Wordfilter()

#mashapeKey = open('mashapekey.txt').read().strip()


def domainrStatus(domain):
    """Query the Domainr status endpoint for `domain` and return the
    whitespace-split status summary of the first result."""
    query = {'mashape-key': mashapeKey, 'domain': domain}
    status_url = domainrEndpoint + 'status?' + urllib.urlencode(query)
    handle = urllib.urlopen(status_url)
    payload = json.load(handle)
    handle.close()
    return payload['status'][0]['status'].split()


while True:
    random.shuffle(words)
Пример #15
0
# Outputs two texts:
# changewords(text)[0] has suggestions for replacing words (human-directed).
# changewords(text)[1] has randomly replaced words (automatic).

# New addition: sentences are split with __ as delimiter for textprint (editable area)

from random import randint
from nltk.corpus import wordnet as wn
from nltk import word_tokenize as tok
from wordfilter import Wordfilter

wf = Wordfilter()  #https://github.com/dariusk/wordfilter
#Words that may be offensive or that have undesirable results in WordNet:
ignore = [
    'will', 'more', 'must', 'there', 'john', 'screw', 'queer', 'crap', 'shit',
    'ass', 'sex', 'f**k', 'f****r', 'm**********r', 'f***s', 'f****d',
    'f*****g'
]

# Load the 1000 most common words; strip the trailing newline from each.
with open('data/top1000.txt') as vocdoc:
    topwords = [w[:-1] for w in vocdoc.readlines()]


def changewords(text):
    """Returns two texts [T1, T2]: T1 text with certain words (in all caps) followed by
potential synonyms in parentheses, T2 text with randomly-chosen synonyms in all caps
that replace certain words."""
    i = 0
    text = text.split()
    #text = tok(text) - more accurate, but difficult to join below
    textprint = []  #Text will appear as so: she SHOUTED (shout out, call...
Пример #16
0
async def ytplay(requested_by, query, message):
    """Search YouTube for `query`, download the top hit's audio, post a
    generated cover image with a caption, and play it for the track's
    duration.

    Args:
        requested_by: Display name of the user who asked for the song.
        query: Free-text search string.
        message: Incoming chat message being replied to.

    Side effects: downloads and transcodes audio to disk, edits the bot's
    profile, and toggles the module-level `playing` flag.
    """
    global playing
    ydl_opts = {"format": "bestaudio"}
    #n = await send(f"__**Searching for {query} on YouTube.**__")
    m = await message.reply_text(
        f"__**Searching for {query} on YouTube.**__", quote=False
    )
    try:
        results = await arq.youtube(query)
        if not results.ok:
            await message.reply_text(results.result)
            return
        results = results.result
        # The first search hit supplies everything shown to the user.
        link = f"https://youtube.com{results[0].url_suffix}"
        title = results[0].title
        thumbnail = results[0].thumbnails[0]
        duration = results[0].duration
        views = results[0].views

        songname = title.lower()
        # NOTE(review): detect() presumably performs language detection on
        # the title (returning codes like "ko") — confirm which library
        # provides it.
        detecting = detect(songname)

        # Reject titles containing blocked terms.
        wordfilter = Wordfilter()
        wordfilter.addWords(['yamete', 'kudasai', 'arigato', 'hentai'])
        if wordfilter.blacklisted(songname):
           await m.edit(f"__**Shame on you ! {requested_by}\nNot allowed song !!!**__\n@wuminjun block him!\n{songname}")
           playing = False
           return
        if detecting == "ko":
           await m.edit(f"__**Not allowed Language !!!**__ {songname}")
           playing = False
           return

        # Cap track length at one hour.
        if time_to_seconds(duration) >= 3600:
            await m.edit("__**Bruh! Only songs within 60 Mins.**__")
            playing = False
            return
    except Exception as e:
        await m.edit("__**Found No Song Matching Your Query.**__")
        playing = False
        print(str(e))
        return
    await m.edit("__**Processing Thumbnail.**__")
    await app.update_profile(first_name=f"🔉{title[:35]} ",bio = f"__{title[:35]}__ ijro etilmoqda")
    await generate_cover(requested_by, title, views, duration, thumbnail)
    await m.edit("__**Downloading Music.**__")
    with youtube_dl.YoutubeDL(ydl_opts) as ydl:
        info_dict = ydl.extract_info(link, download=False)
        audio_file = ydl.prepare_filename(info_dict)
        ydl.process_info(info_dict)
    await m.edit("__**Transcoding.**__")
    os.rename(audio_file, "audio.webm")
    transcode("audio.webm")
    await m.delete()

    caption = f"🏷 **Name:** [{title}]({link})\n⏳ **Duration:** {duration}\n" \
               + f"🎧 **Requested By:** {requested_by}\n📡 **Platform:** YouTube"
    m = await message.reply_photo(
        photo="final.png",
        caption=caption,
    )
    msg_id = m.message_id
    # Mirror the announcement into the sudo channel for other chats.
    if message.chat.username != "music_streaming_channel":
       copy = await app.copy_message(SUDO_CHANNEL, message.chat.username, msg_id)

    await app.set_profile_photo(photo="final.png")
    #await app.pin_chat_message(SUDO_CHAT_ID, msg_id, disable_notification=True)

    os.remove("final.png")
    await asyncio.sleep(int(time_to_seconds(duration)))
    await m.delete()
    # NOTE(review): `copy` is only bound inside the if-branch above; when
    # the message originates from "music_streaming_channel" this line
    # raises NameError — confirm and guard.
    await copy.delete()

    photos = await app.get_profile_photos("me")
    await app.delete_profile_photos([p.file_id for p in photos[1:]])
    playing = False
Пример #17
0
class BibleMungingServer(object):
    """CherryPy web application serving munged Bible search results.

    Args:
        bible: biblemunger.Bible instance to run search/replace against.
        favdict: mapping of favorite search term -> replacement.
        apptitle: application title shown on pages.
        appsubtitle: application subtitle.
        dbpath: path to the sqlite database holding recent searches.
        wordfilter: when True, filter offensive replacements out of the
            recent-searches list.
    """

    def __init__(
            self,
            bible: biblemunger.Bible,
            favdict,  #: list[dict],
            apptitle: str,
            appsubtitle: str,
            dbpath: str,
            wordfilter: bool):

        self.bible = bible
        self.apptitle = apptitle
        self.appsubtitle = appsubtitle
        self.dbpath = dbpath

        if wordfilter:
            from wordfilter import Wordfilter
            self.wordfilter = Wordfilter()
            # Marker word used by the test suite to verify filtering.
            self.wordfilter.add_words(['QwertyStringUsedForTestingZxcvb'])
        else:
            self.wordfilter = False

        # Show deployment info when present; otherwise mark as development.
        deploymentinfofile = os.path.join(scriptdir, 'deploymentinfo.txt')
        if os.path.exists(deploymentinfofile):
            with open(deploymentinfofile) as df:
                self.deploymentinfo = df.read()
        else:
            self.deploymentinfo = "development version"

        # TODO: refactor this, just use a dictionary directly elsewhere
        self.favorite_searches = [
            {'search': key, 'replace': favdict[key]} for key in favdict.keys()
        ]

        # BUG FIX: close the connection used for the schema check; the
        # original leaked it.
        conn = sqlite3.connect(self.dbpath)
        try:
            c = conn.cursor()
            c.execute(
                "select name from sqlite_master where type='table' and name='recent_searches'"
            )
            table_exists = c.fetchone()
        finally:
            conn.close()
        if not table_exists:
            self.initialize_database()

    @classmethod
    def fromconfig(cls, configuration: configparser.ConfigParser):
        """Build a server from a ConfigParser instance."""
        # Idiom fix: construct via cls rather than hard-coding the class
        # name, so subclasses inherit this alternate constructor.
        return cls(
            biblemunger.Bible(configuration.get('biblemunger', 'bible')),
            configuration['favorites'],
            configuration.get('biblemunger', 'apptitle'),
            configuration.get('biblemunger', 'appsubtitle'),
            configuration.get('bmweb', 'dbpath'),
            configuration.getboolean('bmweb', 'wordfilter'))

    def search_in_list(self, searchlist, search, replace):
        """Return True if a dict with this search/replace pair is present.

        The original used a redundant for/else; any() states it directly.
        """
        return any(
            s['search'] == search and s['replace'] == replace
            for s in searchlist)

    @property
    def recent_searches(self):
        """Return recent search/replace pairs from the database as dicts."""
        conn = sqlite3.connect(self.dbpath)
        c = conn.cursor()
        c.execute("select search, replace from recent_searches")
        results = c.fetchall()
        conn.close()
        return [{'search': r[0], 'replace': r[1]} for r in results]

    def initialize_database(self):
        """Create the recent_searches table."""
        conn = sqlite3.connect(self.dbpath)
        c = conn.cursor()
        c.execute('''create table recent_searches (search, replace)''')
        conn.commit()
        conn.close()

    def add_recent_search(self, search, replace):
        """Record a search/replace pair unless it is a favorite, already
        recorded, or the replacement is blacklisted."""
        in_faves = self.search_in_list(self.favorite_searches, search, replace)
        in_recent = self.search_in_list(self.recent_searches, search, replace)

        if self.wordfilter:
            filtered = self.wordfilter.blacklisted(replace)
        else:
            filtered = False

        if in_faves or in_recent or filtered:
            return

        conn = sqlite3.connect(self.dbpath)
        c = conn.cursor()
        c.execute("insert into recent_searches values (?, ?)",
                  (search, replace))
        conn.commit()
        conn.close()

    @cherrypy.expose
    @cherrypy.tools.mako(filename='index.mako')
    def index(self, search=None, replace=None):
        """Render the main page, optionally running a search/replace query
        and recording it in the recent-searches table."""
        pagetitle = self.apptitle
        queried = False
        resultstitle = None
        results = None

        if search and replace:
            resultstitle = "{} ⇒ {}".format(search, replace)
            pagetitle = "{}: {}".format(self.apptitle, resultstitle)
            queried = True
            results = self.bible.replace(search, replace)
            if results:
                self.add_recent_search(search, replace)

        return {
            'pagetitle': pagetitle,
            'apptitle': self.apptitle,
            'appsubtitle': self.appsubtitle,
            'queried': queried,
            'resultstitle': resultstitle,
            'results': results,
            'favorites': self.favorite_searches,
            'recents': self.recent_searches,
            'search': search,
            'replace': replace,
            'deploymentinfo': self.deploymentinfo,
            'filterinuse': bool(self.wordfilter)
        }
Пример #18
0
def loadlines(filename='poetry.json-stream.gz',
              startidx=0,
              count=None,
              modulo=1):
    """Yields successive dictionaries from my Gutenberg Poetry corpus gzip.

    Lines are returned as dictionaries with keys for the Gutenberg ID of the
    text containing the line of poetry and the line itself. Optional startidx
    and count parameters allow you to load only a subset of lines (starting at
    one index and collecting until the count is reached); a modulo parameter,
    if specified, will only yield the line if its index is divisible by the
    modulo. (This is a simple proxy for getting a "sampling" of lines.)

    >>> for line in loadlines(startidx=100, count=5):
    ...     print(line['line'])
    By the alders in the Summer,
    By the white fog in the Autumn,
    By the black line in the Winter;
    And beside them dwelt the singer,
    In the green and silent valley.

    >>> for line in loadlines(modulo=250000):
    ...     print(line['gutenberg_id'])
    617
    6130
    9567
    10161
    12137
    13561
    16209
    18466
    20174
    22692
    25599
    28621
    30720
    36508

    """
    wordfilter = Wordfilter()
    seen = set()
    # NOTE(review): '>' admits indices startidx..startidx+count inclusive
    # (count+1 candidates before filtering) — preserved from the original,
    # since the doctests above depend on the exact boundary.
    stop_after = None if count is None else startidx + count

    for idx, raw in enumerate(gzip.open(filename, mode='rt')):
        if idx < startidx:
            continue
        if stop_after is not None and idx > stop_after:
            break
        if idx % modulo != 0:
            continue

        # load the data and decode
        record = json.loads(raw)
        poem_line = record['line']

        if wordfilter.blacklisted(poem_line):
            continue

        # disqualifying characteristics (looks like a title, has brackets,
        # starts with a digit)
        if isprobablytitle(poem_line):
            continue
        if '[' in poem_line or ']' in poem_line:
            continue
        if re.search(r"^\d", poem_line):
            continue

        # parse into lowercased words
        words = tuple(w.lower() for w in tokens(poem_line))

        # no short lines, as they're not very interesting
        if len(words) <= 2:
            continue

        # skip if we've already seen something like this
        if words in seen:
            continue
        seen.add(words)

        yield record
Пример #19
0
class TwitterMarkov(object):
    """
    Posts markov-generated text to twitter

    Args:
        screen_name (str): Twitter user account
        corpus (str): Text file to read to generate text.
        api (:ref:`tweepy.API <tweepy:tweepy.api>`): API to use to post tweets.
        dry_run (boolean): If set, TwitterMarkov won't actually post tweets.
        blacklist (Sequence): A list of words to avoid generating.
    """

    # Key into self.models used when no model name is given.
    default_model = None
    _recently_tweeted = []

    def __init__(self, screen_name, corpus=None, **kwargs):
        if 'api' in kwargs:
            self.api = kwargs.pop('api')
        else:
            self.api = tbu.API(screen_name=screen_name, **kwargs)

        # Prefer the API's own logger when it exposes one.
        try:
            self.log = self.api.logger
        except AttributeError:
            self.log = logging.getLogger(screen_name)

        self.screen_name = screen_name
        self.config = self.api.config
        self.dry_run = kwargs.pop('dry_run', False)

        self.log.debug('screen name: %s', screen_name)
        self.log.debug("dry run: %s", self.dry_run)

        try:
            corpus = corpus or self.config.get('corpus')

            # Accept a single path/file or an iterable of them.
            if isinstance(corpus, six.string_types):
                corpora = [corpus]

            elif isinstance(corpus, Iterable):
                corpora = corpus

            else:
                raise RuntimeError('Unable to find any corpora!')

            self.corpora = [b for b in corpora if b is not None]

            state_size = kwargs.get('state_size', self.config.get('state_size'))

            self.models = self._setup_models(self.corpora, state_size)

        except RuntimeError as e:
            self.log.error(e)
            raise e

        self.log.debug('models: %s', list(self.models.keys()))

        blacklist = kwargs.get('blacklist') or self.config.get('blacklist', [])
        self.wordfilter = Wordfilter()
        self.wordfilter.add_words(blacklist)

        self.log.debug('blacklist: %s terms', len(self.wordfilter.blacklist))

        if kwargs.get('learn', True):
            self.log.debug('learning...')
            self.learn_parent()

    def _setup_models(self, corpora, state_size):
        """
        Given a list of paths to corpus text files or file-like objects,
        set up markovify models for each.
        These models are returned in a dict, (with the basename as key).
        """
        out = dict()
        state_size = state_size or 2
        self.log.debug('setting up models (state_size=%s)', state_size)

        try:
            for pth in corpora:
                if isinstance(pth, six.string_types):
                    corpus_path = os.path.expanduser(pth)
                    name = os.path.basename(corpus_path)
                    m = open(corpus_path)

                else:
                    # A file-like object: key it by its name when available.
                    m = pth
                    try:
                        name = m.name
                    except AttributeError:
                        name = repr(m)

                try:
                    out[name] = markovify.text.NewlineText(m.read(), state_size=state_size)

                finally:
                    m.close()

        except AttributeError as e:
            self.log.error(e)
            self.log.error("Probably couldn't find the model file.")
            raise e

        except IOError as e:
            self.log.error(e)
            self.log.error('Error reading %s', corpus_path)
            raise e

        self.default_model = os.path.basename(corpora[0])

        return out

    @property
    def recently_tweeted(self):
        '''Returns recent tweets from ``self.screen_name`` (cached after first use).'''
        if not self._recently_tweeted:
            recent_tweets = self.api.user_timeline(self.screen_name, count=self.config.get('checkback', 20))
            self._recently_tweeted = [x.text for x in recent_tweets]

        return self._recently_tweeted

    def check_tweet(self, text):
        '''Check if a string contains blacklisted words or is similar to a recent tweet.'''
        text = text.strip().lower()

        if not text:
            self.log.info("Rejected (empty)")
            return False

        if self.wordfilter.blacklisted(text):
            self.log.info("Rejected (blacklisted)")
            return False

        if tbu.helpers.length(text) > 280:
            self.log.info("Rejected (too long)")
            return False

        for line in self.recently_tweeted:
            if text in line.strip().lower():
                self.log.info("Rejected (Identical)")
                return False

            # Compare alphanumeric skeletons to catch near-duplicates.
            if Levenshtein.ratio(re.sub(r'\W+', '', text), re.sub(r'\W+', '', line.lower())) >= LEVENSHTEIN_LIMIT:
                self.log.info("Rejected (Levenshtein.ratio)")
                return False

        return True

    def reply_all(self, model=None, **kwargs):
        '''Reply to all mentions since the last time ``self.screen_name`` sent a reply tweet.'''
        mentions = self.api.mentions_timeline(since_id=self.api.last_reply)
        self.log.info('replying to all...')
        self.log.debug('mentions found: %d', len(mentions))

        if not self.dry_run:
            for status in mentions:
                self.reply(status, model, **kwargs)

    def reply(self, status, model=None, max_len=140, **kwargs):
        '''
        Compose a reply to the given ``tweepy.Status``.

        Args:
            status (tweepy.Status): status to reply to.
            model (str): name of model.
            max_len (int): maximum length of tweet (default: 140)
        '''
        self.log.debug('Replying to a mention')

        if status.user.screen_name == self.screen_name:
            self.log.debug('Not replying to self')
            return

        if self.wordfilter.blacklisted(status.text):
            self.log.debug('Not replying to tweet with a blacklisted word (%d)', status.id)
            return

        # Reserve room for "@username " in the reply.
        text = self.compose(model, max_len=max_len - 2 - len(status.user.screen_name), **kwargs)
        reply = '@{} {}'.format(status.user.screen_name, text)

        self.log.info(reply)
        self._update(reply, in_reply=status.id_str)

    def tweet(self, model=None, **kwargs):
        '''
        Post a tweet composed by "model" (or the default model).
        Most of these arguments are passed on to Markovify.

        Args:
            model (str): one of self.models
            max_len (int): maximum length of the output (default: 140).
            init_state (tuple): tuple of words to seed the model
            tries (int): (default: 10)
            max_overlap_ratio (float): Used for testing output (default: 0.7).
            max_overlap_total (int): Used for testing output (default: 15)

        Raises:
            KeyError: if ``model`` is not a known model name.
        '''
        name = model or self.default_model
        if name not in self.models:
            raise KeyError(name)

        # BUG FIX: previously the looked-up model *object* was passed to
        # compose(), which then failed to resolve it via self.models.get().
        # compose() expects the model *name*.
        text = self.compose(name, **kwargs)
        if text:
            self._update(text)

    def _update(self, tweet, in_reply=None):
        # Honor dry_run: skip the actual API call.
        if not self.dry_run:
            self.api.update_status(status=tweet, in_reply_to_status_id=in_reply)

    def compose(self, model=None, max_len=140, **kwargs):
        '''
        Returns a string generated from "model" (or the default model).
        Most of these arguments are passed on to Markovify.

        Args:
            model (str): one of self.models
            max_len (int): maximum length of the output (max: 280, default: 140).
            init_state (tuple): tuple of words to seed the model
            tries (int): (default: 10)
            max_overlap_ratio (float): Used for testing output (default: 0.7).
            max_overlap_total (int): Used for testing output (default: 15)

        Returns:
            str
        '''
        name = model or self.default_model
        chain = self.models.get(name)
        max_len = min(280, max_len)
        self.log.debug('making sentence, max_len=%s, %s', max_len, kwargs)
        text = chain.make_short_sentence(max_len, **kwargs)

        if text is None:
            self.log.error('model failed to generate a sentence')
            raise RuntimeError('model failed to generate a sentence')

        # convert to unicode in Python 2
        if hasattr(text, 'decode'):
            text = text.decode('utf8')

        # Check the candidate against the blacklist and recent tweets,
        # recomposing until one passes.
        # BUG FIX: this check previously ran only on the non-decode branch
        # (so the Python 2 path skipped filtering entirely), and the
        # recursive call passed the model *object* instead of its name,
        # which self.models.get() could not resolve.
        if not self.check_tweet(text):
            text = self.compose(model=name, max_len=max_len, **kwargs)

        self.log.debug('TwitterMarkov: %s', text)

        return text

    def learn_parent(self, corpus=None, parent=None):
        '''
        Add recent tweets from the parent account (since the last time ``self.screen_name`` tweeted)
        to the corpus. This is subject to the filters described in ``bots.yaml``.
        '''
        parent = parent or self.config.get('parent')
        corpus = corpus or self.corpora[0]

        if not parent or not self.api.last_tweet:
            self.log.debug('Cannot teach: missing parent or tweets')
            return

        tweets = self.api.user_timeline(parent, since_id=self.api.last_tweet)

        try:
            gen = checking.generator(tweets,
                                     no_mentions=self.config.get('filter_mentions'),
                                     no_hashtags=self.config.get('filter_hashtags'),
                                     no_urls=self.config.get('filter_urls'),
                                     no_media=self.config.get('filter_media'),
                                     no_symbols=self.config.get('filter_symbols'),
                                     no_badwords=self.config.get('filter_parent_badwords', True),
                                     no_retweets=self.config.get('no_retweets'),
                                     no_replies=self.config.get('no_replies')
                                    )

            self.log.debug('%s is learning', corpus)

            with open(corpus, 'a') as f:
                f.writelines(tweet + '\n' for tweet in gen)

        except IOError as e:
            self.log.error('Learning failed for %s', corpus)
            self.log.error(e)
Пример #20
0
from __future__ import unicode_literals, absolute_import, print_function

from wordfilter import Wordfilter
import json
import re
import requests
import time
from io import BytesIO

MEDIAWIKI_API = "https://commons.wikimedia.org/w/api.php"
CVAPI = "https://api.projectoxford.ai/vision/v1.0/analyze"

HEADERS = {"User-Agent":  "picdescbot, http://github.com/elad661/picdescbot"}

supported_formats = re.compile('\.(png|jpe?g|gif)$', re.I)
word_filter = Wordfilter()

# I really don't want the bot to show this kind of imagery!
word_filter.add_words(['nazi', 'hitler'])

# Blacklist some categories, just in case. These are matched on a substring
# basis, against the page's categories and the titles of the wikipages using
# the picture.
category_blacklist = ['september 11', 'hitler', 'nazi', 'antisemit', 'libel',
                      'apartheid', 'racism', 'lynching', 'cartoons',
                      'holocaust', 'stereotypes', 'flags', 'p**n',
                      'homophobia', 'transpobia', 'logos']

# Gender neutralization helps prevent accidental transphobic juxtapositions
# which can occur when CVAPI uses gendered words in the description, but their
# gender detection is wrong. Computers shouldn't try to detect gender, and
Пример #21
0
# Moderation timers, in minutes (used with timedelta below).
MUTE_TIME = 14
COOLDOWN = 2
# Discord guild and channel snowflake IDs.
BAND_SERVER = 743519350501277716
TEST_SERVER = 746851271901708428
MESSAGES_CHANNEL = 784197374959943731

# External service endpoints supplied by the local config module.
weatherUrl = config.weatherUrl
forecastUrl = config.forecastUrl
mtUrl = config.mtUrl
# strftime format, e.g. "Monday 05:30PM"
timeFormat = "%A %I:%M%p"
intents = discord.Intents.default()
intents.members = True
intents.reactions = True
client = commands.Bot(command_prefix='!', intents=intents, help_command=None)
client.agreeCounter = 0
# Start from an empty blacklist (clearList) so only the words configured in
# config.banned_words are filtered, not wordfilter's defaults.
wordfilter = Wordfilter()
wordfilter.clearList()
wordfilter.addWords(config.banned_words)
# Seed the rate-limit timestamps in the past so the first trigger after
# startup is never throttled.
client.last_response_time = datetime.now() - timedelta(minutes=COOLDOWN + 1)
client.mutedTime = datetime.now() - timedelta(minutes=MUTE_TIME + 1)
client.prev_dm_user = None


class GameDay:
    """Lightweight record of a single game: the opponent and the date."""

    def __init__(self, opponent, date):
        self.opponent, self.date = opponent, date


gamedays = {
    1: GameDay('University of Northern Iowa', datetime(2021, 9, 4)),
Пример #22
0
class TwitterMarkov(object):

    """Posts markov-generated text to twitter.

    Args:
        screen_name: User name to post as.
        corpus: Text file (or iterable of files) to read to generate text.
        api: tweepy.API object (keyword); built from screen_name otherwise.
        dry_run (bool): If set, TwitterMarkov won't actually post tweets.
    """

    # Key into self.models used when no model name is given.
    default_model = None
    _recently_tweeted = []
    # ID of this account's most recent tweet (set in __init__).
    last_tweet = None

    def __init__(self, screen_name, corpus=None, **kwargs):
        if 'api' in kwargs:
            self.api = kwargs.pop('api')
        else:
            self.api = tbu.API(screen_name=screen_name, **kwargs)

        # Prefer the API's own logger when it exposes one.
        try:
            self.log = self.api.logger
        except AttributeError:
            self.log = logging.getLogger(screen_name)

        self.screen_name = screen_name
        self.config = self.api.config

        self.dry_run = kwargs.pop('dry_run', False)

        try:
            corpus = corpus or self.config.get('corpus')

            # Accept a single path or an iterable of paths.
            if isinstance(corpus, basestring):
                corpora = [corpus]

            elif isinstance(corpus, Iterable):
                corpora = corpus

            else:
                raise RuntimeError('Unable to find any corpora!')

            self.corpora = [b for b in corpora if b is not None]

            self.log.debug('%s, %s', screen_name, self.corpora)

            state_size = kwargs.get('state_size', self.config.get('state_size'))

            self.models = self._setup_models(self.corpora, state_size)

        except RuntimeError as e:
            self.log.error(e)
            raise e

        self.log.debug('models: %s', list(self.models.keys()))

        blacklist = kwargs.get('blacklist') or self.config.get('blacklist', [])
        self.wordfilter = Wordfilter()
        self.wordfilter.add_words(blacklist)

        # Remember our newest tweet so learners only fetch newer statuses.
        self.last_tweet = (self.api.user_timeline(count=1))[0]
        self.last_tweet = self.last_tweet.id

        if kwargs.get('learn', True):
            self.learn_parent()

    def _setup_models(self, corpora, state_size):
        """
        Given a list of paths to corpus text files, set up markovify models for each.
        These models are returned in a dict, (with the basename as key).
        """
        self.log.debug('setting up models')
        out = dict()

        state_size = state_size or 3

        try:
            for pth in corpora:
                corpus_path = os.path.expanduser(pth)
                name = os.path.basename(corpus_path)

                with open(corpus_path) as m:
                    out[name] = markovify.text.NewlineText(m.read(), state_size=state_size)

        except AttributeError as e:
            self.log.error(e)
            self.log.error("Probably couldn't find the model file.")
            raise e

        except IOError as e:
            self.log.error(e)
            self.log.error('Error reading %s', corpus_path)
            raise e

        self.default_model = os.path.basename(corpora[0])

        return out

    @property
    def recently_tweeted(self):
        '''Texts of this account's recent tweets (cached after first fetch).'''
        if not self._recently_tweeted:
            recent_tweets = self.api.user_timeline()
            self._recently_tweeted = [x.text for x in recent_tweets]

        return self._recently_tweeted

    def check_tweet(self, text):
        '''Return False for empty, blacklisted, or recently-duplicated text.'''
        text = text.strip().lower()

        if not text:
            self.log.info("Rejected (empty)")
            return False

        if self.wordfilter.blacklisted(text):
            self.log.info("Rejected (blacklisted)")
            self.log.info(text)
            return False

        for line in self.recently_tweeted:
            if text in line.strip().lower():
                self.log.info("Rejected (Identical)")
                return False

            # Compare alphanumeric skeletons to catch near-duplicates.
            if Levenshtein.ratio(re.sub(r'\W+', '', text), re.sub(r'\W+', '', line.lower())) >= LEVENSHTEIN_LIMIT:
                self.log.info("Rejected (Levenshtein.ratio)")
                return False

        return True

    def reply_all(self, model=None, **kwargs):
        '''Reply to every mention received since our last reply.'''
        mentions = self.api.mentions_timeline(since_id=self.api.last_reply)
        # BUG FIX: message previously read '%replying to all...' (stray '%').
        self.log.info('replying to all...')
        self.log.debug('%s mentions found', len(mentions))

        for status in mentions:
            self.reply(status, model, **kwargs)

    def reply(self, status, model=None, **kwargs):
        '''Compose and post a reply to the given tweepy.Status.'''
        self.log.debug('Replying to a mention')

        if status.user.screen_name == self.screen_name:
            self.log.debug('Not replying to self')
            return

        # Reserve room for "@username " in the reply.
        text = self.compose(model, max_len=138 - len(status.user.screen_name), **kwargs)

        reply = '@' + status.user.screen_name + ' ' + text

        self.log.info(reply)
        self._update(reply, in_reply=status.id_str)

    def tweet(self, model=None, **kwargs):
        '''Compose a tweet from "model" (or the default model) and post it.'''
        text = self.compose(model, **kwargs)

        self.log.info(text)
        self._update(text)

    def _update(self, tweet, in_reply=None):
        # Honor dry_run: skip the actual API call.
        if not self.dry_run:
            self.api.update_status(status=tweet, in_reply_to_status_id=in_reply)

    def compose(self, model=None, max_len=None, **kwargs):
        '''Stitch model sentences together into tweet text up to max_len.

        Args:
            model (str): one of self.models (default: self.default_model).
            max_len (int): maximum length (default: config 'tweet_size',
                capped at 140).
        '''
        # BUG FIX: fall back to 140 when neither the caller nor the config
        # supply a length; min(140, None) raised TypeError on Python 3 and
        # broke the length arithmetic below on Python 2.
        max_len = min(140, max_len or self.config.get('tweet_size') or 140)
        chain = self.models[model or self.default_model]

        eols = '.!?'
        text = ''

        while True:
            sent = chain.make_sentence(**kwargs)

            if not sent:
                continue

            # convert to unicode in Python 2
            if hasattr(sent, 'decode'):
                sent = sent.decode('utf8')

            # Add eol delimiter if one is missing (and the sentence doesn't
            # already end with a quote following a delimiter).
            if sent[-1] not in eols and (sent[-2] not in eols and sent[-1] not in u'"\'’”〞❞'):
                sent = sent + choice('?..!!!')

            if len(text) + len(sent) < max_len - 1:
                text = (text + ' ' + sent).strip()

            else:
                # Candidate is full: keep it if it passes the blacklist and
                # recent-tweet checks, otherwise start over.
                if self.check_tweet(text):
                    break
                text = ''

        self.log.debug('TwitterMarkov: %s', text)

        return text

    def _checked_tweets(self, tweets):
        '''Run statuses through the configured content filters; yields texts.'''
        return checking.generator(tweets,
                                  no_mentions=self.config.get('filter_mentions'),
                                  no_hashtags=self.config.get('filter_hashtags'),
                                  no_urls=self.config.get('filter_urls'),
                                  no_media=self.config.get('filter_media'),
                                  no_symbols=self.config.get('filter_symbols'),
                                  no_badwords=self.config.get('filter_parent_badwords', True),
                                  no_retweets=self.config.get('no_retweets'),
                                  no_replies=self.config.get('no_replies')
                                 )

    def _append_normalized(self, tweets, corpus):
        '''ASCII-normalize filtered tweet texts and append them to corpus.'''
        try:
            gen = self._checked_tweets(tweets)

            self.log.debug('%s is learning', corpus)

            with open(corpus, 'a') as f:
                for tweet in gen:
                    try:
                        utweet = unicodedata.normalize('NFKD', tweet).encode('ascii', 'ignore')
                        utweet = re.sub(r'\s+', ' ', utweet)
                        f.write(utweet + '\n')
                    except UnicodeEncodeError:
                        self.log.error(tweet)

        except IOError as e:
            self.log.error('Learning failed for %s', corpus)
            self.log.error(e)

    def learn_peer(self, corpus=None, peer=None):
        '''Add recent tweets from peers to corpus'''
        peer = peer or self.config.get('peer')
        corpus = corpus or self.corpora[0]

        if not peer:
            self.log.debug('Cannot teach: missing parent or tweets')
            return

        tweets = self.api.home_timeline(count=150, since_id=self.last_tweet)
        self._append_normalized(tweets, corpus)

    def learn_search(self, search=None, corpus=None):
        '''Add recent tweets from search to corpus'''
        search = search or self.config.get('search')
        corpus = corpus or self.corpora[0]

        if not search:
            self.log.debug('Cannot teach: missing search or tweets')
            return

        tweets = self.api.search(search, since_id=self.last_tweet)
        self._append_normalized(tweets, corpus)

    def learn_parent(self, corpus=None, parent=None):
        '''Add recent tweets from @parent to corpus'''
        parent = parent or self.config.get('parent')
        corpus = corpus or self.corpora[0]

        # BUG FIX: previously tested the unqualified name `last_tweet`,
        # which raised NameError; the attribute is self.last_tweet.
        if not parent or not self.last_tweet:
            self.log.debug('Cannot teach: missing parent or tweets')
            return

        tweets = self.api.user_timeline(parent, since_id=self.last_tweet)

        try:
            gen = self._checked_tweets(tweets)

            self.log.debug('%s is learning', corpus)

            with open(corpus, 'a') as f:
                f.writelines(tweet + '\n' for tweet in gen)

        except IOError as e:
            self.log.error('Learning failed for %s', corpus)
            self.log.error(e)
Пример #23
0
import spacy
import random
import annoy
import string
from wordfilter import Wordfilter
from itertools import islice

from spacy.lang.en.stop_words import STOP_WORDS as stop_words
# Module-level profanity filter shared by the text-generation helpers below.
wf = Wordfilter()


def prepare_nlp():
    nlp = spacy.load('en_core_web_md') # or en_core_web_md
    qualified = [item for item in nlp.vocab if item.has_vector and item.is_alpha]

    lexmap = []
    t = annoy.AnnoyIndex(300)
    for i, item in enumerate(islice(sorted(qualified, key=lambda x: x.prob, reverse=True), 100000)):
        t.add_item(i, item.vector)
        lexmap.append(item)
    t.build(25)

    p = annoy.AnnoyIndex(50)
    phonmap = []
    phonlookup = {}

    for i, line in enumerate(open("./cmudict-0.7b-simvecs")):
        word, vec_raw = line.split("  ")
        word = word.lower().rstrip("(0123)")
        vec = [float(v) for v in vec_raw.split()]
        p.add_item(i, vec)
Пример #24
0
class Twitter_markov(object):

    """Posts markov-generated text to twitter"""

    default_brain = None
    _recently_tweeted = []

    def __init__(self, screen_name, brains=None, **kwargs):
        '''
        :screen_name: Twitter screen name to post as.
        :brains: path or list of paths to ``.brain`` files; falls back to the
                 ``brain``/``brains`` config keys.
        :api: (keyword) pre-built API object; one is created if omitted.
        :config: (keyword) configuration mapping; defaults to ``api.config``.
        :dry_run: (keyword) if True, compose tweets but never post them.
        :learn: (keyword) if True (default), learn from the parent account.
        '''
        self.screen_name = screen_name

        # Construct the API only when one wasn't supplied. The previous
        # kwargs.get('api', tbu.api.API(...)) form evaluated its default
        # eagerly, building a redundant API (and its network side effects)
        # even when a caller passed one in.
        if 'api' in kwargs:
            self.api = kwargs['api']
        else:
            self.api = tbu.api.API(screen_name, **kwargs)

        self.config = kwargs.get('config', self.api.config)

        self.logger = logging.getLogger(screen_name)

        try:
            if isinstance(brains, str):
                brains = [brains]

            # No explicit brains given: fall back to config ('brain' + 'brains').
            if not isinstance(brains, list):
                brain = self.config.get('brain', [])
                brains = brain + self.config.get('brains', [])

            if not brains:
                raise RuntimeError

            self.brains = self._setup_brains(brains)

        except (IOError, IndexError, RuntimeError) as e:
            self.logger.error('Feed me brains: unable to find any brains!')
            raise e

        self.logger.debug('Brains: {0}'.format(list(self.brains.keys())))

        self.dry_run = kwargs.get('dry_run', False)

        # Word blacklist used by check_tweet().
        self.wordfilter = Wordfilter()
        self.wordfilter.add_words(self.config.get('blacklist', []))

        # Pre-built predicate rejecting retweets/replies according to config.
        self.checker = checking.construct_tweet_checker(
            no_retweets=self.config.get('no_retweets'),
            no_replies=self.config.get('no_replies')
        )

        if kwargs.get('learn', True):
            self.learn_parent()

    def _setup_brains(self, brains):
        """Load each brain file into a dict keyed by its base name.

        Records the first entry as the default brain. A missing file raises
        IOError; AttributeError/IOError are logged before being re-raised.
        """
        self.logger.debug('setting up brains')
        loaded = dict()

        try:
            for path in brains:
                expanded = os.path.expanduser(path)
                key = os.path.basename(expanded).replace('.brain', '')

                if not os.path.exists(expanded):
                    raise IOError("Brain file '{0}' missing".format(expanded))

                brain = Brain(expanded)
                brain.scorer.add_scorer(2.0, scoring.LengthScorer())
                loaded[key] = brain

        except AttributeError as e:
            self.logger.error(e)
            self.logger.error("Probably couldn't find the brain file.")
            raise e

        except IOError as e:
            self.logger.error(e)
            self.logger.error(brains)
            raise e

        # The first brain listed becomes the default for compose()/tweet().
        self.default_brain = os.path.basename(brains[0]).replace('.brain', '')

        return loaded

    @property
    def recently_tweeted(self):
        if len(self._recently_tweeted) == 0:
            recent_tweets = self.api.user_timeline(self.screen_name, count=self.config.get('checkback', 20))
            self._recently_tweeted = [x.text for x in recent_tweets]

        return self._recently_tweeted

    def check_tweet(self, text):
        text = text.strip().lower()

        if len(text) == 0:
            self.logger.info("Rejected (empty)")
            return False

        if not self.checker(text):
            self.logger.info("Rejected (retweet or reply)")
            return False

        if self.wordfilter.blacklisted(text):
            self.logger.info("Rejected (blacklisted)")
            return False

        for line in self.recently_tweeted:
            if text in line.strip().lower():
                self.logger.info("Rejected (Identical)")
                return False

            if Levenshtein.ratio(re.sub(r'\W+', '', text), re.sub(r'\W+', '', line.lower())) >= 0.70:
                self.logger.info("Rejected (Levenshtein.ratio)")
                return False

        return True

    def reply_all(self, brainname=None):
        mentions = self.api.mentions_timeline(since_id=self.api.last_reply)
        self.logger.debug('{0} mentions found'.format(len(mentions)))

        for status in mentions:
            self.reply(status, brainname)

    def reply(self, status, brainname=None):
        self.logger.debug('Replying to a mention')

        if status.user.screen_name == self.screen_name:
            self.logger.debug('Not replying to self')
            return

        catalyst = tbu.helpers.format_status(status)
        text = self.compose(catalyst, brainname, max_len=138 - len(status.user.screen_name))

        reply = u'@' + status.user.screen_name + ' ' + text

        self.logger.info(reply)
        self._update(reply, in_reply=status.id_str)

    def tweet(self, catalyst='', brainname=None):
        self.logger.debug('tweeting')

        text = self.compose(catalyst, brainname)

        self.logger.info(text)
        self._update(text)

    def _update(self, tweet, in_reply=None):
        if not self.dry_run:
            self.api.update_status(status=tweet, in_reply_to_status_id=in_reply)

    def compose(self, catalyst='', brainname=None, max_len=140):
        '''Format a tweet with a reply from brainname'''

        max_len = min(140, max_len)

        brainname = brainname or self.default_brain
        brain = self.brains[brainname]

        reply = brain.reply(catalyst, max_len=max_len)

        self.logger.debug(u'input> ' + catalyst)
        self.logger.debug(u'reply> ' + reply)

        if len(reply) <= 140:
            return reply

        else:
            self.logger.debug('Tweet was too long, trying again')
            return self.compose(catalyst, brainname, max_len)

    def learn_parent(self, brainname=None):
        """Feed the parent account's tweets (since our last tweet) to a brain.

        Tweets are filtered (mentions/hashtags/urls/media/symbols, per
        config) and checked (bad words, retweets, replies) before learning.
        No-op when no parent is configured or no last tweet is known.
        """
        parent = self.config.get('parent')

        last_tweet = self.api.last_tweet

        if not parent or not last_tweet:
            return

        # Strips unwanted entities from each tweet's text.
        tweet_filter = checking.construct_tweet_filter(
            no_mentions=self.config.get('filter_mentions'),
            no_hashtags=self.config.get('filter_hashtags'),
            no_urls=self.config.get('filter_urls'),
            no_media=self.config.get('filter_media'),
            no_symbols=self.config.get('filter_symbols')
        )

        # Decides whether a tweet should be learned at all.
        tweet_checker = checking.construct_tweet_checker(
            no_badwords=self.config.get('filter_parent_badwords', True),
            no_retweets=self.config.get('no_retweets'),
            no_replies=self.config.get('no_replies')
            )

        tweets = self.api.user_timeline(parent, since_id=last_tweet)

        # Learn into the named brain, defaulting to the first one loaded.
        brain = brainname or self.default_brain

        for status in tweets:
            if not tweet_checker(status):
                continue

            text = tweet_filter(status)

            text = tbu.helpers.format_text(text)

            self.brains[brain].learn(text)