예제 #1
0
class TwitterTrainer(Trainer):
    """
    Allows the chat bot to be trained using data
    gathered from Twitter.
    """
    def __init__(self, storage, **kwargs):
        super(TwitterTrainer, self).__init__(storage, **kwargs)
        from twitter import Api as TwitterApi

        self.api = TwitterApi(
            consumer_key=kwargs.get('twitter_consumer_key'),
            consumer_secret=kwargs.get('twitter_consumer_secret'),
            access_token_key=kwargs.get('twitter_access_token_key'),
            access_token_secret=kwargs.get('twitter_access_token_secret'))

    def random_word(self, base_word='random'):
        """
        Generate a random word using the Twitter API.

        Search twitter for recent tweets containing the term 'random'.
        Then randomly select one word from those tweets and do another
        search with that word. Return a randomly selected word from the
        new set of results.
        """
        import random
        random_tweets = self.api.GetSearch(term=base_word, count=5)
        random_words = self.get_words_from_tweets(random_tweets)
        random_word = random.choice(list(random_words))
        tweets = self.api.GetSearch(term=random_word, count=5)
        words = self.get_words_from_tweets(tweets)
        word = random.choice(list(words))
        return word

    def get_words_from_tweets(self, tweets):
        """
        Given a list of tweets, return the set of
        words from the tweets.
        """
        words = set()

        for tweet in tweets:
            # TODO: Handle non-ascii characters properly
            cleaned_text = ''.join(
                [i if ord(i) < 128 else ' ' for i in tweet.text])
            tweet_words = cleaned_text.split()

            for word in tweet_words:
                # If the word contains only letters with a length from 4 to 9
                if word.isalpha() and len(word) > 3 and len(word) <= 9:
                    words.add(word)

        return words

    def get_statements(self):
        """
        Returns list of random statements from the API.
        """
        from twitter import TwitterError
        statements = []

        # Generate a random word
        random_word = self.random_word()

        self.logger.info(
            u'Requesting 50 random tweets containing the word {}'.format(
                random_word))
        tweets = self.api.GetSearch(term=random_word, count=50)
        for tweet in tweets:
            statement = Statement(tweet.text)

            if tweet.in_reply_to_status_id:
                try:
                    status = self.api.GetStatus(tweet.in_reply_to_status_id)
                    statement.add_response(Response(status.text))
                    statements.append(statement)
                except TwitterError as error:
                    self.logger.warning(str(error))

        self.logger.info('Adding {} tweets with responses'.format(
            len(statements)))

        return statements

    def train(self):
        for _ in range(0, 10):
            statements = self.get_statements()
            for statement in statements:
                self.storage.update(statement, force=True)
예제 #2
0
class TwitterTrainer(Trainer):
    """
    Allows the chat bot to be trained using data
    gathered from Twitter.

    :param random_seed_word: The seed word to be used to get random tweets from the Twitter API.
                             This parameter is optional. By default it is the word 'random'.
    :param twitter_lang: Language for results as ISO 639-1 code.
                         This parameter is optional. Default is None (all languages).
    """
    def __init__(self, storage, **kwargs):
        super(TwitterTrainer, self).__init__(storage, **kwargs)
        from twitter import Api as TwitterApi

        # The word to be used as the first search term when searching for tweets
        self.random_seed_word = kwargs.get('random_seed_word', 'random')
        self.lang = kwargs.get('twitter_lang')

        self.api = TwitterApi(
            consumer_key=kwargs.get('twitter_consumer_key'),
            consumer_secret=kwargs.get('twitter_consumer_secret'),
            access_token_key=kwargs.get('twitter_access_token_key'),
            access_token_secret=kwargs.get('twitter_access_token_secret'))

    def random_word(self, base_word, lang=None):
        """
        Generate a random word using the Twitter API.

        Search twitter for recent tweets containing the term 'random'.
        Then randomly select one word from those tweets and do another
        search with that word. Return a randomly selected word from the
        new set of results.
        """
        import random
        random_tweets = self.api.GetSearch(term=base_word, count=5, lang=lang)
        random_words = self.get_words_from_tweets(random_tweets)
        random_word = random.choice(list(random_words))
        tweets = self.api.GetSearch(term=random_word, count=5, lang=lang)
        words = self.get_words_from_tweets(tweets)
        word = random.choice(list(words))
        return word

    def get_words_from_tweets(self, tweets):
        """
        Given a list of tweets, return the set of
        words from the tweets.
        """
        words = set()

        for tweet in tweets:
            tweet_words = tweet.text.split()

            for word in tweet_words:
                # If the word contains only letters with a length from 4 to 9
                if word.isalpha() and len(word) > 3 and len(word) <= 9:
                    words.add(word)

        return words

    def get_statements(self):
        """
        Returns list of random statements from the API.
        """
        from twitter import TwitterError
        statements = []

        # Generate a random word
        random_word = self.random_word(self.random_seed_word, self.lang)

        self.chatbot.logger.info(
            'Requesting 50 random tweets containing the word {}'.format(
                random_word))
        tweets = self.api.GetSearch(term=random_word, count=50, lang=self.lang)
        for tweet in tweets:
            statement = Statement(tweet.text)

            if tweet.in_reply_to_status_id:
                try:
                    status = self.api.GetStatus(tweet.in_reply_to_status_id)
                    statement.in_response_to = status.text
                    statements.append(statement)
                except TwitterError as error:
                    self.chatbot.logger.warning(str(error))

        self.chatbot.logger.info('Adding {} tweets with responses'.format(
            len(statements)))

        return statements

    def train(self):
        for _ in range(0, 10):
            statements = self.get_statements()
            for statement in statements:
                self.chatbot.storage.create(
                    text=statement.text,
                    in_response_to=statement.in_response_to,
                    conversation=statement.conversation,
                    tags=statement.tags)
예제 #3
0
twitter = TwitterApi(**twitter_params)

con = sqlite3.connect('../archive.db')
cur = con.cursor()

for tweet_id, text in cur.execute('select tweet_id, content from tweets'):
    if os.path.exists('archive_images/{}.jpg'.format(tweet_id)):
        print('skipping {}'.format(tweet_id))
        continue
    if os.path.exists('archive_images/{}.jpg'.format(tweet_id)):
        print('getting details {}'.format(tweet_id))
        continue
    if 'balance of $0.00' in text or 'not found' in text:
        print('skipping(notix) {}'.format(text))
        continue
    tweet = twitter.GetStatus(tweet_id, include_entities=True)
    time.sleep(.5)
    if tweet.media:
        if os.path.exists('archive_images/{}.jpg'.format(tweet_id)):
            print('downloading {}'.format(tweet_id))
            continue
        resp = requests.get(tweet.media[0].media_url, stream=True)
        if resp.status_code == 200:
            with open('archive_images/{}.jpg'.format(tweet_id),
                      'wb') as file_obj:
                resp.raw.decode_content = True
                shutil.copyfileobj(resp.raw, file_obj)
    else:
        print('skipping(nomedia) {}'.format(tweet_id))

con.close()