Example #1
def test_get_hashtags(self):
    self.assertEqual(
        Tweet(self.tweets[0]).get_hashtags(), ["spark", "apache"])
    self.assertEqual(Tweet(self.tweets[1]).get_hashtags(), [])
    self.assertEqual(
        Tweet(self.tweets[2]).get_hashtags(),
        ["spark\"\\/", "apache  storm"])
Example #2
import time
from os import environ

def main():
    TIME = 60 * 60  # 60 min * 60 s --> one hour in seconds

    # Read Twitter API credentials from the environment
    CONSUMER_KEY = environ['CONSUMER_KEY']
    CONSUMER_SECRET = environ['CONSUMER_SECRET']
    ACCESS_TOKEN = environ['ACCESS_KEY']
    ACCESS_TOKEN_SECRET = environ['ACCESS_SECRET']

    # init bot
    bot = Bot(CONSUMER_KEY=CONSUMER_KEY, CONSUMER_SECRET=CONSUMER_SECRET,
              ACCESS_TOKEN=ACCESS_TOKEN, ACCESS_TOKEN_SECRET=ACCESS_TOKEN_SECRET)
    # init tracker (database api call)
    tracker = Tracker()
    # tweet init 
    tweet = Tweet(totalDeaths=(tracker.getTotalDeaths()),
                  totalInfected=(tracker.getTotalInfected()))

    while True:
        # Get latest data from Tracker
        tracker.update()
        # Generate tweet with latest data
        tweet.update(totalDeaths=(tracker.totalDeaths),
                     totalInfected=(tracker.totalInfected))
        
        # Get old tweets
        oldTweets = bot.getOldTweets()
        # Check if tweet is not duplicated
        if not tweet.isDuplicated(oldTweets=oldTweets):
            bot.postTweet(text=(tweet.text))

        time.sleep(TIME)  # wait one hour before the next update
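The loop above only posts when Tweet.isDuplicated returns False. A minimal sketch of such a check, assuming oldTweets is a list of objects exposing a text attribute (both names are assumptions, not the project's API):

def isDuplicated(self, oldTweets):
    # Minimal sketch: treat the tweet as a duplicate if its text
    # exactly matches any recently posted tweet.
    return any(self.text == old.text for old in oldTweets)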
Example #3
def test_clean_text(self):
    self.assertEqual(
        Tweet(self.tweets[0]).get_clean_text(),
        "Spark Summit East this week! #Spark #Apache")
    self.assertEqual(
        Tweet(self.tweets[1]).get_clean_text(),
        "I'm at Terminal de Integrao do Varadouro in Joo Pessoa,   \\PB https://t.co/HOl34REL1a"
    )
Example #4
def test_get_last_post_invalid_creds():
    access_keys = {
        'TWEEPY_CONSUMER_KEY': None,
        'TWEEPY_CONSUMER_SECRET': None,
        'TWEEPY_ACCESS_TOKEN': None,
        'TWEEPY_ACCESS_TOKEN_SECRET': None
    }
    new_tweet = Tweet(access_keys)
    result = new_tweet.get_last_post()
    assert result.response is None
Example #5
def test_get_last_post_invalid_creds2():
    access_keys = {
        'TWEEPY_CONSUMER_KEY': 'None',
        'TWEEPY_CONSUMER_SECRET': 'None',
        'TWEEPY_ACCESS_TOKEN': 'None',
        'TWEEPY_ACCESS_TOKEN_SECRET': 'None'
    }
    new_tweet = Tweet(access_keys)
    result = new_tweet.get_last_post()
    assert result.args[0][0]['code'] == 89
    assert result.args[0][0]['message'] == 'Invalid or expired token.'
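Taken together, Examples #4 and #5 suggest that get_last_post() catches the tweepy error and returns it instead of raising: with empty credentials no HTTP response exists (result.response is None), while with bogus credentials Twitter answers with API error 89, "Invalid or expired token." A minimal sketch of that behaviour, assuming tweepy 3.x and a hypothetical self.api handle:

import tweepy

def get_last_post(self):
    # Minimal sketch, not the project's actual code.
    try:
        return self.api.user_timeline(count=1)[0]  # "api" attribute is an assumption
    except tweepy.TweepError as error:
        # On auth failure the error object carries the parsed payload,
        # e.g. error.args[0][0]['code'] == 89.
        return error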
Example #6
def __init__(self, file, stem=False, **kwargs):
    df = pd.read_csv(file)
    tweets = []
    for index, row in df.iterrows():
        tweets.append(Tweet(row['tweet']))

    # Build the preprocessor; scale the optional threshold by corpus size
    if 'threshold' in kwargs:
        p = Preprocessor(stem=stem, threshold=kwargs['threshold'] / len(tweets))
    else:
        p = Preprocessor(stem=stem)
    p.process(tweets)

    del p

    # Remove blank tweets
    tweets = [t for t in tweets if len(t.text) > 0]

    # Change n-grams to strings
    for i in range(len(tweets)):
        tweets[i].n_grams = [" ".join(x) for x in tweets[i].n_grams]

    # Build the vocabulary; tw.text holds (token, original word) pairs
    self.__word_list = set()
    actual_word = dict()
    for tw in tweets:
        self.__word_list = self.__word_list.union(
            list(map(lambda a: a[0], tw.text)) + tw.n_grams)
        for w in tw.text:
            actual_word[w[0]] = w[1]

    print('tweet-size = ', len(tweets))
    print('word-size = ', len(self.__word_list))

    self.__word_list = list(self.__word_list)
    print('Sample words: ', self.__word_list[:5])

    # Count how often each term occurs in each tweet
    term_count = defaultdict(lambda: defaultdict(int))
    for i, tw in enumerate(tweets):
        for w in list(map(lambda a: a[0], tw.text)) + tw.n_grams:
            term_count[i][w] += 1

    self.__tweet_count = len(tweets)

    # Map vocabulary tokens back to their original word forms
    self.actual_words = []
    for w in self.__word_list:
        if w in actual_word:
            self.actual_words.append(actual_word[w])

    # Build the term-frequency matrix: one row per tweet, one column per term
    tf = []
    for i in range(self.__tweet_count):
        tmp = []
        for w in self.__word_list:
            tmp.append(term_count[i][w])
        del term_count[i]
        tf.append(tmp)

    self.__term_freq = tf

    self.__results = list(df['sentiment'])  # Sentiment: 1 if positive, -1 if negative
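For orientation, a toy sketch (hypothetical data, not this class's API) of the term-frequency layout the constructor ends up storing:

# Hypothetical illustration of the __term_freq layout built above:
# one row per tweet, one column per vocabulary entry, cell = raw count.
word_list = ["spark", "apache", "apache storm"]   # tokens plus joined n-grams
term_freq = [
    [1, 1, 0],  # tweet 0: "spark" once, "apache" once
    [0, 2, 1],  # tweet 1: "apache" twice, "apache storm" once
]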
Example #7
def load_tweets_from_file(tweets_file_name):
    # Read the file eagerly so it is closed before parsing begins
    with codecs.open(tweets_file_name, mode='r', encoding='utf-8') as file:
        content = [x.strip() for x in file.readlines()]
    tweets = []
    for line_number, line in enumerate(content, start=1):
        try:
            tweets.append(Tweet(line))
        except InvalidTweetLine:
            logger.error('Invalid tweet at line %d', line_number)
    return tweets
Example #8
def send(tweet_json):
    try:
        tweet = Tweet(tweet_json)
        if tweet.media is not None:
            if len(tweet.media) == 1:
                # Single attachment: send it with the tweet text as its caption
                media_obj = tweet.media[0]
                media_obj.update({"caption": tweet.text})
                return TelegramMediaProxy.send_media(media_obj.get("type"),
                                                     media_obj)
            elif len(tweet.media) > 1:
                # Album: Telegram shows the first item's caption for the group
                tweet.media[0].update({"caption": tweet.text})
                return TelegramMediaProxy.send_media("group_media", tweet.media)
        else:
            # No attachments: send the text as a plain message
            return TelegramMediaProxy.send_media("message", {"text": tweet.text})
    except Exception as e:
        logger.error("can't send tweet to telegram, error:\n\t{}".format(e))
Example #9
def getSentiment(self, tweet):
    tw = Tweet(tweet)
    stop_words = set(stopwords.words('english'))
    tokens = [w for w in tw.text + tw.emoji if w not in stop_words]
    if self.stemmer:
        tokens = [self.stemmer.stem(w) for w in tokens]

    if self.ngrams:
        # Join each n-gram tuple into a single token (cf. Example #6)
        tokens += [" ".join(y) for y in tw.n_grams]

    if len(tokens) == 0:
        return 0
    # Average the per-token scores; tokens missing from the lexicon count as 0
    sentiment = sum(self.word_sentiment.get(t, 0) for t in tokens)
    return sentiment / len(tokens)
Example #10
def test_is_tweet_ascii(self):
    self.assertTrue(Tweet(self.tweets[0]).is_tweet_ascii())
    self.assertFalse(Tweet(self.tweets[1]).is_tweet_ascii())
Example #11
def test_default_post():
    access_keys = ''
    new_tweet = Tweet(access_keys)
    result = new_tweet.get_default_post()
    assert result == "Thank you for following MaikuOnline! 毎日頑張りましょう!"
Example #12
from src.tweet import Tweet
from src.preprocessing import Preprocessor
import pandas as pd

df = pd.read_csv('dataset/sample_sts.txt')
tweets = []
for index, row in df.iterrows():
    tweets.append(Tweet(row['tweet']))

p = Preprocessor()

p.process(tweets)

with open('dataset/processed_sample_sts.txt', 'w') as fp:
    for tweet in tweets:
        fp.write(str(tweet.text) + '\n')
Example #13
    def setUp(self):
        self.format = "%a %b %d %H:%M:%S +0000 %Y"

        self.tweets_json = [
            """
              {"created_at":"Thu Oct 29 17:51:01 +0000 2015",
               "text":"Spark Summit East this week! #Spark #Apache",
               "entities":{"hashtags":[{"text":"Spark","indices":[29,35]},{"text":"Apache","indices":[36,43]}],"urls":[],"user_mentions":[],"symbols":[]}}
           """,
            """
              {"created_at":"Thu Oct 29 17:51:30 +0000 2015",
               "text":"Just saw a great post on Insight Data Engineering #Apache #Hadoop #Storm",
               "entities":{"hashtags":[{"text":"Storm","indices":[29,35]},{"text":"Apache","indices":[36,43]},{"text":"Hadoop","indices":[37,48]}],"urls":[],"user_mentions":[],"symbols":[]}}
           """,
            """
              {"created_at":"Thu Oct 29 17:51:55 +0000 2015",
               "text":"Doing great work #Apache",
               "entities":{"hashtags":[{"text":"Apache","indices":[29,35]}],"urls":[],"user_mentions":[],"symbols":[]}}
           """,
            """
              {"created_at":"Thu Oct 29 17:51:56 +0000 2015",
               "text":"Excellent post on #Flink and #Spark",
               "entities":{"hashtags":[{"text":"Flink","indices":[29,35]},{"text":"Spark","indices":[29,35]}],"urls":[],"user_mentions":[],"symbols":[]}}
           """,
            """
              {"created_at":"Thu Oct 29 17:51:59 +0000 2015",
               "text":"New and improved #HBase connector for #Spark",
               "entities":{"hashtags":[{"text":"HBase","indices":[29,35]},{"text":"Spark","indices":[29,35]}],"urls":[],"user_mentions":[],"symbols":[]}}
           """,
            """
              {"created_at":"Thu Oct 29 17:52:05 +0000 2015",
               "text":"New 2.7.1 version update for #Hadoop #Apache",
               "entities":{"hashtags":[{"text":"Hadoop","indices":[29,35]},{"text":"Apache","indices":[29,35]}],"urls":[],"user_mentions":[],"symbols":[]}}
           """,
            """
              {"created_at":"Thu Oct 29 17:52:31 +0000 2015",
               "text":"Try to evict the storm vertex",
               "entities":{"hashtags":[],"urls":[],"user_mentions":[],"symbols":[]}}
           """,
            """
              {"created_at":"Thu Oct 29 17:52:56 +0000 2015",
               "text":"New 2.7.1 version update for #Unrelated",
               "entities":{"hashtags":[{"text":"Unrelated","indices":[29,35]}],"urls":[],"user_mentions":[],"symbols":[]}}
           """,
            """
              {"created_at":"Thu Oct 29 17:52:57 +0000 2015",
               "text":"Excellent post on #Flink and #Spark",
               "entities":{"hashtags":[{"text":"Flink","indices":[29,35]},{"text":"Spark","indices":[29,35]}],"urls":[],"user_mentions":[],"symbols":[]}}
           """,
            """
              {"created_at":"Thu Oct 29 17:54:57 +0000 2015",
               "text":"Another Excellent post on #Flink",
               "entities":{"hashtags":[{"text":"Flink","indices":[29,35]}],"urls":[],"user_mentions":[],"symbols":[]}}
           """,
        ]

        self.tweets = [
            Tweet(json.loads(cur_tweet_json))
            for cur_tweet_json in self.tweets_json
        ]
        self.timestamps = [
            datetime.strptime(tweet.get_timestamp(), self.format)
            for tweet in self.tweets
        ]
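The strptime call in setUp implies that get_timestamp() returns the raw created_at string from each fixture. A minimal sketch, assuming the parsed JSON dict is stored on a hypothetical tweet attribute:

def get_timestamp(self):
    # Minimal sketch, not the project's actual code: expose the raw
    # "created_at" field, e.g. "Thu Oct 29 17:51:01 +0000 2015".
    return self.tweet["created_at"]  # "tweet" attribute name is an assumption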