Example #1
    def download_tweets(self):
        # Authenticate with the Twitter API
        consumer_key = "***"
        consumer_secret = "***"
        access_token = "***"
        access_token_secret = "***"
        auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
        auth.set_access_token(access_token, access_token_secret)
        api = tweepy.API(auth)

        # Set the search term and the number of tweets to return
        search_term = "#ExtinctionRebellion"
        quantity_tweets = 100

        # Define the search parameters
        self.tweets = tweepy.Cursor(
            api.search, q=search_term, lang="en", tweet_mode="extended"
        ).items(quantity_tweets)

        # Store the retrieved tweets and their attributes in the database
        sql = SQLData()
        for api_tweet in self.tweets:
            # skip tweets whose text starts with "RT" (retweets)
            if not api_tweet.full_text.startswith("RT"):
                tweet = Tweet(
                    api_tweet.id,
                    api_tweet.created_at,
                    str(datetime.datetime.now()),
                    str(datetime.datetime.now()),
                    api_tweet.user.screen_name,
                    api_tweet.full_text,
                )
                sql.insert_tweet(tweet)
            else:
                print(f"Tweet mit der ID={api_tweet.id} ist ein Retweet")
Example #2
def display_create_tweet():
    # If the HTTP method is GET
    if request.method == 'GET':
        # render the tweet creation form
        return render_template('create_tweet.html')
    else:
        # Otherwise the HTTP method is POST,
        # so we create a new tweet
        # get the author name from the request body
        authorName = request.form['author']
        # get the content from the request body
        content = request.form['content']
        # Default the image variable to None.
        image = None
        # get the image from the request body
        f = request.files['image']
        # If an image was actually uploaded
        if f.filename != '':
            # Build the destination path for the image (where it will be saved)
            filepath = os.path.join(app.root_path, 'static', 'uploads',
                                    f.filename)
            # Save the image to that path
            f.save(filepath)
            # build the image URL for display (based on its filename)
            image = url_for('static', filename='uploads/' + f.filename)
        # Create a tweet using our constructor (defined in tweet.py)
        tweet = Tweet(authorName, content, image)
        # Insert the tweet at the front of our list
        tweets.insert(0, tweet)
        # Redirect to the tweet list
        return redirect(url_for('display_tweets'))
Example #3
 def collect_tweets(self):
     """
     # of hashtags per tweet
     # of tweets with hashtags
     # mentions per tweet
     # of tweets with mentions
     URLs per tweet
     Tweets with URLs
     # special characters per tweet
     # Tweets with special characters
     Retweets by user
     Inter-tweet content similarity: Bag of Words w/ Jaccard and cosine similarity
     Duplicate tweets
     Duplicate URLs ratio (1-unique URLs/total URLs)
     Duplicate Domains Ratio (1-unique domains/total domains)
     Duplicate Mentions Ratio (1-unique mentions/ total mentions)
     Duplicate hashtags ratio (1-unique hashtags/total hashtags)
     """
     for twt in self.f:
         tweet = Tweet()
         tweet.get_features(twt)
         self.tweets.append(tweet)
         self.tweet_timings.append(tweet.date)
         self.tweet_text.append(tweet.html_text)
         self.app_sources.append(tweet.source)
         self.retweet_sources.append(tweet.rts)
         for url in tweet.urls:
             self.urls.append(url['expanded_url'])
             self.domains.append(url['display_url'].split('/')[0])
         for mention in tweet.mentions:
             self.mentions.append(mention['id'])
         for hashtag in tweet.hashtags:
             self.hashtags.append(hashtag['text'])
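
The docstring above defines each duplicate ratio as 1 - unique/total and names a bag-of-words Jaccard similarity. As a minimal sketch (the helpers duplicate_ratio and jaccard_similarity are illustrative names, not part of the original class), those quantities could be computed from the lists filled in by collect_tweets like this:

def duplicate_ratio(items):
    # 1 - unique/total, as described in the docstring; 0.0 for an empty list
    return 1 - len(set(items)) / len(items) if items else 0.0

def jaccard_similarity(text_a, text_b):
    # bag-of-words Jaccard similarity between two tweet texts
    a, b = set(text_a.lower().split()), set(text_b.lower().split())
    return len(a & b) / len(a | b) if (a | b) else 0.0

# e.g. duplicate_ratio(self.urls), duplicate_ratio(self.hashtags),
#      jaccard_similarity(self.tweet_text[0], self.tweet_text[1])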
Example #4
    def tweets(self):
        tweets = []

        for tweet_obj in self.db:
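            # wrap each raw tweet document from self.db in a Tweet instance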
            tweets.append(Tweet(tweet_obj))

        return tweets
Example #5
    def get_tweets(self, count, lang, **kwargs):
        """ Get's tweets from twitter and returns them in a list. By default returns a single tweet
          in english. To change language, pass the language code in the parameters.

        Args:
            count: Number of tweets to be returned.
            lang: Language flag; by default it is set to English (en). Pass the
                    language code to change the language.
            **kwargs: Arbitrary keyword arguments.

        Returns:
            List of tweets, in json format.
        """
        tweets = []
        try:
            if "filters" in kwargs:
                stream = self._get_filter(kwargs["filters"], kwargs)
            else:
                stream = self._get_iterator()
            for tweet in stream:
                tweets.append(Tweet(json.dumps(tweet)))
                count -= 1
                if count <= 0:
                    break
            return tweets
        except Exception as e:
            raise e
Example #6
def download_tweets(query_phrase, tweet_count):
    import api_keys
    import tweepy
    from datetime import datetime
    from tweet import Tweet
    # authorization
    auth = tweepy.AppAuthHandler(api_keys.API_KEY, api_keys.API_SECRET)
    api = tweepy.API(
        auth,
        wait_on_rate_limit=True,  # wait until the limit is replenished
        wait_on_rate_limit_notify=True)  # reply with a message if the limit is reached

    # check if not authorized
    if (not api):
        print("Can't Authenticate")
        return

    tweets = []
    for status in tweepy.Cursor(api.search,
                                q=query_phrase,
                                tweet_mode='extended',
                                lang='en').items(tweet_count):
        try:
            full_text = status._json['retweeted_status']['full_text']
        except KeyError:
            # not a retweet; use the tweet's own text
            full_text = status._json['full_text']

        ts = datetime.strptime(status._json['created_at'],
                               '%a %b %d %H:%M:%S +0000 %Y')
        tweets.append(Tweet(full_text, ts))
    return tweets
Example #7
    def _get_tweets_by_query(self, query: str, searchKey: str) -> list:
        """
            Retrieves tweets from the database with the given query.

                Args:
                    `query` (str): SQLite query that will be executed
                                   (e.g. `"SELECT * FROM Tweet"`)
                    `searchKey` (str): search key

                Returns:
                    A list that contains Tweet instances created from executed
                    `query`
        """
        self.c.execute(query)
        tweets = []
        for row in self.c.fetchall():
            tweets.append(Tweet(tweet_id=row[0],
                                writer=row[1],
                                post_date=time.localtime(row[2]),
                                body=row[3],
                                searchKey=searchKey,
                                comment_num=row[4],
                                retweet_num=row[5],
                                like_num=row[6]))

        return tweets
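
For reference, a call using the query from the docstring might look like the following (db here stands for whichever object exposes this method; it is not named in the snippet above):

all_tweets = db._get_tweets_by_query("SELECT * FROM Tweet", searchKey="")
print(len(all_tweets), "tweets loaded")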
Example #8
    def post(self):
        self.response.headers["Content-Type"] = "text/html"

        share_text = self.request.get("share_text")
        share_image = self.request.get("share_image")

        if share_text is not None and share_text != "":

            share_type = self.request.get("share_type")

            if share_type == "Update":

                edit_tweet_id = self.request.get("edit_tweet_id")
                edit_tweet = Services().get_tweet(tweet_id=edit_tweet_id)
                edit_tweet.share_text = share_text

                edit_tweet.put()

            else:

                myuser = Services().get_login_user()
                tweet = Tweet(share_text=share_text,
                              user_id=myuser.key.id(),
                              user_name=myuser.user_name,
                              time=datetime.datetime.now())
                tweet.put()

                myuser.tweet_ids.append(tweet.key.id())
                myuser.put()

        self.redirect("/")
Example #9
def tweetScraper(driver):
    try:
        tweeterDivs = driver.page_source
        obj = BeautifulSoup(tweeterDivs,
                            "html.parser")  #extracting the page source
        allTweets = obj.find_all("div", class_="tweet")
        tweetList = []

        for tweet in allTweets:

            tweetText = tweet.find(
                "p", class_="tweet-text").getText()  #extracting tweet text
            screenName = tweet.find(
                "strong", class_="fullname").getText()  #extracting Screen name
            username = tweet.find(
                class_="username").getText()  #extracting username
            tweetId = tweet['data-tweet-id']  #extracting tweet id
            userId = tweet['data-user-id']  #extracting user id
            timestamp = tweet.find(
                class_="tweet-timestamp")['title']  #extracting tweet time

            timestamp = datetime.datetime.strptime(
                timestamp.split('-')[1], " %d %b %Y")
            timestamp = str(timestamp.date())
            tweetList.append(
                Tweet(tweetId, tweetText, username, userId, screenName,
                      timestamp))

    except Exception as e:
        print("Something went wrong!", e)
        driver.quit()
        return []

    return tweetList
Example #10
def getTestdata(search="*", count=1, emojis=None):
    auth = tweepy.OAuthHandler("***", "***")
    auth.set_access_token("***", "***")

    api = tweepy.API(auth)

    search = "sad"
    itemlimit = count

    for status in tweepy.Cursor(api.search,
                                lang="en",
                                q=search,
                                tweet_mode="extended",
                                since_id=1).items(itemlimit):
        # process status here
        # print status.entities["hashtags"]

        if "retweeted_status" in dir(status):
            tweet = status.retweeted_status.full_text
        else:
            tweet = status.full_text

        t1 = Tweet(tweet)
        t1.processTweet(emojis=emojis)
        t1.printer()
Example #11
def getTraindata(bpfile="Datasets/Train/Sentiment Analysis Dataset.csv",
                 mpfile="Datasets/Train/smileannotationsfinal.csv",
                 mode="mp",
                 emojis=None):
    mpdata = []
    bpdata = []

    if mode == "mp":
        file = mpfile
    else:
        file = bpfile

    fp = open(file, "r")

    for line in fp:
        tokens = line.split(',')

        labels = tokens[2].split('|')
        if labels[0] in Emotions:
            label = Emotions[labels[0]]
            t1 = Tweet(tokens[1], label)
            t1.processTweet(emojis=emojis)
            mpdata.append(t1)

    fp.close()

    return mpdata
Example #12
    def fetch_tweet(self, tweet_id: int):
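        # read the hash fields stored under the key "tweet:<id>"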
        tweet_set = self.r.hgetall('tweet:' + str(tweet_id))
        user_id = tweet_set.get('user_id')
        timestamp = tweet_set.get('timestamp')
        tweet_txt = tweet_set.get('tweet_txt')

        return Tweet(user_id, timestamp, tweet_txt)
Example #13
 def createATweet(self,tweet):
     try:
         newT = Tweet(tweet)
         return newT
     except Exception as e:
         print (e)
         return None
Example #14
def get_tweets(search_string, result_type, time_range):

    api = twitter.Api(
        consumer_key='***',
        consumer_secret='***',
        access_token_key='***',
        access_token_secret='***')

    tweets = {}
    search = "q=" + str(search_string) + "%20&result_type=" + str(
        result_type) + "&since=" + str(time_range) + "&count=100"
    print(search)
    results = api.GetSearch(raw_query=search)
    i = 0
    for result in results:
        json_result = json.loads(str(result))
        t = Tweet(json_result['user']['profile_image_url'],
                  json_result['user']['name'], json_result['text'],
                  json_result['created_at'], json_result['hashtags'])
        if 'retweet_count' in json_result:
            t.retweet_count = json_result['retweet_count']
        if 'favorite_count' in json_result:
            t.favorite_count = json_result['favorite_count']
        tweets[i] = t
        i += 1

    return tweets
Example #15
 def _cast_row_to_tweet(self, row):
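     # columns 0-5 feed the Tweet constructor; columns 6-9 hold precomputed label/clean-text/polarity fields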
     tweet = Tweet(row[0], row[1], row[2], row[3], row[4], row[5])
     tweet.label = row[6]
     tweet.clean_text = row[7]
     tweet.tb_polarity = row[8]
     tweet.nb_polarity = row[9]
     return tweet
Example #16
    def average_degree(self):
        try:
            stats_fh = open(self.output_file, 'w')
        except IOError:
            print('Cannot open', self.output_file)
            return

        try:
            tweet_fh = open(self.input_file)
        except IOError:
            print('Cannot open', self.input_file)
        else:
            tweets = tweet_fh.readlines()

            graph = tweet_graph()
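            # vertices are hashtags; edges record co-occurrence, and stale tweets are evicted by timestamp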

            for tweet_line in tweets:
                tweet_dec = json.loads(tweet_line)

                # Ignore tweets with "limit"
                if "limit" in tweet_dec.keys():
                    continue

                cur_tweet = Tweet(tweet_dec)
                hashtags = cur_tweet.get_hashtags()

                cur_ts = datetime.strptime(cur_tweet.get_timestamp(),
                                           self.format)

                # Ignore tweets with one or zero hashtags
                # It will only be used to evict old tweets from the graph
                if (len(hashtags) >= 2):
                    for hashtag in hashtags:
                        graph.add_vertex(hashtag, cur_ts)

                    edges = self.pairwise(hashtags)

                    for edge in edges:
                        graph.add_edge(graph.get_vertex(edge[0]),
                                       graph.get_vertex(edge[1]), cur_ts)

                else:
                    graph.evict(cur_ts)

                av_degree = graph.average_degree()
                stats_fh.write(("%0.2f" % av_degree) + "\n")

                if self.tracker_en:
                    self.ad_tracker.append(av_degree)
                    (peak_degree, peak_node) = graph.peak_degree()
                    self.pd_tracker.append(peak_degree)
                    self.pn_tracker.append(peak_node)

                if self.self_checking:
                    if not graph.check_graph(cur_ts):
                        print "Self Checking Failed at " + str(cur_ts)

            tweet_fh.close()

        if not stats_fh.closed:
            stats_fh.close()
Example #17
 def test_filter1(self):
     # This test should pass, and is meant to establish a baseline.
     tweet_1 = Tweet("@trapkingwillie", 1, 12, "PM",
                     "Eastern Time (US & Canada)", 10, 15,
                     "This should pass", "n/a")
     self.assertEqual(tweet_1.msg, "This should pass")
Example #18
def make_post(tweet_data, curr_user):
    # t_data_split = tweet_data.split(':')
    '''tweet_data contains:
        tweet_data[0] == "POST": used for menu checking commands
        tweet_data[1] == Author of tweet
        tweet_data[2] == Tweet message: the tweet content string
        tweet_data[3] == hashtags: the hash tags separated by space
    '''
    tweet_author = curr_user
    tweet_content = tweet_data[2]
    htags = tweet_data[3].split()
    #construct the tweet to add
    tweet_to_add = Tweet(tweet_author, tweet_content, htags)
    #add to user's tweet list
    curr_user.tweets.append(tweet_to_add)
    #add to global tweet list
    allTweets.append(tweet_to_add)

    #add to follower's list to display
    for f in tweeterlist:
        if f in curr_user.subs:
            if f.status is False:
                f.offline_tweets.append(tweet_to_add)
            else:
                f.tweets.append(tweet_to_add)

    sdata = "SUCCESS:POST"
    return sdata
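
As a rough usage sketch (alice_user and the module-level allTweets/tweeterlist are assumed to exist; they are not defined in this snippet), the tweet_data layout described in the docstring would be passed like this:

# tweet_data: [command, author, message, space-separated hashtags]
sdata = make_post(["POST", "alice", "Hello world", "#greetings #first"], alice_user)
assert sdata == "SUCCESS:POST"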
Example #19
def lambda_handler(event, context):
    try:
        # Get data from the database
        raw_data = get_words_from_database()

        # Transform the raw data
        data = transform_data_from_database(raw_data)
 
        # Get the keys needed for the Twitter API
        access_keys = get_twitter_account_info()

        # Create a new Tweet instance
        new_tweet = Tweet(access_keys)

        # Get last twenty timeline posts
        timeline_posts = new_tweet.get_last_twenty_posts()

        # Get a random word
        message = get_random_word(data, timeline_posts)
        print(f'message to post = {message}')

        # Post new message to Twitter account
        new_tweet.create_new_post(message)
    except Exception as error:
        message = f'Error occurred during invocation of lambda function. Error = {error}'
        print(message)
Example #20
def fetch(query,
          output_file=sys.stdout,
          debug_file=None,
          lang="en",
          geocode="",
          max_count=500000):
    '''
    Fetches query results into output_files, and prints raw json results into debug_file
    '''
    auth = OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
    auth.set_access_token(ACCESS_TOKEN, ACCESS_TOKEN_SECRET)
    api = API(auth,
              retry_count=10,
              retry_delay=15,
              timeout=60,
              wait_on_rate_limit=True,
              wait_on_rate_limit_notify=True)

    print("QUERY:[", query, "]", "OUTPUT:", output_file.name, file=output_file)
    count = 0
    ok_count = 0
    for result in Cursor(api.search, q=query, lang=lang).items(max_count):
        if debug_file: print(result.text + "\n", file=debug_file)
        tweet = Tweet(result.text)
        t = tweet.preprocess()
        if t and tweet.isTagged():
            print(",".join(tweet.hashtags) + "\t" + t, file=output_file)
            ok_count += 1
        count += 1
        if count % 1000 == 0: print("tweets saved:", ok_count, "/", count)
    print("Loop end:", ok_count, "/", count, "tweets saved")
Example #21
def loadTweets(filename):
    """ Load tweets from a filename.
    Returns a list of Tweet objects.
    """
    tweets = open(filename, 'r').read().splitlines()
    print "Loading %d tweets from %s ..." % (len(tweets), filename)
    tweetObjects = []
    for tweet in tweets:
        try:
            js = json.loads(tweet)
            if (not ('place' in js)) or js['place'] == None:
                continue
            elif (not ('full_name' in js['place'])):
                continue
            elif (not ('geo' in js)) or js['geo'] == None:
                continue
            elif (not ('coordinates' in js['geo'])):
                continue
            coords = js['geo']['coordinates']
            place = js['place']
            tweetObject = Tweet(js['text'], place['full_name'], coords[0],
                                coords[1], place['country'], js['created_at'])
            tweetObjects.append(tweetObject)
        except ValueError:
            pass
    print "Loaded %d tweets" % len(tweetObjects)
    return tweetObjects
Example #22
def main():
    t = Tweet()
    t.prompt()

    try:
        # validate the user name
        t.validateUser(t.userName)
        t._clear()
        # obtain user tweets
        t.getTweets(t.userName, t.rt)

        # check if file exsist. create if doesn't and clean if exsists
        t.exist(t.jsonFile)

        # dump to json file
        t.dumpJson(t.jsonFile)
        print("Complete!!")

        sleep(0.5)
        t._clear()
        # welcome user
        print(user_prompts.welcome.format(t.userName))

        # ask user what to view
        t.view()

    except tweepy.TweepError as e:
        print(e.args[0])
Example #23
    def tweets(self, limit=10):
        tweets = []

        for item in self.collection.find().sort('received_at', -1).limit(limit):  # -1 = descending order
            tweet_obj = item
            tweets.append(Tweet(tweet_obj))
        return tweets
Example #24
def getTestdata(search, count=1, emojis=None):
    auth = tweepy.OAuthHandler("***", "***")
    auth.set_access_token("***", "***")

    api = tweepy.API(auth)
    non_bmp_map = dict.fromkeys(range(0x10000, sys.maxunicode + 1), 0xfffd)

    #search = "happy"
    itemlimit = count

    for status in tweepy.Cursor(api.search,
                                lang="en",
                                q=search,
                                tweet_mode="extended",
                                since_id=1).items(itemlimit):
        # process status here
        # print status.entities["hashtags"]

        if "retweeted_status" in dir(status):
            tweet = status.retweeted_status.full_text
        else:
            tweet = status.full_text

        t1 = Tweet(tweet.translate(non_bmp_map))
        t1.processTweet(emojis=emojis)
        t1.printer()

        return t1
Example #25
def index():
    REQUESTS.inc()
    with EXECPTIONS.count_exceptions():
        LAST.set(time.time())
        INPROGRESS.inc()
        start = time.time()
        if request.method == 'POST':
            try:
                query = request.form['query']
                query_vec = vectorizer.transform(
                    [query])  #(n_docs,x),(n_docs,n_Feats)
                results = cosine_similarity(X, query_vec).reshape(
                    (-1, ))  #Cosine Sim with each doc
                tweets = []
                for i in results.argsort()[-20:][::-1]:
                    tweets.append(
                        Tweet(df.iloc[i, 0], df.iloc[i, 2], df.iloc[i, 3]))
                INPROGRESS.dec()
                lat = time.time()
                LATENCY.observe(lat - start)
                return render_template('Home.html', query=query, tweets=tweets)
            except:
                raise

        try:
            INPROGRESS.dec()
            lat = time.time()
            LATENCY.observe(lat - start)
            LATENCY_HIS.observe(lat - start)
            return render_template('Home.html')
        except:
            raise
Example #26
def getTraindata(bpfile="Datasets/Train/Sentiment Analysis Dataset.csv",
                 mpfile="Datasets/Train/smileannotationsfinal.csv",
                 mode="mp",
                 emojis=None):
    mpdata = []
    bpdata = []

    if mode == "bp":
        file = mpfile
    else:
        file = bpfile

    fp = open(file, encoding="utf-8", errors="ignore")
    non_bmp_map = dict.fromkeys(range(0x10000, sys.maxunicode + 1), 0xfffd)
    i = 1
    for line in fp:
        #line1=line.translate(non_bmp_map)
        tokens = line.split(',')
        labels = tokens[2].split('|')
        if labels[0] in Emotions:
            label = Emotions[labels[0]]
            t1 = Tweet(tokens[1], label)
            t1.processTweet(emojis=emojis)
            mpdata.append(t1)
        i = i + 1
    print("Number of data", i)

    fp.close()
    return mpdata
Example #27
    def __init__(self, filename):
        tweets = []
        positiveproportion = 0
        negativeproportion = 0
        neutralproportion = 0
        with open(filename) as f:
            reader = csv.reader(f)
            for row in reader:
                tweet = Tweet(row)
                tweets.append(tweet)
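                # tally positive and negative tweets; the remainder is treated as neutral below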
                if tweet.sentiment > 0:
                    positiveproportion += 1
                elif tweet.sentiment < 0:
                    negativeproportion += 1

        positiveproportion /= len(tweets)
        negativeproportion /= len(tweets)
        neutralproportion = 1 - positiveproportion - negativeproportion
        self.sentimentfractions = [
            positiveproportion, negativeproportion, neutralproportion
        ]
        self.tweets = tweets
        self.positivesorted = sorted(tweets,
                                     key=lambda i: i.sentiment,
                                     reverse=True)
        self.negativesorted = sorted(tweets, key=lambda i: i.sentiment)
Example #28
    def post(self):
        self.response.headers["Content-Type"] = "text/html"

        text_share = self.request.get("text_share")
        share_image = self.request.get("share_image")

        if text_share is not None and text_share != "":

            share_type = self.request.get("share_type")

            if share_type == "Update":

                edit_tweet_id = self.request.get("edit_tweet_id")
                edit_tweet = Definitions().get_tweet(tweet_id=edit_tweet_id)
                edit_tweet.text_share = text_share

                edit_tweet.put()

            else:

                myuser = Definitions().get_login_user()
                tweet = Tweet(text_share=text_share,
                              user_id=myuser.key.id(),
                              user_name=myuser.user_name,
                              time=datetime.datetime.now())
                tweet.put()

                myuser.tweets_id.append(tweet.key.id())
                myuser.put()

        self.redirect("/")
Example #29
def readTweets(tweetIDs, folder, label):
    """
    Returns a dictionary containing the tweets for the given IDs.
    Reads from a csv file (filename = label + ".csv"); each line is one tweet and fields are tab-separated.
    """
    tag_dict = loadTagDictionary()

    with codecs.open(folder + label + ".csv", 'r',
                     encoding='utf-8') as tweetFile:
        tweets = tweetFile.read().split("\n")[:-1]

    tweet_dict = {}

    for tweet in tweets:
        tweet = tweet.strip().split("\t")
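        # field 3 of the tab-separated row holds the tweet ID (other column positions are not documented here)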
        tweetID = tweet[3].strip()
        if tweetID in tweetIDs:
            tweet_dict[tweetID] = tweet

    return {
        tweetID: Tweet(tag_dict[tweetID],
                       rawTweet=tweet_dict[tweetID],
                       label=label)
        for tweetID in tweet_dict.keys()
    }
Example #30
def load_tweet_corpus(csv_dir):
    corpus = []

    for file in os.listdir(csv_dir):
        print('Processing ' + file + '...')
        with open(os.path.join(csv_dir, file), newline='') as csv_f:
            csv_reader = csv.reader(csv_f,
                                    delimiter='\t',
                                    quotechar='|',
                                    quoting=csv.QUOTE_MINIMAL)
            for row in csv_reader:  #returns a row as a list [id, text, hashtags, timestamp, retweeted?]
                hashtags = row[2].split(',')
                ht_set = set()
                if len(hashtags) == 1 and len(hashtags[0]) == 0:
                    #there are no hashtags in the current tweet
                    pass
                else:
                    for ht in hashtags:
                        if len(ht) == 0 or ht in FORBIDDEN_HASHTAGS:
                            continue
                        ht_set.add(ht)

                corpus.append(
                    Tweet(int(row[0]), row[1].split(), ht_set, int(row[3]),
                          bool(row[4])))

    print('Sorting...')
    corpus.sort()
    print('Done.')

    return corpus