Пример #1
0
def extract_and_store_tweets(csvfile, nlp, minetweet):
    print
    print "Start processing %s ..." % csvfile
    print "*" * 20

    start = time()  # measure time

    # LOGGING
    tweets_count = 0
    mentions_count = 0
    urls_count = 0
    hashtags_count = 0
    tags_count = 0
    unvalid_tweets = 0

    i = 1  # iteroator to remember row number on csv
    with open(csvfile, 'r') as f:

        # print 'Processing data...'
        next(f)  # skip csv header
        data = csv.reader(f)

        # one row at a time
        for row in data:

            # create Tweet object
            t = Tweet()

            # Populate Tweet
            t.mid = row[0]
            t.retweetFromPostId = row[1]
            t.userId = row[2]
            t.retweetFromUserId = row[3]
            t.source = row[4]
            t.hasImage = row[5]
            t.txt = row[6]
            t.geo = row[7]
            t.created_at = row[8]
            t.deleted_last_seen = row[9]
            t.permission_denied = row[10]

            # Extract tweet entities
            mentions, urls, hashtags, clean = minetweet.extract_tweet_entities(
                t.txt)

            # add to Tweet
            t.mentions = mentions
            t.urls = urls
            t.hashtags = hashtags
            clean = clean  # text-only version of the tweet for NLP

            # Extract keywords
            dico = nlp.extract_dictionary(clean)

            # remove stopwords and store clean dico
            t.dico = nlp.remove_stopwords(dico)

            # extract entities
            # TODO : ignore stopwords
            # t.entities=nlp.extract_named_entities_from_dico(t.dico)

            # Some count for stats
            mentions_count += len(mentions)
            urls_count += len(urls)
            hashtags_count += len(hashtags)
            tags_count += len(t.entities)

            t.row = i

            valid_utf8 = True
            try:
                t.txt.decode('utf-8')
            except UnicodeDecodeError:
                unvalid_tweets += 1
                valid_utf8 = False
                print ' bad encoding : tweet ', t.mid
                # pprint(t)

            if valid_utf8 is True:
                try:
                    t.save()
                    tweets_count += 1
                except bson.errors.InvalidStringData:
                    print ' bad encoding : tweet ', t.mid
                    # pprint(t)

    # LOG
    print
    print "-" * 10
    print " mentions_count            : %d " % mentions_count
    print " urls_count                : %d " % urls_count
    print " hashtags_count            : %d " % hashtags_count
    print " unvalid tweets            : %d " % unvalid_tweets
    print " TOTAL tweet entities      : %d " % (mentions_count + urls_count +
                                                hashtags_count)
    print " TOTAL named entities (NER): %d " % tags_count
    print
    print "-" * 10
    print "TOTAL tweets processed    : %d" % tweets_count
    print " done in %.3fs" % (time() - start)
    print
Пример #2
0
def extract_and_store_tweets(csvfile,nlp,minetweet):
    print
    print "Start processing %s ..."%csvfile
    print "*"*20

    start=time() # measure time

    # LOGGING
    tweets_count=0
    mentions_count=0
    urls_count=0
    hashtags_count=0
    tags_count=0
    unvalid_tweets=0


    i=1 # iteroator to remember row number on csv
    with open(csvfile, 'r') as f:

        # print 'Processing data...'
        next(f) # skip csv header
        data = csv.reader(f)

        # one row at a time
        for row in data: 

            # create Tweet object
            t=Tweet()

            # Populate Tweet
            t.mid=row[0]
            t.retweetFromPostId=row[1]
            t.userId=row[2]
            t.retweetFromUserId=row[3]
            t.source=row[4]
            t.hasImage=row[5]
            t.txt=row[6]
            t.geo=row[7]
            t.created_at=row[8]
            t.deleted_last_seen=row[9]
            t.permission_denied=row[10]

            # Extract tweet entities
            mentions,urls,hashtags,clean=minetweet.extract_tweet_entities(t.txt)
            
            # add to Tweet
            t.mentions=mentions
            t.urls=urls
            t.hashtags=hashtags
            clean=clean # text-only version of the tweet for NLP

            # Extract keywords
            dico=nlp.extract_dictionary(clean)

            # remove stopwords and store clean dico
            t.dico=nlp.remove_stopwords(dico)

            # extract entities
            # TODO : ignore stopwords
            # t.entities=nlp.extract_named_entities_from_dico(t.dico)
            
            # Some count for stats
            mentions_count+=len(mentions)
            urls_count+=len(urls)
            hashtags_count+=len(hashtags)
            tags_count+=len(t.entities)

            t.row=i

            valid_utf8 = True
            try:
                t.txt.decode('utf-8')
            except UnicodeDecodeError:
                unvalid_tweets+=1
                valid_utf8 = False
                print ' bad encoding : tweet ',t.mid
                # pprint(t)
            
            if valid_utf8 is True:
                try:
                    t.save()
                    tweets_count+=1
                except bson.errors.InvalidStringData:
                    print ' bad encoding : tweet ',t.mid
                    # pprint(t)

    # LOG
    print
    print "-"*10
    print " mentions_count            : %d "%mentions_count
    print " urls_count                : %d "%urls_count
    print " hashtags_count            : %d "%hashtags_count
    print " unvalid tweets            : %d "%unvalid_tweets
    print " TOTAL tweet entities      : %d "%(mentions_count+urls_count+hashtags_count)
    print " TOTAL named entities (NER): %d "%tags_count
    print
    print "-"*10
    print "TOTAL tweets processed    : %d"%tweets_count
    print " done in %.3fs"%(time()-start)
    print
Пример #3
0
def _find_account(user_id):
    """Return the Account with this id as found through the session query."""
    return session.query(Account).filter(Account.id == user_id).scalar()


def _author_for(user):
    """Return the Account for a tweet's ``user`` object, creating or completing it."""
    user_id = user['id']

    # first time this user is seen: create a full account
    if user_id not in accounts_map:
        account = Account(
            id=user_id,
            screen_name=user['screen_name'],
            name=user['name'],
            description=user['description'],
            followers_count=user['followers_count'],
            friends_count=user['friends_count'],
            statuses_count=user['statuses_count'])
        accounts_map[user_id] = SavedAccountType.FULL
        return account

    account = _find_account(user_id)

    # user was previously saved from a user_mention and needs to be updated
    # with the attributes which are not present in user_mentions
    if accounts_map[user_id] == SavedAccountType.MENTION:
        account.update(user)
        accounts_map[user_id] = SavedAccountType.FULL
    return account


def _mentioned_accounts(user_mentions, author):
    """Return the distinct Accounts mentioned in a tweet, leaving out its author."""
    # a tweet may have no author (obj['user'] was None)
    author_id = author.id if author is not None else None

    mentions = []
    for mentioned_user in user_mentions:
        user_id = mentioned_user['id']

        # skip users mentioned several times in the same status and users
        # mentioning themselves: neither is stored yet at this point (they
        # are saved together with the tweet itself)
        if user_id == author_id or any(m.id == user_id for m in mentions):
            continue

        if user_id not in accounts_map:
            # user-mention entities carry `name` at the top level, next to
            # `id` and `screen_name`
            account = Account(
                id=user_id,
                screen_name=mentioned_user['screen_name'],
                name=mentioned_user['name'],
            )
            accounts_map[user_id] = SavedAccountType.MENTION
        else:
            account = _find_account(user_id)

        mentions.append(account)
    return mentions


def _country_for(place):
    """Return the Country for a tweet's ``place`` object, creating it on first sight."""
    country_code = place['country_code']
    if country_code not in countries_map:
        country = Country(code=country_code, name=place['country'])
        countries_map[country_code] = True
        return country

    return session.query(Country).filter(
        Country.code == country_code).scalar()


def _hashtags_for(hashtag_objs):
    """Return the distinct Hashtags of a tweet, creating the ones not seen before."""
    hashtags = []
    for hashtag_obj in hashtag_objs:
        value = hashtag_obj['text']

        # same hashtag repeated in the current tweet: it is already in the
        # hashtag hashmap but not stored yet
        if any(h.value == value for h in hashtags):
            continue

        if value not in hashtags_map:
            hashtags_map[value] = True
            hashtag = Hashtag(value)
        else:
            hashtag = session.query(Hashtag).filter(
                Hashtag.value == value).scalar()

        hashtags.append(hashtag)
    return hashtags


def save_tweet(obj):
    """
    Saves each tweet to the database

    Builds a ``Tweet`` from the tweet dictionary together with its author,
    mentioned accounts, country and hashtags, then adds it to the session.
    Retweets are handled recursively: the retweeted status is saved first
    and set as the parent of this tweet.

    :param obj: tweet dictionary; the keys ``id_str``, ``full_text``,
        ``coordinates``, ``retweet_count``, ``favorite_count``,
        ``created_at``, ``user``, ``entities`` and ``place`` are read, and
        ``retweeted_status`` when present.
    :return: the new ``Tweet``, or ``None`` when a tweet with this id was
        already recorded in ``tweets_map``.
    """
    tweet_id = obj['id_str']
    if tweet_id in tweets_map:
        return None

    # dive into recursion until we hit the original tweet
    parent_tweet = None
    retweeted = obj.get('retweeted_status')
    if retweeted is not None:
        parent_tweet = save_tweet(retweeted)
        if parent_tweet is None:
            # the original was already recorded by an earlier call; look it
            # up so this retweet does not lose its parent link
            parent_tweet = session.query(Tweet).filter(
                Tweet.id == retweeted['id_str']).scalar()

    location = None
    coordinates = obj['coordinates']
    if coordinates and coordinates['coordinates']:
        point = coordinates['coordinates']
        location = WKTElement(f"POINT({point[0]} {point[1]})", srid=4326)

    tweet = Tweet(id=tweet_id,
                  content=obj['full_text'],
                  location=location,
                  retweet_count=obj['retweet_count'],
                  favorite_count=obj['favorite_count'],
                  happened_at=obj['created_at'])

    tweets_map[tweet_id] = True

    # add user as an author of the tweet, if present
    author = None
    if obj['user'] is not None:
        author = _author_for(obj['user'])
        tweet.author = author

    entities = obj['entities'] or {}

    # associate mentions array with tweet
    user_mentions = entities.get('user_mentions')
    if user_mentions:
        tweet.mentions = _mentioned_accounts(user_mentions, author)

    # add place as a country of the tweet when all its fields are present
    place = obj['place']
    if place is not None and place['country_code'] and place['country']:
        tweet.country = _country_for(place)

    # associate hashtags array with tweet
    hashtag_objs = entities.get('hashtags')
    if hashtag_objs:
        tweet.hashtags = _hashtags_for(hashtag_objs)

    # set the parent tweet from the recursion
    if parent_tweet:
        tweet.parent = parent_tweet

    # save tweet object into the db
    session.add(tweet)

    return tweet