Exemplo n.º 1
0
    def update_source(self, twitter_users=twitter_users, max_pages=max_pages):
        """
        Run the Twitter scraper and store the scraped Tweets as Article rows.

        A marker Article row (article_author='twitter_last_update') keeps the
        time of the last completed update in its article_publishdate column.
        If the marker does not exist yet it is created (and committed) with a
        placeholder date of Jun 1 2005.

        For every handle in twitter_users:
          * if no Article row exists for that handle, every Tweet returned by
            get_tweets (limited to max_pages pages) is stored;
          * otherwise only Tweets whose 'time' is later than the previous
            update time are stored.

        Each Tweet's text is stored as both summary and full text, and the
        word count is the number of whitespace-separated tokens in that text.
        After all handles are processed the marker is set to the current UTC
        time and the session is committed once.

        Args:
            twitter_users: iterable of Twitter handles to scrape.
            max_pages: forwarded to get_tweets as its ``pages`` argument.

        Raises:
            Nothing is caught here. Per the original author's note, an invalid
            Twitter account or an account with too few Tweets currently makes
            the scraper raise; in that case the final commit is never reached.
        """
        # Create/recall the marker row used to identify the last update time.
        marker = Article.query.filter_by(
            article_author='twitter_last_update').first()
        if marker is None:
            print('did not find last_update')
            # Placeholder date used as the "last update" on the first run.
            last_update = datetime.strptime('Jun 1 2005  1:33PM',
                                            '%b %d %Y %I:%M%p')
            marker = Article(source_type='Social Media',
                             source_name='Twitter',
                             article_author='twitter_last_update',
                             article_publishdate=last_update,
                             article_wordcount=0,
                             article_title=None,
                             article_summary=None,
                             article_fulltext=None,
                             article_url=None)
            db.session.add(marker)
            db.session.commit()
        else:
            print('found last_update')
            last_update = marker.article_publishdate

        for user in twitter_users:
            print(user)
            tweets = get_tweets(user, pages=max_pages)
            # Decide once per handle, before any of its Tweets are added,
            # whether this handle already has rows in the database.
            already_followed = Article.query.filter_by(
                article_author=user).first() is not None

            for tweet in tweets:
                # Known handle: keep only Tweets newer than the last update.
                # New handle: keep everything that was scraped.
                # NOTE(review): assumes tweet['time'] is a naive datetime in
                # the same timezone as datetime.utcnow() -- TODO confirm
                # against get_tweets.
                if already_followed and tweet['time'] <= last_update:
                    continue

                text = tweet['text']
                # split() with no argument splits on any run of whitespace and
                # drops empty strings, so double spaces, tabs and newlines do
                # not skew the count the way split(" ") did.
                wordcount = len(text.split())
                db.session.add(Article(source_type='Social Media',
                                       source_name='Twitter',
                                       article_author=user,
                                       article_publishdate=tweet['time'],
                                       article_wordcount=wordcount,
                                       article_title=None,
                                       article_summary=text,
                                       article_fulltext=text,
                                       article_url=None))

        # NOTE(review): the marker is stamped after scraping finishes, so a
        # Tweet published between a handle's scrape and this line would look
        # "old" on the next run -- confirm whether that window matters.
        marker.article_publishdate = datetime.utcnow()
        db.session.commit()
        print('twitter source updated!')