Example #1
def scrape_user_to_db(username):
    """Scrape a user and insert everything on them into the database. Will overwrite existing data!"""
    with db.get_db() as cursor:

        tweets = []

        # If we haven't scraped this user before, do a full scrape. If we have, only get the tweets
        # we don't have yet.
        cursor.execute("SELECT * FROM analyzed_users WHERE username=%s",
                       username)
        if cursor.fetchone() is None:
            cursor.execute(
                "INSERT INTO analyzed_users (username, checked) VALUES (%s, NOW())",
                username)
            cursor.connection.commit()
            tweets = query_tweets_from_user(username, limit=5000)
            if len(tweets) == 0:
                return None
        else:
            cursor.execute(
                "SELECT checked FROM analyzed_users WHERE username=%s",
                username)
            d = cursor.fetchone()[0]
            d = d if d is not None else datetime.datetime.utcfromtimestamp(0)

            # If we've already checked this user's tweets within the past day, don't try again
            if (datetime.datetime.now() - d).days == 0:
                return 0

            cursor.execute(
                "UPDATE analyzed_users SET checked=NOW() WHERE username=%s",
                username)
            tweets = query_tweets_from_user(username, limit=5000)
            tweets = list(filter(lambda tw: d < tw.timestamp, tweets))

        sql = "INSERT INTO tweets (username, content, created, retweets, favorites, replies, is_retweet, id, sentiment) " \
           "VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)"
        set_username = False
        for tweet in tweets:
            try:
                # Set the user's full name if it hasn't already been set.
                if not set_username and tweet.user.lower() == username.lower():
                    cursor.execute(
                        "UPDATE analyzed_users SET fullname=%s WHERE username=%s",
                        (tweet.fullname, username))
                    set_username = True

                cursor.execute(sql,
                               (username, tweet.text, tweet.timestamp,
                                tweet.retweets, tweet.likes, tweet.replies,
                                tweet.user.lower() != username.lower(),
                                tweet.id, get_text_sentiment(tweet.text)))
            except pymysql.err.IntegrityError:
                pass
        cursor.connection.commit()
        return len(tweets)
Example #2
def get_profile_tweets(handle, filename):
    profile = query_tweets_from_user(handle, limit=10)
    print('Loading...')
    with open(filename, "w", encoding="utf-8") as output:
        json.dump(profile, output, cls=JSONEncoder)
    profile_dataframe = pd.read_json(filename, encoding='utf-8')

    # Summarize the first link of each tweet; YouTube links are skipped but
    # still get a placeholder so the rows stay aligned with the dataframe.
    lstt = []
    for i, val in enumerate(profile_dataframe['links']):
        if str(val) == '[]':
            lstt.append('[]')
        elif str(val[0][0:17]) == 'https://youtu.be/':
            lstt.append('[]')
        else:
            lstt.append(summary(profile_dataframe['links'][i][0]))
    profile_dataframe['summary'] = pd.DataFrame(lstt)

    # Flag the rows that actually received a summary.
    is_summary = [1 if len(str(idx)) > 3 else 0
                  for idx in profile_dataframe['summary']]
    profile_dataframe['is_summary'] = pd.DataFrame(is_summary)

    profile_dataframe.to_csv(filename[:-5] + ".csv")
    print('Loaded')
Example #3
def get_profile_tweets(handle, filename):
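    # Scrape a small batch of the user's tweets (limit=10), dump them to JSON,
    # then reload the dump with pandas and export it as a CSV.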
    profile = query_tweets_from_user(handle, limit=10)
    print('Loading...')
    with open(filename, "w", encoding="utf-8") as output:
        json.dump(profile, output, cls=JSONEncoder)
    profile_dataframe = pd.read_json(filename, encoding='utf-8')
    profile_dataframe.to_csv('profile_tweets.csv')
    print('Loaded')
Example #4
def create_data(user_name):
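    # Collect up to 100 of the user's tweets into a one-column DataFrame
    # (newlines flattened to commas), write it to CSV, and return both.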
    df = pd.DataFrame(columns=['tweet'])
    a = query_tweets_from_user(user_name, 100)
    for i, tweet in enumerate(a):
        df.loc[i] = tweet.text.replace("\n", ",")
    df.to_csv("scripts/test_dta.csv", index=False, encoding="utf-8")

    return df, "scripts/test_dta.csv"
Example #5
def get_twitter_user_data(username):
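    # Scrape recent tweets, run them through the personality helpers in `fb`,
    # and return the resulting ratings dict tagged with the user's handle and
    # display name.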
    data = get_tweets(username, 150)
    json_content_info = fb.content_info(data)
    profile = fb.personality_data(json_content_info)
    profile_dict = ast.literal_eval(fb.personality_ratings(profile))
    profile_dict['user_id'] = username
    profile_dict['name'] = query_tweets_from_user(username, 5)[0].fullname
    return profile_dict
Example #6
def user_tweets(request, username, limit):
    """
    Returns a list of user's tweets

    param username: str, specifies a user
    optional limit: int, specifies the number of tweets to retrieve, default=30
    """

    tweets = query_tweets_from_user(username, limit)
    data = [format_tweet(tweet) for tweet in tweets[:limit]]
    return JsonResponse(data, safe=False)
Example #7
    def get(self, user):
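        # Read pages_limit from the request parser (falling back to
        # DEFAULT_PAGES_LIMIT) and return each scraped tweet serialized
        # with _transform_to_json.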
        args = parser.parse_args()
        pages_limit = args.get('pages_limit', DEFAULT_PAGES_LIMIT)
        list_of_tweets = [
            _transform_to_json(tweet) for tweet in query_tweets_from_user(
                user=user,
                limit=pages_limit
            )[:pages_limit]
        ]

        return [list_of_tweets]
Example #8
def get_user_info(twitter_user):
    """
    An example of using the query_user_info method
    :param twitter_user: the twitter user to capture user data
    :return: twitter_user_data: returns a dictionary of twitter user data
    """
    user_info = query_user_info(user=twitter_user)
    twitter_user_data = {}
    twitter_user_data["user"] = user_info.user  # feature: screen_name_length
    twitter_user_data["fullname"] = user_info.full_name  # user name
    twitter_user_data["location"] = user_info.location  # feature: location
    twitter_user_data["blog"] = user_info.blog  # feature: url
    twitter_user_data["date_joined"] = user_info.date_joined  # feature: age
    twitter_user_data["id"] = user_info.id  # twitter account id
    twitter_user_data["num_tweets"] = user_info.tweets  # feature: statuses_count
    twitter_user_data["following"] = user_info.following  # feature: friends_count
    twitter_user_data["followers"] = user_info.followers  # feature: followers_count
    twitter_user_data["likes"] = user_info.likes  # feature: favourites_count
    twitter_user_data["lists"] = user_info.lists  # feature: listed_count
    #twitter_user_data["is_verified"] = user_info.is_verified
    twitter_user_data["description"] = user_info.description

    # verified
    # quoted tweets

    max_num_tweets = 10
    latest_tweets = query_tweets_from_user(
        twitter_user, limit=max_num_tweets)  # list of up to max_num_tweets Tweet objects
    (tweets, tweets_html, hashtags, has_media, num_retweets, num_likes, links,
     num_replies, reply_to_users, timestamp_epochs, is_quoted_tweet,
     quoted_user, quoted_text) = get_tweet_attribute(latest_tweets)

    twitter_user_data["tweets"] = tweets
    twitter_user_data["tweets_html"] = tweets_html
    twitter_user_data["hashtags"] = hashtags
    twitter_user_data["has_media"] = has_media
    twitter_user_data["num_retweets"] = num_retweets
    twitter_user_data["num_likes"] = num_likes
    twitter_user_data['links'] = links
    twitter_user_data['num_replies'] = num_replies
    twitter_user_data['reply_to_users'] = reply_to_users
    twitter_user_data['timestamp_epochs'] = timestamp_epochs
    twitter_user_data['is_quoted_tweet'] = is_quoted_tweet
    #twitter_user_data['is_retweet'] = is_retweet
    twitter_user_data['quoted_user'] = quoted_user
    twitter_user_data['quoted_text'] = quoted_text
    #twitter_user_data['retweet_user'] = retweet_user
    #twitter_user_data['retweet_text'] = retweet_text

    return twitter_user_data
Example #9
def queringTweets(username):
    filename = "{}.json".format(username)
    filename1 = "{}.txt".format(username)
    tweets = query_tweets_from_user(username)
    j = []
    with open(filename1, "a") as f:
        for t in tweets:
            t.timestamp = t.timestamp.isoformat()
            f.write(" Tweet ID:{} Date:{}: {} \n".format(t.tweet_id,
                                                         t.timestamp, t.text))
            #j.append(t.__dict__)
    # with open(filename, "w") as f:
Example #10
    def _download_tweeted_and_retweeted(self):
        print('Downloading tweeted and retweeted...')
        for tweet in query_tweets_from_user(self.username):
            if tweet.user != self.username:
                occurences = self.num_retweets_by_originator.get(tweet.user, 0)
                self.num_retweets_by_originator[tweet.user] = occurences + 1
                self.retweeted[tweet.id] = self._serialize_tweet(tweet)
            else:
                self.tweeted[tweet.id] = self._serialize_tweet(tweet)
        self.users_retweeted = list(
            map(
                lambda item: item[0],
                sorted(self.num_retweets_by_originator.items(),
                       key=lambda item: item[1],
                       reverse=True)))
Example #11
def get_tweets(user):
    """
    Scrape twitter to get tweets from user.
    """

    # Currently pulls 200. Can adjust number.
    list_of_tweets = query_tweets_from_user(user, 200)

    tweets = []

    # filter out retweets and direct quotations from other people
    for tweet in list_of_tweets:
        if tweet.user == user and "\u201c" not in tweet.text:
            # replace slanted apostrophes with normal ones
            tweets.append(re.sub(u"([‘’])", "'", tweet.text))

    return tweets
Example #12
def main():
    '''
    start = time.time()
    users = ['realDonaldTrump']

    pool = Pool(8)    
    for user in pool.map(get_user_info,users):
        twitter_user_info.append(user)

    cols=['id','fullname','date_joined','location','blog', 'num_tweets','following','followers','likes','lists']
    data_frame = pd.DataFrame(twitter_user_info, index=users, columns=cols)
    data_frame.index.name = "Users"
    data_frame.sort_values(by="followers", ascending=False, inplace=True, kind='quicksort', na_position='last')
    elapsed = time.time() - start
    print("Elapsed time: ")
    print(elapsed)
    display(data_frame)
'''
    with open("data.json", "w") as f:
        f.write("Current time {} START!!!\n".format(datetime.now().ctime()))
    users = [
        'realDonaldTrump', 'RobinBew', 'TheEIU', 'TheEconomist', 'seanmdav',
        'erm3114', 'AgatheDemarais', 'john_c_ferguson', 'maxlambertson',
        'davidfrum', 'Lagarde', 'RobertAlanWard'
    ]
    json_object_array = []
    data = {}
    tweet_count_old = 0
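    # For each user, capture the screen name, timestamp and text of their most
    # recent tweets into json_object_array, logging per-user counts to data.json.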
    for user in users:
        for tweet in query_tweets_from_user(user, limit=10):
            # Keep values as str so json.dumps below can serialize them.
            data['screen_name'] = tweet.screen_name
            data['timestamp'] = tweet.timestamp.ctime()
            data['text'] = tweet.text
            json_dump = json.dumps(data)
            json_object_array.append(json.loads(json_dump))
        with open("data.json", "a") as f:
            f.write("Got {} tweets from username {}\n".format(
                len(json_object_array) - tweet_count_old, user))
        tweet_count_old = len(json_object_array)
    with open('data.json', 'a') as f:
        json.dump(json_object_array, f, indent=2)
Example #13
def download_user(user_id: str):
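    # Scrape all available tweets for the user, dump them to tweets.json,
    # hand each tweet to download_tw via a thread pool, and append any
    # video URLs to videos.txt.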
    work_dir = '../build/outputs'
    user_dir = work_dir + '/' + user_id
    os.makedirs(user_dir, exist_ok=True)

    tweet: Tweet
    tweets = query_tweets_from_user(user_id)
    with open(user_dir + "/tweets.json", "w", encoding="utf-8") as output:
        json.dump(tweets, output, ensure_ascii=False, cls=JSONEncoder)

    videos = []
    pool_size = 16
    pool = ThreadPool(pool_size)
    pool.map(partial(download_tw, user_dir=user_dir), tweets)

    for tweet in tweets:
        if tweet.video_url:
            videos.append(tweet.video_url + "\n")

    with open(user_dir + "/videos.txt", "a", encoding="utf-8") as video:
        video.writelines(videos)
Example #14
def main():
    try:
        parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter,
            description=__doc__
        )

        parser.add_argument("query", type=str, help="Advanced twitter query")
        parser.add_argument("-o", "--output", type=str, default="tweets.json",
                            help="Path to a JSON file to store the gathered "
                                 "tweets to.")
        parser.add_argument("-l", "--limit", type=int, default=None,
                            help="Number of minimum tweets to gather.")
        parser.add_argument("-a", "--all", action='store_true',
                            help="Set this flag if you want to get all tweets "
                                 "in the history of twitter. Begindate is set to 2006-03-01."
                                 "This may take a while. You can increase the number of parallel"
                                 "processes depending on the computational power you have.")
        parser.add_argument("-c", "--csv", action='store_true',
                                help="Set this flag if you want to save the results to a CSV format.")
        parser.add_argument("-u", "--user", action='store_true',
                            help="Set this flag to if you want to scrape tweets from a specific user"
                                 "The query should then consist of the profilename you want to scrape without @")
        parser.add_argument("--lang", type=str, default=None,
                            help="Set this flag if you want to query tweets in \na specific language. You can choose from:\n"
                                 "en (English)\nar (Arabic)\nbn (Bengali)\n"
                                 "cs (Czech)\nda (Danish)\nde (German)\nel (Greek)\nes (Spanish)\n"
                                 "fa (Persian)\nfi (Finnish)\nfil (Filipino)\nfr (French)\n"
                                 "he (Hebrew)\nhi (Hindi)\nhu (Hungarian)\n"
                                 "id (Indonesian)\nit (Italian)\nja (Japanese)\n"
                                 "ko (Korean)\nmsa (Malay)\nnl (Dutch)\n"
                                 "no (Norwegian)\npl (Polish)\npt (Portuguese)\n"
                                 "ro (Romanian)\nru (Russian)\nsv (Swedish)\n"
                                 "th (Thai)\ntr (Turkish)\nuk (Ukranian)\n"
                                 "ur (Urdu)\nvi (Vietnamese)\n"
                                 "zh-cn (Chinese Simplified)\n"
                                 "zh-tw (Chinese Traditional)"
                                 )
        parser.add_argument("-d", "--dump", action="store_true",
                            help="Set this flag if you want to dump the tweets \nto the console rather than outputting to a file")
        parser.add_argument("-bd", "--begindate", type=valid_date, default="2006-03-21",
                            help="Scrape for tweets starting from this date. Format YYYY-MM-DD. \nDefault value is 2006-03-21", metavar='\b')
        parser.add_argument("-ed", "--enddate", type=valid_date, default=dt.date.today(),
                            help="Scrape for tweets until this date. Format YYYY-MM-DD. \nDefault value is the date of today.", metavar='\b')
        parser.add_argument("-p", "--poolsize", type=int, default=20, help="Specify the number of parallel process you want to run. \n"
                            "Default value is set to 20. \nYou can change this number if you have more computing power available. \n"
                            "Set to 1 if you dont want to run any parallel processes.", metavar='\b')
        args = parser.parse_args()

        if isfile(args.output) and not args.dump:
            logger.error("Output file already exists! Aborting.")
            exit(-1)

        if args.all:
            args.begindate = dt.date(2006,3,1)

        if args.user:
            tweets = query_tweets_from_user(user = args.query, limit = args.limit)
        else:
            tweets = query_tweets(query = args.query, limit = args.limit,
                              begindate = args.begindate, enddate = args.enddate,
                              poolsize = args.poolsize, lang = args.lang)

        if args.dump:
            print(json.dumps(tweets, cls=JSONEncoder))
        else:
            if tweets:
                with open(args.output, "w", encoding="utf-8") as output:
                    if args.csv:
                        f = csv.writer(output)
                        f.writerow(["user", "fullname", "tweet-id", "timestamp", "url", "likes", "replies", "retweets", "text", "html"])
                        for x in tweets:
                            f.writerow([x.user, x.fullname, x.id, x.timestamp, x.url,
                                        x.likes, x.replies, x.retweets,
                                        x.text, x.html])
                    else:
                        json.dump(tweets, output, cls=JSONEncoder)
    except KeyboardInterrupt:
        logger.info("Program interrupted by user. Quitting...")
Example #15
    def get_user_tweets(self, username, limit):
        tweets = query_tweets_from_user(username, limit=limit)
        tweets.reverse()
        return tweet.Tweet.create_from_scraper_response(tweets)
Example #16
def main():
    try:
        parser = argparse.ArgumentParser(
            formatter_class=argparse.RawTextHelpFormatter, description=__doc__)

        parser.add_argument("query", type=str, help="Advanced twitter query")
        parser.add_argument("-o",
                            "--output",
                            type=str,
                            default="tweets.json",
                            help="Path to a JSON file to store the gathered "
                            "tweets to.")
        parser.add_argument("-l",
                            "--limit",
                            type=int,
                            default=None,
                            help="Number of minimum tweets to gather.")
        parser.add_argument(
            "-a",
            "--all",
            action='store_true',
            help="Set this flag if you want to get all tweets "
            "in the history of twitter. Begindate is set to 2006-03-01."
            "This may take a while. You can increase the number of parallel"
            "processes depending on the computational power you have.")
        parser.add_argument(
            "-c",
            "--csv",
            action='store_true',
            help=
            "Set this flag if you want to save the results to a CSV format.")
        parser.add_argument(
            "-u",
            "--user",
            action='store_true',
            help=
            "Set this flag to if you want to scrape tweets from a specific user"
            "The query should then consist of the profilename you want to scrape without @"
        )
        parser.add_argument(
            "--profiles",
            action='store_true',
            help=
            "Set this flag to if you want to scrape profile info of all the users where you"
            "have previously scraped from. After all of the tweets have been scraped it will start"
            "a new process of scraping profile pages.")
        parser.add_argument(
            "--lang",
            type=str,
            default=None,
            help=
            "Set this flag if you want to query tweets in \na specific language. You can choose from:\n"
            "en (English)\nar (Arabic)\nbn (Bengali)\n"
            "cs (Czech)\nda (Danish)\nde (German)\nel (Greek)\nes (Spanish)\n"
            "fa (Persian)\nfi (Finnish)\nfil (Filipino)\nfr (French)\n"
            "he (Hebrew)\nhi (Hindi)\nhu (Hungarian)\n"
            "id (Indonesian)\nit (Italian)\nja (Japanese)\n"
            "ko (Korean)\nmsa (Malay)\nnl (Dutch)\n"
            "no (Norwegian)\npl (Polish)\npt (Portuguese)\n"
            "ro (Romanian)\nru (Russian)\nsv (Swedish)\n"
            "th (Thai)\ntr (Turkish)\nuk (Ukranian)\n"
            "ur (Urdu)\nvi (Vietnamese)\n"
            "zh-cn (Chinese Simplified)\n"
            "zh-tw (Chinese Traditional)")
        parser.add_argument(
            "-d",
            "--dump",
            action="store_true",
            help=
            "Set this flag if you want to dump the tweets \nto the console rather than outputting to a file"
        )
        parser.add_argument(
            "-ow",
            "--overwrite",
            action="store_true",
            help=
            "Set this flag if you want to overwrite the existing output file.")
        parser.add_argument(
            "-bd",
            "--begindate",
            type=valid_date,
            default="2006-03-21",
            help=
            "Scrape for tweets starting from this date. Format YYYY-MM-DD. \nDefault value is 2006-03-21",
            metavar='\b')
        parser.add_argument(
            "-ed",
            "--enddate",
            type=valid_date,
            default=dt.date.today(),
            help=
            "Scrape for tweets until this date. Format YYYY-MM-DD. \nDefault value is the date of today.",
            metavar='\b')
        parser.add_argument(
            "-p",
            "--poolsize",
            type=int,
            default=20,
            help="Specify the number of parallel process you want to run. \n"
            "Default value is set to 20. \nYou can change this number if you have more computing power available. \n"
            "Set to 1 if you dont want to run any parallel processes.",
            metavar='\b')
        parser.add_argument(
            "--loglevel",
            type=valid_loglevel,
            default=logging.INFO,
            help="Specify the level for logging. \n"
            "Must be a valid value from https://docs.python.org/2/library/logging.html#logging-levels. \n"
            "Default log level is set to INFO.")
        parser.add_argument(
            "-dp",
            "--disableproxy",
            action="store_true",
            default=False,
            help=
            "Set this flag if you want to disable use of proxy servers when scrapping tweets and user profiles. \n"
        )
        args = parser.parse_args()

        logging.basicConfig()
        logger.setLevel(args.loglevel)

        if isfile(args.output) and not args.dump and not args.overwrite:
            logger.error("Output file already exists! Aborting.")
            exit(-1)

        if args.all:
            args.begindate = dt.date(2006, 3, 1)

        if args.user:
            tweets = query_tweets_from_user(user=args.query,
                                            limit=args.limit,
                                            use_proxy=not args.disableproxy)
        else:
            tweets = query_tweets(query=args.query,
                                  limit=args.limit,
                                  begindate=args.begindate,
                                  enddate=args.enddate,
                                  poolsize=args.poolsize,
                                  lang=args.lang,
                                  use_proxy=not args.disableproxy)

        if args.dump:
            pprint([tweet.__dict__ for tweet in tweets])
        else:
            if tweets:
                with open(args.output, "w", encoding="utf-8") as output:
                    if args.csv:
                        f = csv.writer(output,
                                       delimiter=";",
                                       quoting=csv.QUOTE_NONNUMERIC)
                        f.writerow([
                            "screen_name", "username", "user_id", "tweet_id",
                            "tweet_url", "timestamp", "timestamp_epochs",
                            "text", "text_html", "links", "hashtags",
                            "has_media", "img_urls", "video_url", "likes",
                            "retweets", "replies", "is_replied", "is_reply_to",
                            "parent_tweet_id", "reply_to_users"
                        ])
                        for t in tweets:
                            f.writerow([
                                t.screen_name, t.username, t.user_id,
                                t.tweet_id, t.tweet_url, t.timestamp,
                                t.timestamp_epochs, t.text, t.text_html,
                                t.links, t.hashtags, t.has_media, t.img_urls,
                                t.video_url, t.likes, t.retweets, t.replies,
                                t.is_replied, t.is_reply_to, t.parent_tweet_id,
                                t.reply_to_users
                            ])
                    else:
                        json.dump(tweets, output, cls=JSONEncoder)
            if args.profiles and tweets:
                list_users = list(set([tweet.username for tweet in tweets]))
                list_users_info = [
                    query_user_info(elem, not args.disableproxy)
                    for elem in list_users
                ]
                filename = 'userprofiles_' + args.output
                with open(filename, "w", encoding="utf-8") as output:
                    json.dump(list_users_info, output, cls=JSONEncoder)
    except KeyboardInterrupt:
        logger.info("Program interrupted by user. Quitting...")
Example #17
def get_tweets(username, limit):
    """Given a Twitter username, return a list of their most recent tweets."""
    tweets = []
    for tweet in query_tweets_from_user(username, limit):
        tweets.append(tweet.text)
    return tweets
Example #18
def get_user_tweets():
    user_tweets_list = query.query_tweets_from_user(user=q, limit=tweets_num)
    return user_tweets_list
Example #19
def main():
    try:
        parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter,
            description=__doc__
        )

        parser.add_argument("query", type=str, help="Advanced twitter query")
        parser.add_argument("-o", "--output", type=str, default="tweets.json",
                            help="Path to a JSON file to store the gathered "
                                 "tweets to.")
        parser.add_argument("-l", "--limit", type=int, default=None,
                            help="Number of minimum tweets to gather.")
        parser.add_argument("-a", "--all", action='store_true',
                            help="Set this flag if you want to get all tweets "
                                 "in the history of twitter. Begindate is set to 2006-03-01."
                                 "This may take a while. You can increase the number of parallel"
                                 "processes depending on the computational power you have.")
        parser.add_argument("-c", "--csv", action='store_true',
                                help="Set this flag if you want to save the results to a CSV format.")
        parser.add_argument("-u", "--user", action='store_true',
                            help="Set this flag to if you want to scrape tweets from a specific user"
                                 "The query should then consist of the profilename (user) you want to scrape without @")
        parser.add_argument("--profiles", action='store_true',
                            help="Set this flag to if you want to scrape profile info of all the users where you" 
                            "have previously scraped from. After all of the tweets have been scraped it will start"
                            "a new process of scraping profile pages.")
        parser.add_argument("--lang", type=str, default=None,
                            help="Set this flag if you want to query tweets in \na specific language. You can choose from:\n"
                                 "en (English)\nar (Arabic)\nbn (Bengali)\n"
                                 "cs (Czech)\nda (Danish)\nde (German)\nel (Greek)\nes (Spanish)\n"
                                 "fa (Persian)\nfi (Finnish)\nfil (Filipino)\nfr (French)\n"
                                 "he (Hebrew)\nhi (Hindi)\nhu (Hungarian)\n"
                                 "id (Indonesian)\nit (Italian)\nja (Japanese)\n"
                                 "ko (Korean)\nmsa (Malay)\nnl (Dutch)\n"
                                 "no (Norwegian)\npl (Polish)\npt (Portuguese)\n"
                                 "ro (Romanian)\nru (Russian)\nsv (Swedish)\n"
                                 "th (Thai)\ntr (Turkish)\nuk (Ukranian)\n"
                                 "ur (Urdu)\nvi (Vietnamese)\n"
                                 "zh-cn (Chinese Simplified)\n"
                                 "zh-tw (Chinese Traditional)"
                                 )
        parser.add_argument("-d", "--dump", action="store_true",
                            help="Set this flag if you want to dump the tweets \nto the console rather than outputting to a file")
        parser.add_argument("-bd", "--begindate", type=valid_date, default="2006-03-21",
                            help="Scrape for tweets starting from this date. Format YYYY-MM-DD. \nDefault value is 2006-03-21", metavar='\b')
        parser.add_argument("-ed", "--enddate", type=valid_date, default=dt.date.today(),
                            help="Scrape for tweets until this date. Format YYYY-MM-DD. \nDefault value is the date of today.", metavar='\b')
        parser.add_argument("-p", "--poolsize", type=int, default=20, help="Specify the number of parallel process you want to run. \n"
                            "Default value is set to 20. \nYou can change this number if you have more computing power available. \n"
                            "Set to 1 if you dont want to run any parallel processes.", metavar='\b')
        args = parser.parse_args()

        if isfile(args.output) and not args.dump:
            logger.error("Output file already exists! Aborting.")
            exit(-1)

        if args.all:
            args.begindate = dt.date(2006,3,1)
            args.enddate = dt.date.today()

        if args.user:
            tweets = query_tweets_from_user(user = args.query, limit = args.limit)
        else:
            tweets = query_tweets(query = args.query, limit = args.limit,
                              begindate = args.begindate, enddate = args.enddate,
                              poolsize = args.poolsize, lang = args.lang)

        if args.dump:
            print(json.dumps(tweets, cls=JSONEncoder))
        else:
            if tweets:
                with open(args.output, "w", encoding="utf-8") as output:
                    if args.csv:
                        f = csv.writer(output)
                        f.writerow(["user", "fullname", "tweet-id", "timestamp", "url", "likes", "replies", "retweets", "text", "html"])
                        for x in tweets:
                            f.writerow([x.user, x.fullname, x.id, x.timestamp, x.url,
                                        x.likes, x.replies, x.retweets,
                                        x.text, x.html])
                    else:
                        json.dump(tweets, output, cls=JSONEncoder)

            if args.profiles and tweets:
                list_users = list(set([tweet.user for tweet in tweets]))
                # list_users_info = [query_user_info(elem) for elem in list_users]
                filename = 'userprofiles_' + args.output

                with open(filename, "w", encoding="utf-8") as output:
                    if args.csv:
                        f = csv.writer(output)
                        f.writerow(["user","fullname","location","blog","date_joined","id","num_tweets","following","followers","likes","lists"])
                        for elem in list_users:
                            u = query_user_info(elem)
                            if u is None:
                                continue
                            else:
                                f.writerow([u.user, u.full_name, u.location, u.blog, u.date_joined, u.id, u.tweets, u.following,
                                u.followers, u.likes, u.lists])

                    else:
                        for elem in list_users:
                            u = query_user_info(elem)
                            if u is None:
                                continue
                            else:
                                json.dump(u, output, cls=JSONEncoder, indent=2)

    except KeyboardInterrupt:
        logger.info("Program interrupted by user. Quitting...")
Example #20
def get_tweet():
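    # Scrape tweets for the module-level target_account_id (bounded by
    # page_limit) and run each one through analysis_tweet.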
    tweet_list = query_tweets_from_user(target_account_id, page_limit)
    for tweet in tweet_list:
        analysis_tweet(tweet)
Example #21
def get_user_tweets(user, limit):
    return query_tweets_from_user(user, limit=limit)
Example #22
import pandas as pd
import numpy as np
import os
from twitterscraper import query_tweets
from twitterscraper.query import query_tweets_from_user

username = "******"
c.Output = r"data_scraping/data_retweet/test_csv.csv"
try:
    os.remove("data_scraping/data_retweet/test_csv.csv")
except:
    pass

# CSV Fieldnames
list_of_tweets = query_tweets_from_user(username, limit=10)

#print the retrieved tweets to the screen:
for tweet in list_of_tweets:
    print(tweet.text)

#Or save the retrieved tweets to file:
# with open("data_scraping/data_retweet/test.txt",'w') as file:
#     for tweet in query_tweets("Trump OR Clinton", 10):
#         file.write(tweet.encode('utf-8'))
# file.close()

if __name__ == '__main__':
    # receive list of participant ids in the seed group
    pro_colname = "Pro-Israeli sources on Twitter"
    anti_colname = "Anti Israeli sources on Twitter"
    seed_list = pd.read_csv(r"data_scraping/twitter_seeds.csv", header=0)