def test_scrap_tweet_with_double_media():
    """A tweet known to carry two media attachments is scrapped with both."""
    ids_to_fetch = ['1115978039534297088']
    output = st.CollectorTweetOutput()
    st.TweetsByIdsRunner(st.TweetsByIdsTask(ids_to_fetch), [output]).run()
    scrapped = output.get_scrapped_tweets()
    assert len(scrapped) == 1
    assert len(scrapped[0].media) == 2
def test_scrap_tweet_with_single_media():
    """A tweet known to carry exactly one media attachment is scrapped with it."""
    ids_to_fetch = ['1357358278746005508']
    output = st.CollectorTweetOutput()
    st.TweetsByIdsRunner(st.TweetsByIdsTask(ids_to_fetch), [output]).run()
    scrapped = output.get_scrapped_tweets()
    assert len(scrapped) == 1
    assert len(scrapped[0].media) == 1
def test_csv_serialization():
    """Tweets written through CsvTweetOutput round-trip via read_tweets_from_csv_file."""
    target_file = get_temp_test_file_name('csv')
    collector = st.CollectorTweetOutput()
    get_tweets_to_tweet_output_test(
        [st.CsvTweetOutput(target_file), collector])
    round_tripped = st.read_tweets_from_csv_file(target_file)
    two_lists_assert_equal(round_tripped, collector.get_raw_list())
def test_file_json_lines_serialization():
    """Tweets written through JsonLineFileTweetOutput round-trip from the .jl file."""
    target_file = get_temp_test_file_name('jl')
    collector = st.CollectorTweetOutput()
    get_tweets_to_tweet_output_test(
        [st.JsonLineFileTweetOutput(target_file), collector])
    round_tripped = st.read_tweets_from_json_lines_file(target_file)
    two_lists_assert_equal(round_tripped, collector.get_raw_list())
def _scrap_tweets_with_count_assert(count: int):
    """Search '#covid19' limited to *count* tweets; assert exactly that many arrive."""
    task = st.SearchTweetsTask(all_words='#covid19', tweets_limit=count)
    collector = st.CollectorTweetOutput()
    st.TweetSearchRunner(search_tweets_task=task,
                         tweet_outputs=[collector]).run()
    assert len(collector.get_scrapped_tweets()) == count
def test_print_all_tweet_output():
    """PrintTweetOutput prints one 'Tweet(' entry per scrapped tweet.

    stdout is restored in a ``finally`` block: the original restored it only
    on the success path, so any failure during the scrape would leave the
    whole test session with a redirected stdout.
    """
    captured_output = StringIO()
    sys.stdout = captured_output
    try:
        tweets_collector = st.CollectorTweetOutput()
        get_tweets_to_tweet_output_test(
            [st.PrintTweetOutput(), tweets_collector])
    finally:
        sys.stdout = sys.__stdout__
    assert captured_output.getvalue().count('Tweet(') == len(
        tweets_collector.get_raw_list())
def test_return_tweets_from_user():
    """Every tweet returned by a from_username search belongs to that user."""
    username = '******'
    task = st.SearchTweetsTask(from_username=username, tweets_limit=100)
    collector = st.CollectorTweetOutput()
    st.TweetSearchRunner(search_tweets_task=task,
                         tweet_outputs=[collector]).run()

    def _is_from_user(tweet):
        return tweet.user_name == username

    tweet_list_assert_condition(collector.get_scrapped_tweets(), _is_from_user)
def test_exact_words():
    """An exact_words search returns tweets whose text contains the exact phrase."""
    exact_phrase = 'duda kaczyński kempa'
    task = st.SearchTweetsTask(exact_words=exact_phrase)
    collector = st.CollectorTweetOutput()
    st.TweetSearchRunner(search_tweets_task=task,
                         tweet_outputs=[collector]).run()
    # Compare in normalized (base-text) form so diacritics don't cause misses.
    normalized_phrase = to_base_text(exact_phrase)

    def _contains_phrase(tweet):
        return normalized_phrase in to_base_text(tweet.full_text)

    tweet_list_assert_condition(collector.get_raw_list(), _contains_phrase)
def test_search_to_username():
    """Tweets from a to_username search mention the target user in their text."""
    username = '******'
    task = st.SearchTweetsTask(to_username=username, tweets_limit=100)
    collector = st.CollectorTweetOutput()
    st.TweetSearchRunner(search_tweets_task=task,
                         tweet_outputs=[collector]).run()

    def _mentions_user(tweet):
        return to_base_text(username) in to_base_text(tweet.full_text)

    tweet_list_assert_condition(collector.get_scrapped_tweets(),
                                _mentions_user)
def _run_search_test_covid_tweets_in_language(language: st.Language):
    """Search '#covid19' filtered by *language*; every result must carry that lang code."""
    task = st.SearchTweetsTask(all_words='#covid19',
                               tweets_limit=100,
                               language=language)
    collector = st.CollectorTweetOutput()
    st.TweetSearchRunner(search_tweets_task=task,
                         tweet_outputs=[collector]).run()

    def _has_expected_lang(tweet):
        return tweet.lang == language.short_value

    tweet_list_assert_condition(collector.get_scrapped_tweets(),
                                _has_expected_lang)
def _run_test_between_dates(since: Arrow, until: Arrow):
    """Search between two dates; every result's creation time must fall in range."""
    task = st.SearchTweetsTask(any_word="#koronawirus #covid19",
                               since=since,
                               until=until)
    collector = st.CollectorTweetOutput()
    st.TweetSearchRunner(search_tweets_task=task,
                         tweet_outputs=[collector]).run()

    def _created_in_range(tweet):
        return since <= tweet.created_at <= until

    tweet_list_assert_condition(collector.get_scrapped_tweets(),
                                _created_in_range)
def test_get_not_existing_tweet():
    """A 404 from the syndication endpoint yields zero downloads and one unscrapped id."""
    ids_to_fetch = ['1337071849772093442']
    output = st.CollectorTweetOutput()
    # Stub the web client so the syndication endpoint always answers 404.
    stub_client = CustomAdapter([('https://cdn.syndication.twimg.com/tweet',
                                  RequestResponse(404, ''))])
    result = st.TweetsByIdRunner(st.TweetsByIdTask(ids_to_fetch), [output],
                                 web_client=stub_client).run()
    assert result.downloaded_count == 0
    assert len(result.tweet_ids_not_scrapped) == 1
def run_test_for_single_language(language: st.Language):
    """Search '#covid19' filtered by *language*; assert results carry that lang code.

    Fixed: the original checked ``tweet.lang in language.short_value``, which
    on a string value is a substring test (and vacuously true for an empty
    ``lang``); the sibling helper ``_run_search_test_covid_tweets_in_language``
    compares with ``==``, so this now does the same for consistency.
    """
    search_tweets_task = st.SearchTweetsTask(all_words='#covid19',
                                             tweets_limit=10,
                                             language=language)
    tweets_collector = st.CollectorTweetOutput()
    st.TweetSearchRunner(search_tweets_task=search_tweets_task,
                         tweet_outputs=[tweets_collector]).run()
    tweet_list_assert_condition(
        tweets_collector.get_raw_list(),
        lambda tweet: tweet.lang == language.short_value)
def test_get_tweets_by_ids():
    """Of two requested ids exactly one downloads; the other is reported unscrapped.

    NOTE(review): another test with this exact name appears later in the file;
    under pytest the later definition shadows this one, so this test is never
    collected — one of the two should be renamed.
    """
    tweets_ids = ['1337071849772093442', '1337067073051238400']
    task = st.TweetsByIdsTask(tweets_ids)
    collect_output = st.CollectorTweetOutput()
    result = st.TweetsByIdsRunner(task, [collect_output]).run()
    # Removed the unused `scrapped_tweets_ids` list comprehension the original
    # built but never asserted on.
    assert result.downloaded_count == 1
    assert len(collect_output.get_scrapped_tweets()) == 1
    assert len(result.tweet_ids_not_scrapped) == 1
def search_by_hashtag():
    """Search a hashtag and assert every scrapped tweet's text contains it.

    Fixed: the original asserted
    ``all([phrase in it.full_text for it in scrapped if phrase in it.full_text])``
    — the filter repeats the predicate, so the assertion was vacuously true.
    The meaningful check is that *all* results contain the phrase.
    """
    phrase = '#koronawirus'
    search_tweets_task = st.SearchTweetsTask(all_words=phrase,
                                             tweets_limit=200)
    tweets_collector = st.CollectorTweetOutput()
    st.TweetSearchRunner(search_tweets_task=search_tweets_task,
                         tweet_outputs=[tweets_collector]).run()
    scrapped_tweets = tweets_collector.get_raw_list()
    # NOTE(review): a match may also occur in user fields rather than the
    # text; confirm the strict full_text check is the intended contract.
    assert all(phrase in it.full_text for it in scrapped_tweets)
def test_get_not_existing_tweet_in_twitter():
    """An empty adaptive-search payload yields zero downloads and one unscrapped id."""
    ids_to_fetch = ['1337071849772093442']
    output = st.CollectorTweetOutput()
    # Stub the search API so it answers 200 with a payload containing no tweets.
    stub_client = CustomAdapter([
        ('https://api.twitter.com/2/search/adaptive.json',
         RequestResponse(200, _TWITTER_JSON_NO_TWEETS))
    ])
    result = st.TweetsByIdRunner(st.TweetsByIdTask(ids_to_fetch), [output],
                                 web_client=stub_client).run()
    assert result.downloaded_count == 0
    assert len(result.tweet_ids_not_scrapped) == 1
def test_using_proxy_client():
    """A search routed through a local HTTP/HTTPS proxy still fills the limit."""
    task = st.SearchTweetsTask(all_words='#covid19', tweets_limit=200)
    proxy_config = st.RequestsWebClientProxyConfig(
        http_proxy='http://localhost:3128',
        https_proxy='http://localhost:3128')
    proxy_client = st.RequestsWebClient(proxy_config)
    collector = st.CollectorTweetOutput()
    result = st.TweetSearchRunner(search_tweets_task=task,
                                  tweet_outputs=[collector],
                                  web_client=proxy_client).run()
    assert isinstance(result, st.SearchTweetsResult)
    assert len(collector.get_scrapped_tweets()) == task.tweets_limit
def test_tweet_csv_read_iterator():
    """TweetCsvFileIterator yields, in chunks of 4, exactly the tweets written to the CSV.

    Fixed: the original opened the iterator but never closed it (the
    JSON-lines twin test does); the close now runs in a ``finally`` so the
    file handle is released even when an assertion inside the loop fails.
    """
    file_name = get_temp_test_file_name('csv')
    collector = st.CollectorTweetOutput()
    get_tweets_to_tweet_output_test([collector, st.CsvTweetOutput(file_name)])
    iterator = st.TweetCsvFileIterator(file_name, 4)
    list_from_iterator = []
    iterator.open()
    try:
        while True:
            try:
                list_from_iterator.extend(next(iterator))
            except StopIteration:
                break
    finally:
        iterator.close()
    two_lists_assert_equal(list_from_iterator, collector.get_raw_list())
def test_get_tweets_by_ids():
    """By-id fetch through an explicit web client with logging + auth interceptors.

    NOTE(review): this redefines `test_get_tweets_by_ids` from earlier in the
    file, shadowing it so only this version is collected by pytest — one of
    the two should be renamed.
    """
    tweets_ids = ['1337071849772093442', '1337067073051238400']
    task = st.TweetsByIdTask(tweets_ids)
    collect_output = st.CollectorTweetOutput()
    web_client = RequestsWebClient(interceptors=[
        LoggingRequestsWebClientInterceptor(),
        TwitterAuthWebClientInterceptor()
    ])
    result = st.TweetsByIdRunner(task, [collect_output],
                                 web_client=web_client).run()
    # Removed the unused `scrapped_tweets_ids` list comprehension the original
    # built but never asserted on.
    assert result.downloaded_count == 1
    assert len(collect_output.get_raw_list()) == 1
    assert len(result.tweet_ids_not_scrapped) == 1
def test_any_word():
    """Each any_word result mentions one of the words in its text or user fields."""
    any_phrase = 'kaczynski tusk'
    task = st.SearchTweetsTask(any_word=any_phrase, tweets_limit=100)
    collector = st.CollectorTweetOutput()
    st.TweetSearchRunner(search_tweets_task=task,
                         tweet_outputs=[collector]).run()

    def _mentions_any_word(tweet):
        return (contains_any_word(any_phrase, tweet.full_text)
                or contains_any_word(any_phrase, tweet.user_full_name)
                or contains_any_word(any_phrase, tweet.user_name))

    tweet_list_assert_condition(collector.get_raw_list(), _mentions_any_word)
def test_tweet_json_lines_read_iterator():
    """TweetJsonLineFileIterator yields, in chunks of 4, exactly the tweets written.

    Fixed: the original called ``close()`` only after the loop completed, so
    an assertion failure mid-iteration leaked the file handle; the close now
    runs in a ``finally``.
    """
    file_name = get_temp_test_file_name('jl')
    collector = st.CollectorTweetOutput()
    get_tweets_to_tweet_output_test(
        [collector, st.JsonLineFileTweetOutput(file_name)])
    iterator = st.TweetJsonLineFileIterator(file_name, 4)
    list_from_iterator = []
    iterator.open()
    try:
        while True:
            try:
                list_from_iterator.extend(next(iterator))
            except StopIteration:
                break
    finally:
        iterator.close()
    two_lists_assert_equal(list_from_iterator, collector.get_raw_list())
def test_search_as_replay():
    """With RepliesFilter.ONLY_REPLIES every result carries reply-target ids."""
    task = st.SearchTweetsTask(all_words='#covid19',
                               tweets_limit=500,
                               replies_filter=st.RepliesFilter.ONLY_REPLIES)
    collector = st.CollectorTweetOutput()
    st.TweetSearchRunner(search_tweets_task=task,
                         tweet_outputs=[collector]).run()

    def _is_reply(tweet):
        # A reply has a non-empty status-id or user-id reply target.
        combined = tweet.in_reply_to_status_id_str + tweet.in_reply_to_user_id_str
        return len(combined) > 0

    tweet_list_assert_condition(collector.get_raw_list(), _is_reply)
def twitter_report():
    """Flask view: scrape tweets, dump them to CSV, and render them — or an error page.

    Fixed: the original passed ``tweets_count=20`` to ``SearchTweetsTask``;
    every other call in this codebase uses ``tweets_limit``, so the limit
    keyword is now consistent with the rest of the file.
    """
    username = request.form['twitteruser']
    if not username:
        return render_template("error.html")
    # NOTE(review): the query is a fixed '#covid19' search; the submitted
    # username is only used for display — confirm that is intended.
    search_tweets_task = st.SearchTweetsTask(all_words='#covid19',
                                             tweets_limit=20)
    tweets_collector = st.CollectorTweetOutput()
    st.TweetSearchRunner(
        search_tweets_task=search_tweets_task,
        tweet_outputs=[tweets_collector,
                       st.CsvTweetOutput('output_file.csv')]).run()
    tweets = tweets_collector.get_scrapped_tweets()
    return render_template("tweets.html", tweets=tweets, username=username)
def test_return_tweets_objects():
    """A search returns a SearchTweetsResult consistent with the collected tweets.

    Fixed: the final assertion filtered the list with the same predicate it
    checked (``if phrase in it.full_text``), making it vacuously true; it now
    asserts the meaningful condition that every result contains the phrase.
    """
    phrase = '#koronawirus'
    search_tweets_task = st.SearchTweetsTask(all_words=phrase,
                                             tweets_limit=200)
    tweets_collector = st.CollectorTweetOutput()
    result = st.TweetSearchRunner(search_tweets_task=search_tweets_task,
                                  tweet_outputs=[tweets_collector]).run()
    scrapped_tweets = tweets_collector.get_scrapped_tweets()
    assert isinstance(result, st.SearchTweetsResult)
    assert result.downloaded_count == len(scrapped_tweets)
    assert result.downloaded_count > 0
    # NOTE(review): a match may also occur in user fields rather than the
    # text; confirm the strict full_text check is the intended contract.
    assert all(phrase in it.full_text for it in scrapped_tweets)
def get_tweets() -> List[st.Tweet]:
    """Scrape up to 100 '#covid19' tweets and return them as Tweet objects."""
    output = st.CollectorTweetOutput()
    search_task = st.SearchTweetsTask(all_words="#covid19", tweets_limit=100)
    st.TweetSearchRunner(search_task, [output]).run()
    return output.get_scrapped_tweets()