    def archive_search(self,
                       query,
                       start,
                       end,
                       dev_env,
                       max_size=2500,
                       max_call=100):
        self.settings['search_tweets_api']['endpoint'] =\
           f"https://api.twitter.com/1.1/tweets/search/fullarchive/{dev_env}.json"

        credentials = load_credentials("archive_keys.yaml",
                                       yaml_key="search_tweets_api",
                                       env_overwrite=False)

        with open('archive_keys.yaml', 'w') as config_file:
            yaml.dump(self.settings, config_file, default_flow_style=False)

        q_rule = gen_rule_payload(query,
                                  results_per_call=max_call,
                                  from_date=start,
                                  to_date=end)

        rs = ResultStream(rule_payload=q_rule,
                          max_results=max_size,
                          **credentials)

        with open('tweet_data_archive.csv', 'a', encoding='utf-8') as file:
            n = 0
            for tweet in rs.stream():
                n += 1
                if n % (max_size // 10) == 0:
                    print('{0}: {1}'.format(str(n), tweet['created_at']))
                json.dump(tweet, file)
                file.write('\n')
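
A minimal invocation sketch for the method above. The wrapper class name (TwitterArchiveClient) and the query/date values are assumptions for illustration only; the class is assumed to hold a `settings` dict with a `search_tweets_api` section, as the method requires.

client = TwitterArchiveClient()  # hypothetical wrapper class holding self.settings
client.archive_search(query="python lang:en",
                      start="2020-01-01",
                      end="2020-01-31",
                      dev_env="myDevEnv",  # premium full-archive dev environment label (placeholder)
                      max_size=2500,
                      max_call=100)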
Example #2
def main():
    args_dict = vars(parse_cmd_args().parse_args())
    if args_dict.get("debug") is True:
        logger.setLevel(logging.DEBUG)
        logger.debug("command line args dict:")
        logger.debug(json.dumps(args_dict, indent=4))

    if args_dict.get("config_filename") is not None:
        configfile_dict = read_config(args_dict["config_filename"])
    else:
        configfile_dict = {}

    extra_headers_str = args_dict.get("extra_headers")
    if extra_headers_str is not None:
        args_dict['extra_headers_dict'] = json.loads(extra_headers_str)
        del args_dict['extra_headers']

    logger.debug("config file ({}) arguments sans sensitive args:".format(
        args_dict["config_filename"]))
    logger.debug(json.dumps(_filter_sensitive_args(configfile_dict), indent=4))

    creds_dict = load_credentials(filename=args_dict["credential_file"],
                                  account_type=args_dict["account_type"],
                                  yaml_key=args_dict["credential_yaml_key"],
                                  env_overwrite=args_dict["env_overwrite"])

    dict_filter = lambda x: {k: v for k, v in x.items() if v is not None}

    config_dict = merge_dicts(dict_filter(configfile_dict),
                              dict_filter(creds_dict), dict_filter(args_dict))

    logger.debug("combined dict (cli, config, creds) sans password:"******"ERROR: not enough arguments for the program to work")
        sys.exit(1)

    stream_params = gen_params_from_config(config_dict)
    logger.debug(
        "full arguments passed to the ResultStream object sans password")
    logger.debug(json.dumps(_filter_sensitive_args(stream_params), indent=4))

    rs = ResultStream(tweetify=False, **stream_params)

    logger.debug(str(rs))

    if config_dict.get("filename_prefix") is not None:
        stream = write_result_stream(
            rs,
            filename_prefix=config_dict.get("filename_prefix"),
            results_per_file=config_dict.get("results_per_file"))
    else:
        stream = rs.stream()

    for tweet in stream:
        if config_dict["print_stream"] is True:
            print(json.dumps(tweet))
Example #3
def collect_tweets_in_files():
    """ Using a ResultStream for getting tweets
      We can configure the amount of pages/tweets we want to obtain """

    if not check_files():  # file should not already be existing

        max_results = 10000
        max_pages = 300
        max_tweets = 15000

        rs = ResultStream(request_parameters=query,
                          max_results=max_results,
                          max_pages=max_pages,
                          **credentials)

        # Set how many tweets we want to catch
        rs.max_tweets = max_tweets

        tweets_2 = list(rs.stream())
        dataframe = pandas.DataFrame(tweets_2)

        dataframe.to_csv(saving_path)  # to_csv returns None when a path is given
    else:
        print('FileExistsError: file already exists! '
              'Please check whether you really want to overwrite it.')
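
The snippet above relies on module-level `query`, `credentials`, `saving_path`, and a `check_files` helper that are not shown. A minimal sketch of what they might look like with the v2 flavour of the library; every name and value here is an assumption, not the original project's code.

import os
from searchtweets import gen_request_parameters, load_credentials

saving_path = "tweets_archive.csv"                      # assumed output location
credentials = load_credentials("~/.twitter_keys.yaml",  # assumed credentials file
                               yaml_key="search_tweets_v2",
                               env_overwrite=False)
query = gen_request_parameters("snow lang:en", results_per_call=100)

def check_files():
    """Hypothetical helper: True if the output file already exists."""
    return os.path.isfile(saving_path)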
Example #4
def search(queryString, outputpath, api_key_yaml, startTime="2016-01-01", endTime="2021-03-15", lang="en"):

    search_args = load_credentials(api_key_yaml,
                                   yaml_key="search_tweets_v2",
                                   env_overwrite=False)

    print("Should be 1024, but it:")
    print(len(queryString + " -is:nullcast -is:retweet -is:verified -is:quote " + "lang:"+lang))

    #,created_at,geo,in_reply_to_user_id,lang,author_id,conversation_id,public_metrics,entities,context_annotations
    query = gen_request_parameters(
        query=queryString.strip() + " -is:nullcast -is:retweet -is:verified -is:quote " + "lang:" + lang,
        media_fields="media_key,type",
        user_fields="id,description,location,name,entities,url,username,public_metrics,verified,withheld,protected",
        tweet_fields="id,text,created_at,geo,in_reply_to_user_id,lang,author_id,conversation_id,public_metrics,entities,context_annotations,attachments",
        start_time=startTime,
        end_time=endTime,
        stringify=False,
        expansions="author_id,attachments.media_keys",
        results_per_call=500)

    rs = ResultStream(request_parameters=query,
                      max_tweets=sys.maxsize,
                      max_requests=sys.maxsize,
                      **search_args)
    i = 0
    with open(outputpath, 'w') as outputcsv:
        writer = csv.writer(outputcsv)
        writer.writerow(headers)
        for tweet in rs.stream():
            # print(tweet)
            if "id" in tweet:
                writer.writerow(createRow(headers, tweet))
            if "users" in tweet:
                print("parsing users")
                dump_users_info(tweet, outputpath.replace(".csv", str(i) + "-users.csv"))
                i += 1
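
A hypothetical call of the `search` function above. `headers`, `createRow`, and `dump_users_info` are assumed to be defined elsewhere in the original module; the column list, query, and file names below are placeholders.

headers = ["id", "created_at", "author_id", "lang", "text"]   # assumed CSV columns

search("(vaccine OR vaccination)",
       "vaccine_tweets.csv",
       ".twitter_keys.yaml",
       startTime="2021-01-01",
       endTime="2021-03-01",
       lang="en")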
Example #5
    def gather_data(self, screen_name: str,  user_id: int, rt_date: str, file_path: str):
        query_str = create_query_str(screen_name)
        # print(f'reconstructing timeline for @{screen_name}')

        time_range = get_start_and_end_date(rt_date)
        query_obj = create_query_obj(query_str, *time_range)
        rs = ResultStream(
            request_parameters=query_obj,
            # parameter changed from 2 -> 1 to avoid being ratelimited within the project timeline
            max_requests=1,
            **self.academic_search_args
        )
        inbound_timeline = []

        replies = []
        retweets = []
        quotes = []

        for tweet in rs.stream():
            if "author_id" not in tweet:
                if "tweets" in tweet:
                    # Tweets are found
                    for t in tweet["tweets"]:
                        if int(t["author_id"]) == user_id:
                            if "referenced_tweets" in t:
                                ref_tweets = t["referenced_tweets"]
                                for ref in ref_tweets:
                                    type = ref["type"]
                                    if type == "replied_to":
                                        replies.append(ref["id"])
                                    elif type == "quoted":
                                        quotes.append(ref["id"])
                            else:
                                # normal tweet, which holds no info on the information strength
                                pass
                        else:
                            if "referenced_tweets" not in t:
                                # the only way this situation can occur is when the tweet is retweeted by the author
                                # and someone is replying to that retweet
                                retweets.append(t["author_id"])
                            else:
                                # this indicates a reply with a quote, or a reply of a reply
                                pass

        # print(f"done collecting the retweeted user objects, there are {len(retweets)} in total")

        # print(f"converting the {len(replies)} replied tweet objects to user ids")
        replies = self.gather_users(replies)
        # print(f"done collecting the replies user objects, there are {len(replies)} in total")

        # print(f"converting the {len(quotes)} quoted tweet objects to user ids")
        quotes = self.gather_users(quotes)
        # print(f"done collecting the quotes user objects, there are {len(quotes)} in total")

        # print(f"retweets: {len(retweets)}\treplies: {len(replies)}\tquotes: {len(quotes)}")

        dump_dict = {"replies": replies, "quotes": quotes, "retweets": retweets}
        json.dump(dump_dict, open(file_path, "w"))
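
The helpers used by `gather_data` above (`create_query_str`, `get_start_and_end_date`, `create_query_obj`) live elsewhere in the project. A plausible sketch built on the v2 search API; the query shape, field lists, and the seven-day window are assumptions, not the project's actual definitions.

from datetime import datetime, timedelta
from searchtweets import gen_request_parameters

def create_query_str(screen_name):
    # hypothetical: conversation around the account (replies and retweets)
    return f"to:{screen_name} OR retweets_of:{screen_name}"

def get_start_and_end_date(rt_date, window_days=7):
    # hypothetical: a fixed window starting at the retweet date
    start = datetime.strptime(rt_date, "%Y-%m-%d")
    end = start + timedelta(days=window_days)
    return start.strftime("%Y-%m-%d"), end.strftime("%Y-%m-%d")

def create_query_obj(query_str, start_time, end_time):
    return gen_request_parameters(query_str,
                                  start_time=start_time,
                                  end_time=end_time,
                                  tweet_fields="author_id,referenced_tweets",
                                  expansions="referenced_tweets.id",
                                  results_per_call=500)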
def _download_tweets(trend, enterprise_search_args):
    powertrack_rule = '(has:geo OR has:profile_geo) lang:en -is:retweet %s' % trend
    rule = gen_rule_payload(powertrack_rule, results_per_call=500)
    rs = ResultStream(rule_payload=rule,
                      max_requests=2,
                      **enterprise_search_args)
    for tweet in rs.stream():
        print(tweet)
        _store_tweet(tweet)
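
`_store_tweet` is defined elsewhere in the original project; a minimal hypothetical stand-in that appends each tweet as one JSON line (the file name is a placeholder):

import json

def _store_tweet(tweet, filename="geo_tweets.jsonl"):
    # hypothetical: persist the tweet payload as newline-delimited JSON
    with open(filename, "a", encoding="utf-8") as f:
        json.dump(tweet, f)
        f.write("\n")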
Example #7
def usersTweetsByIds():

    search_args1 = load_credentials(".twitter_keys.yaml",
                                    yaml_key="search_tweets_v2_id",
                                    env_overwrite=False)

    search_args2 = load_credentials(".twitter_keys.yaml",
                                    yaml_key="search_tweets_v2_user",
                                    env_overwrite=False)

    with open(
            'C:\\Users\\Josh\\Documents\\GitHub\\search-tweets-python\\enUsers_Tweets.json',
            'r',
            encoding='utf-8') as f:
        obj = json.load(f)

    for u in obj['includes']:

        idList = u.get('tweetids')

        ids = ''

        idList = list(set(idList))

        if len(idList) == 0:
            u['tweets'] = []
            continue

        if len(idList) > 99:
            ids = ','.join(idList[0:99])
        else:
            ids = ','.join(idList)

        endTweet = 'https://api.twitter.com/2/tweets'

        query = {"ids": ids, "tweet.fields": "author_id,public_metrics,text"}
        rs = ResultStream(request_parameters=query,
                          endpoint=endTweet,
                          bearer_token=bt)

        tweets = []
        result = list(rs.stream())

        for r in result:

            tweets = r.get('data')

        u['tweets'] = tweets

    with open('Random_WithTweets.json', 'w', encoding='utf-8') as fo:
        json.dump(obj, fo)
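
The module-level `bt` (bearer token) used by this snippet is not shown; one plausible way to provide it, with an environment-variable name that is an assumption:

import os

bt = os.environ["BEARER_TOKEN"]  # assumed environment variable name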
Example #8
def _download_tweets(trend):
    powertrack_rule = '%s (has:geo OR has:profile_geo) lang:en -is:retweet' % trend
    rule = gen_rule_payload(powertrack_rule,
                            results_per_call=500,
                            to_date=None,
                            from_date='201207220000')
    logging.info("PowerTrack rule: %s" % rule)
    rs = ResultStream(rule_payload=rule,
                      max_results=500,
                      max_requests=1,
                      **enterprise_search_args)
    for tweet in rs.stream():
        _push_tweet(tweet, trend)
Example #9
def main():
    parser = parse_cmd_args()
    args_dict = vars(parser.parse_args())
    if args_dict.get("debug") is True:
        logger.setLevel(logging.DEBUG)
        logger.debug(json.dumps(args_dict, indent=4))

    if args_dict.get("config_filename") is not None:
        configfile_dict = read_config(args_dict["config_filename"])
    else:
        configfile_dict = {}

    creds_dict = load_credentials(filename=args_dict["credential_file"],
                                  account_type=args_dict["account_type"],
                                  yaml_key=args_dict["credential_yaml_key"],
                                  env_overwrite=args_dict["env_overwrite"])

    dict_filter = lambda x: {k: v for k, v in x.items() if v is not None}

    config_dict = merge_dicts(dict_filter(configfile_dict),
                              dict_filter(args_dict), dict_filter(creds_dict))

    logger.debug(json.dumps(config_dict, indent=4))

    if len(dict_filter(config_dict).keys()
           & REQUIRED_KEYS) < len(REQUIRED_KEYS):
        print(REQUIRED_KEYS - dict_filter(config_dict).keys())
        logger.error("ERROR: not enough arguments for the program to work")
        sys.exit(1)

    stream_params = gen_params_from_config(config_dict)

    logger.debug(json.dumps(config_dict, indent=4))

    rs = ResultStream(tweetify=False, **stream_params)

    logger.debug(str(rs))

    if config_dict.get("filename_prefix") is not None:
        stream = write_result_stream(
            rs,
            filename_prefix=config_dict["filename_prefix"],
            results_per_file=config_dict["results_per_file"])
    else:
        stream = rs.stream()

    for tweet in stream:
        if config_dict["print_stream"] is True:
            print(json.dumps(tweet))
Example #10
    def tw_get_premium_search(self, keyword: str):
        with open(f'datasets/tw_{keyword.lower()}_searches_premium.json',
                  'w') as f:
            try:
                f.write('{"statuses": [')

                # note: this first rule (a near:/within: radius query) is
                # immediately superseded by the place: rule built below
                rule = gen_rule_payload(
                    pt_rule="near:\"New York, NY\" within:50mi",
                    results_per_call=100,
                    from_date="2018-07-01",
                    to_date="2018-10-01")

                rule = gen_rule_payload(
                    pt_rule="place:\"New York, NY\"",
                    results_per_call=100,
                    from_date=(datetime.date.today() -
                               datetime.timedelta(31)).isoformat(),
                    to_date=datetime.date.today().isoformat())

                next_token = None
                while True:
                    results = ResultStream(rule_payload=rule,
                                           **self.twitter_premium_api)
                    results.next_token = next_token

                    tweets = []

                    try:
                        tweets = list(results.stream())
                    except Exception as ex:
                        print(str(ex))

                    for tweet in tweets:
                        f.write("%s," % json.dumps(tweet))

                    if results.next_token is None:
                        break
                    else:
                        next_token = results.next_token

                if next_token is not None:
                    # step back over the trailing comma before closing the JSON array
                    f.seek(f.tell() - 1, os.SEEK_SET)
                f.write("]}")

            except Exception as ex:
                print("Error:\n" + str(ex))
Example #11
def get_data(search_query, api_key, secret_key, to_date, from_date, filename):
    """ get twitter data through twitter API from full archive search sand box and return all twitters in JSONL file
    based on 
     search term, 
     the geographic location of interest
     the time period of interest.
     and personal twitter account information.

     Reference: https://github.com/geduldig/TwitterAPI/tree/master/TwitterAPI
     Reference: https://developer.twitter.com/en/docs/tweets/search/overview
    """
    print_after_x = 1000
    config = dict(
        search_tweets_api=dict(
            account_type='premium',
            endpoint=f"https://api.twitter.com/1.1/tweets/search/{'fullarchive'}/{'mangroveConservation'}.json",
            consumer_key=api_key,
            consumer_secret=secret_key
        )
    )
    with open('twitter_keys.yaml', 'w') as config_file:
        yaml.dump(config, config_file, default_flow_style=False)
    from searchtweets import load_credentials, gen_rule_payload, ResultStream

    premium_search_args = load_credentials("twitter_keys.yaml",
                                           yaml_key="search_tweets_api",
                                           env_overwrite=False)
    rule = gen_rule_payload(search_query,
                            results_per_call=100,
                            from_date=from_date,
                            to_date=to_date
                            )
    temp = ResultStream(rule_payload=rule,
                        max_results=100000,
                        **premium_search_args)
    with open(filename, 'a', encoding='utf-8') as temp_file:
        num = 0
        for tweet in temp.stream():
            num += 1
            if num % print_after_x == 0:
                print('{0}: {1}'.format(str(num), tweet['created_at']))
            json.dump(tweet, temp_file)
            temp_file.write('\n')
    print('done')
def save_old_tweets():
    from searchtweets import load_credentials, gen_rule_payload, ResultStream
    import json

    premium_search_args = load_credentials("twitter_keys_fullarchive.yaml",
                                           yaml_key="search_tweets_api",
                                           env_overwrite=False)

    query = "from:NTOO_Org"
    rule = gen_rule_payload(query, results_per_call=100)

    rs = ResultStream(rule_payload=rule,
                      max_results=1000,
                      **premium_search_args)

    with open('fullTweetsData.json', 'a', encoding='utf-8') as f:
        for tweet in rs.stream():
            json.dump(tweet, f)
            f.write('\n')
Example #13
def tweet_search(search_key, search_args):
    """
    search for "spectrumtv" and create a dict of tweet timestamp (dictionary key, in epoch seconds),
                                                 tweet authors screen name (dict value, tuple element 1),
                                                 tweet text (dict value, tuple element 2)
    """
    print("searching for tweets containing \"{}\"".format(search_key))
    key_rule = gen_rule_payload(search_key, results_per_call=100)
    key_rs = ResultStream(rule_payload=key_rule,
                          max_results=500,
                          max_pages=1,
                          **search_args)
    key_results = list(key_rs.stream())
    key_tweets = {}
    for tweet in key_results:
        key_tweets[tweet.created_at_seconds] = (
            tweet.screen_name, tweet.all_text.replace('\n', ' '), ' '
        )  # this space is a placeholder for the sentiment value
    print("{} tweets found containing \"{}\"\n".format(len(key_results),
                                                       search_key))
    return key_tweets
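
A hypothetical usage of `tweet_search`; the credentials file and YAML key are placeholders.

from searchtweets import load_credentials

search_args = load_credentials("~/.twitter_keys.yaml",
                               yaml_key="search_tweets_api",
                               env_overwrite=False)
key_tweets = tweet_search("spectrumtv", search_args)
print(len(key_tweets), "timestamped tweets collected")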
    def pull_data_for_handle(self,
                             handle,
                             date,
                             days_before,
                             results_per_call=100,
                             max_results=2500):
        # check handle can be found!
        user_id = self.get_handle_id(handle)
        if user_id == 0:
            return 0
        from_date = self.subtract_from_datestring(date, days_before)
        rule = self.make_rule(handle, date, from_date, results_per_call)

        rs = ResultStream(rule_payload=rule,
                          max_results=max_results,
                          **self.endpoint_args)
        results_list = list(rs.stream())
        #         results_list=temp_dict[list(temp_dict.keys())[0]]
        print('Found', len(results_list), 'tweets for', handle)
        if len(results_list) == max_results:
            print('Max results limit hit (' + str(max_results) +
                  '). Consider changing the parameter')

        return self.strip_maxresults_from_query(rule), results_list
Example #15
def getRecentTweets():
    endRecent = 'https://api.twitter.com/2/tweets/search/recent'

    search_args_rec = load_credentials(".twitter_keys.yaml",
                                       yaml_key="search_tweets_v2_recent",
                                       env_overwrite=False)

    query = {
        "max_results": 100,
        "tweet.fields": "public_metrics,author_id,lang",
        "query":
        "happy -RT OR upset -RT OR lol -RT OR ugh -RT OR dog -RT OR cat -RT OR food -RT OR sucks -RT",
        "expansions": "author_id",
        "user.fields": "public_metrics"
    }

    rs = ResultStream(
        request_parameters=query,
        endpoint=endRecent,
        bearer_token=bt,
        max_tweets=100,
        max_requests=1,
    )
    result = list(rs.stream())

    obj = {}

    obj['data'] = []
    obj['includes'] = []

    for r in result:
        obj['data'] = obj['data'] + r.get('data')
        obj['includes'] = obj['includes'] + r.get('includes').get('users')

    with open('testJson.json', 'w') as out:
        json.dump(obj, out)
Example #16
def get_twitter_results(news_id,
                        query,
                        from_date,
                        premium_search_args,
                        filename,
                        to_date="202005260000"):
    query1 = "url:" + query + " lang:en"

    rule = gen_rule_payload(query1,
                            from_date=from_date,
                            to_date=to_date,
                            results_per_call=100)

    rs = ResultStream(rule_payload=rule,
                      max_results=100,
                      **premium_search_args)
    last_date = 0
    with open(filename, 'a', encoding='utf-8') as f:
        n = 0
        for tweet in rs.stream():
            news_tweet_json = {
                "news_id": news_id,
                "query": query,
                "tweet": tweet
            }

            n += 1
            if n % 10 == 0:
                print('{0}: {1}'.format(str(n), tweet['created_at']))
            json.dump(news_tweet_json, f)
            f.write('\n')
            last_date = datetime.strptime(tweet['created_at'],
                                          "%a %b %d %H:%M:%S +%f %Y").date()
    print(rs, type(last_date), last_date)
    print('done')
    return last_date
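
A hypothetical call of `get_twitter_results`; the news id, URL fragment, file names, and credentials are placeholders, with `premium_search_args` loaded as in the other snippets.

from searchtweets import load_credentials

premium_search_args = load_credentials("twitter_keys.yaml",
                                       yaml_key="search_tweets_api",
                                       env_overwrite=False)
last_seen = get_twitter_results(news_id="n0001",
                                query="example.com/some-article",
                                from_date="202005010000",
                                premium_search_args=premium_search_args,
                                filename="news_tweets.jsonl")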
Example #17
def main():
    args_dict = vars(parse_cmd_args().parse_args())
    if args_dict.get("debug") is True:
        logger.setLevel(logging.DEBUG)
        logger.debug("command line args dict:")
        logger.debug(json.dumps(args_dict, indent=4))

    if args_dict.get("config_filename") is not None:
        configfile_dict = read_config(args_dict["config_filename"])
    else:
        configfile_dict = {}

    extra_headers_str = args_dict.get("extra_headers")
    if extra_headers_str is not None:
        args_dict['extra_headers_dict'] = json.loads(extra_headers_str)
        del args_dict['extra_headers']

    logger.debug("config file ({}) arguments sans sensitive args:".format(
        args_dict["config_filename"]))
    logger.debug(json.dumps(_filter_sensitive_args(configfile_dict), indent=4))

    creds_dict = load_credentials(filename=args_dict["credential_file"],
                                  yaml_key=args_dict["credential_yaml_key"],
                                  env_overwrite=args_dict["env_overwrite"])

    dict_filter = lambda x: {k: v for k, v in x.items() if v is not None}

    config_dict = merge_dicts(dict_filter(configfile_dict),
                              dict_filter(creds_dict), dict_filter(args_dict))

    logger.debug("combined dict (cli, config, creds):")
    logger.debug(json.dumps(_filter_sensitive_args(config_dict), indent=4))

    if len(dict_filter(config_dict).keys()
           & REQUIRED_KEYS) < len(REQUIRED_KEYS):
        print(REQUIRED_KEYS - dict_filter(config_dict).keys())
        logger.error("ERROR: not enough arguments for the script to work")
        sys.exit(1)

    stream_params = gen_params_from_config(config_dict)
    logger.debug(
        "full arguments passed to the ResultStream object sans credentials")
    logger.debug(json.dumps(_filter_sensitive_args(stream_params), indent=4))

    while True:

        start = time.time()
        rs = ResultStream(tweetify=False, **stream_params)

        logger.debug(str(rs))

        if config_dict.get("filename_prefix") is not None:
            stream = write_result_stream(
                rs,
                filename_prefix=config_dict.get("filename_prefix"),
                results_per_file=config_dict.get("results_per_file"))
        else:
            stream = rs.stream()

        first_tweet = True
        tweets_num = 0

        #Iterate through Tweet array and handle output.
        for tweet in stream:
            tweets_num = tweets_num + 1
            #Get Tweet ID from first Tweet
            if first_tweet:
                newest_id = tweet['id']
                first_tweet = False
            if config_dict["print_stream"] is True:
                print(json.dumps(tweet))

        #This polling script switches to a since_id requests and removes the start_time parameter if it is used for backfill.
        #Prepare next query, by setting the since_id request parameter.
        print(f"{tweets_num} new Tweets. Newest_id: {newest_id}")

        request_json = json.loads(stream_params['request_parameters'])

        if 'start_time' in request_json.keys():
            del request_json['start_time']

        request_json.update(since_id=newest_id)
        stream_params['request_parameters'] = json.dumps(request_json)

        duration = time.time() - start

        sleep_interval = (float(config_dict["interval"]) * 60) - duration

        if sleep_interval < 0:
            sleep_interval = (float(config_dict["interval"]) * 60)

        time.sleep(sleep_interval)
Example #18
def extract_tweets():
    today = date.today()
    d1 = today.strftime("%d-%m-%Y")

    with open('config.json','r') as f:
        keys = json.load(f)

    config = dict(
        search_tweets_api=dict(
            account_type='premium',
            endpoint='https://api.twitter.com/1.1/tweets/search/30day/development1.json',
            consumer_key=keys['consumer_key'],
            consumer_secret=keys['consumer_secret'])
    )
    with open('twitter_keys_fullhistory.yaml', 'w') as config_file:
        yaml.dump(config, config_file, default_flow_style=False)

    premium_search_args = load_credentials("twitter_keys_fullhistory.yaml",
                                        yaml_key="search_tweets_api",
                                        env_overwrite=False)

    SEARCH_QUERY = 'to:Lloydsbank'
    RESULTS_PER_CALL = 100
    FROM_DATE = "2020-06-01"
    TO_DATE = "2020-06-10"
    MAX_RESULTS = 100000
    FILENAME = 'twitter_input_data_{}_{}.jsonl'.format(FROM_DATE, TO_DATE)  # Where the Tweets should be saved
    PRINT_AFTER_X = 100


    rule = gen_rule_payload(SEARCH_QUERY,
                            results_per_call=RESULTS_PER_CALL,
                            from_date=FROM_DATE,
                            to_date=TO_DATE
                            )

    rs = ResultStream(rule_payload=rule,
                      max_results=MAX_RESULTS,
                      **premium_search_args)

    with open(FILENAME, 'a', encoding='utf-8') as f:
        n = 0
        for tweet in rs.stream():
            n += 1
            if n % PRINT_AFTER_X == 0:
                print('{0}: {1}'.format(str(n), tweet['created_at']))
            json.dump(tweet, f)
            f.write('\n')


    new_tweets = []
    dates_created = []
    location = []
    user = []

    with open(FILENAME, 'rb') as f:
        for item in json_lines.reader(f):
            try:
                new_tweets.append(item['extended_tweet']['full_text'])
            except KeyError:
                new_tweets.append(item['text'])
            # collect these fields for every tweet so all lists stay the same length
            dates_created.append(item['created_at'])
            location.append(item['user']['location'])
            user.append(item['user']['id'])

    dataframe = pd.DataFrame(list(zip(user, location, dates_created, new_tweets)),
                             columns=['User', 'Location', 'date_created', 'text'])
    print(dataframe.head())
    dataframe.to_csv("tweets.csv", sep=",")
Example #19
from searchtweets import gen_rule_payload
from searchtweets import load_credentials

premium_search_args = load_credentials("twitter_keys_fullarchive.yaml",
                                       yaml_key="search_tweets_api",
                                       env_overwrite=False)
print(premium_search_args)
query = "AAPL"
rule = gen_rule_payload(query,
                        results_per_call=100,
                        from_date="2020-10-21",
                        to_date="2020-10-28")
from searchtweets import ResultStream

rs = ResultStream(rule_payload=rule, max_results=25000, **premium_search_args)
print(rs)
import json
with open('tweets1.json', 'a', encoding='utf-8') as f:
    for tweet in rs.stream():
        json.dump(tweet, f)
        f.write('\n')
print('done')
                          max_tweets=config['max_tweets'],
                          **search_creds)

        # number of reconnection tries
        tries = 10

        # while loop to protect against 104 error
        while True:
            tries -= 1
            # attempt retrieving tweets
            try:
                # indicate which day is getting retrieved
                print('[INFO] - Retrieving tweets from ' + str(start_ts))

                # get json response to list
                tweets = list(rs.stream())

                # break free from while loop
                break
            except Exception as err:
                if tries == 0:
                    raise err
                else:
                    print(
                        '[INFO] - Got connection error, waiting 15 seconds and trying again. '
                        + str(tries) + ' tries left.')
                    time.sleep(15)

        # parse results to dataframe
        print('[INFO] - Parsing tweets from ' + str(start_ts))
        tweetdf = v2parser(tweets, config['results_per_call'])
    def write_stream(self):
        """ write ResultStream object to disk using the write_ndjson utility """
        stream = ResultStream(**self.premium_search_args, rule_payload=self.rule, max_results=62000)
        columns = []
        for _ in write_ndjson('US_apr02_apr09_some.json', stream.stream()):  # exhaust generator
            pass
Example #22
start_date = today + datetime.timedelta(-30)
print(start_date)

rule = gen_rule_payload("from:NYCASP",
                        from_date=str(start_date),
                        to_date=str(today),
                        results_per_call=500)

print(rule)

rs = ResultStream(rule_payload=rule, max_results=500, **premium_search_args)

print(rs)

tweets = rs.stream()
list_tweets = list(tweets)
for tweet in list_tweets[0:100]:
    print(tweet.all_text, end="\n\n")

tweet_text = []
tweet_date = []

for tweet in list_tweets:
    tweet_text.append(tweet["text"])
    tweet_date.append(tweet["created_at"])

df = pd.DataFrame({"tweet": tweet_text, "date": tweet_date})
df.head()

client = twilio_connect()
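
`twilio_connect` comes from elsewhere in the original script; a plausible stand-in using the Twilio Python client, with environment-variable names that are assumptions:

import os
from twilio.rest import Client

def twilio_connect():
    # hypothetical: build a Twilio REST client from environment variables
    return Client(os.environ["TWILIO_ACCOUNT_SID"], os.environ["TWILIO_AUTH_TOKEN"])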
Example #23
def pull_tweets(query,
                from_date,
                to_date,
                save_path,
                credentials_path,
                yaml_key,
                file_name=None,
                results_per_call=500,
                max_results=3000,
                verbose=False,
                **kwargs):
    """
    Pulls data (i.e., tweets and user info) from Twitter using its API.
    The data received from the API is stored in its original form (JSON)
    without performing any type of preprocessing.

    Parameters
    ----------
    query : str
        Query passed to the Twitter API to fetch Tweets.
    from_date : str or None
        Date format as specified by `convert_utc_time` for the starting time
        of your search.
    to_date : str or None
        Date format as specified by `convert_utc_time` for the end time of
        your search.
    save_path : str
        Path where the raw data will be stored.
    credentials_path : str
        Path for the yaml file with the Twitter API credentials.
    yaml_key : str
        Key within the yaml file containing the Twitter API credentials to be
        used.
    file_name : str or None, default=None
        Name of the json file saved containing the data dump. If None, the
        name will be assigned as a function of `query`, `from_date` and
        `to_date`.
    results_per_call : int, default=500
        Number of Tweets returned per call.
    max_results : int, default=3000
        Maximum number of Tweets to be pulled.
    verbose : int or bool, default=False
        Controls the verbosity when pulling data.


    Returns
    -------
    None : NoneType
    """

    logger = logging.getLogger(__name__)
    logger.propagate = verbose
    logger.info('Pulling raw Twitter data')

    search_args = load_credentials(filename=credentials_path,
                                   yaml_key=yaml_key)

    rule = gen_rule_payload(query,
                            results_per_call=results_per_call,
                            from_date=from_date,
                            to_date=to_date)

    rs = ResultStream(rule_payload=rule,
                      max_results=max_results,
                      **search_args)

    if file_name is None:
        file_name = f'SAMPLE_DATA_QUERY_{query}_'\
                  + f'FROMDATE_{from_date}_TODATE_{to_date}.json'

    with open(os.path.join(save_path, file_name), 'a', encoding='utf-8') as f:
        for tweet in rs.stream():
            json.dump(tweet, f)
            f.write('\n')

    logger.info('Data successfully saved at ' +
                f'"{os.path.join(save_path, file_name)}"')
    return None
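
A hypothetical usage of `pull_tweets`; the query, paths, dates, and YAML key below are placeholders.

pull_tweets(query="wildfire lang:en",
            from_date="2020-08-01",
            to_date="2020-08-31",
            save_path="data/raw",
            credentials_path="~/.twitter_keys.yaml",
            yaml_key="search_tweets_api",
            results_per_call=500,
            max_results=3000,
            verbose=True)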