def run(self):
        print('Worker started')
        # do some initialization here

        while True:
            data = self.queue.get(True)
            try:
                if data is None:
                    print('ALL FINISHED!!!!', self.conn_number)
                    break

                print('Starting: ', data)
                if self.gets_user_id:
                    user = TwitterUser(self.api_hook, user_id=data)
                else:
                    user = TwitterUser(self.api_hook, screen_name=data)

                user.populate_tweets_from_api(json_output_directory=os.path.join(self.out_dir,"json"))

                if len(user.tweets) == 0:
                    if self.to_pickle or self.populate_lists or self.populate_friends or self.populate_followers:
                        print 'pickling and dumping: ', user.screen_name
                        pickle.dump(user, open(os.path.join(self.out_dir,"obj",data), "wb"))
                    continue
                if self.populate_lists:
                    user.populate_lists_member_of()

                if self.populate_friends:
                    print 'populating friends, ', user.screen_name
                    user.populate_friends()

                if self.populate_followers:
                    print 'populating followers, ', user.screen_name
                    user.populate_followers()

                if self.to_pickle or self.populate_lists or self.populate_friends or self.populate_followers:
                    # Pickle and dump user
                    print 'pickling and dumping (no tweets): ', user.screen_name
                    user.tweets = []
                    pickle.dump(user, open(os.path.join(self.out_dir,"obj",data), "wb"))
            except Exception:
                print('FAILED:: ', data)
                exc_type, exc_value, exc_traceback = sys.exc_info()
                print("*** print_tb:")
                traceback.print_tb(exc_traceback, limit=30, file=sys.stdout)
                print("*** print_exception:")

            print('finished collecting data for: ', data)
    def get_user_network(self, this_user_network_dir_name, user_ids, restrict_output_to_ids, stored_user_list):
        """Load or fetch every user in ``user_ids`` and write out their network.

        :param this_user_network_dir_name: directory handed to
            ``self.write_user_network``; also used (with ``self.network_dir``
            stripped) in the progress message.
        :param user_ids: ids to process; must support ``len()`` for the
            progress message.
        :param restrict_output_to_ids: passed through unchanged to
            ``self.write_user_network``.
        :param stored_user_list: container of id strings that already have a
            pickle at ``<pickle_dir>/<id>``.

        Users listed in ``stored_user_list`` are unpickled from disk; all
        others are fetched through the API and pickled for later runs.
        """
        for counter_val, uid in enumerate(user_ids, 1):
            if counter_val % 10 == 0:
                print (counter_val, " / ", len(user_ids), this_user_network_dir_name.replace(self.network_dir, ""))

            pickle_path = self.pickle_dir + "/" + str(uid)

            # try to find user in stored_users
            if str(uid) in stored_user_list:
                # NOTE(review): pickle.load executes arbitrary code from the
                # file; these are assumed to be pickles this tool wrote itself.
                # `with` closes the handle, which the bare open() never did.
                with open(pickle_path, "rb") as in_fil:
                    user = pickle.load(in_fil)
            else:
                user = TwitterUser(self.api_hook, user_id=uid)
                user.populate_tweets_from_api()
                # Closed even when pickle.dump raises.
                with open(pickle_path, "wb") as out_fil:
                    pickle.dump(user, out_fil)

            self.write_user_network(this_user_network_dir_name, user, uid, restrict_output_to_ids)
    def run(self):
        print ("Worker started")

        while True:

            user_id, snow_sample_number = self.queue.get(True)

            print "Starting: ", user_id, snow_sample_number

            stored_user_list = set(
                [os.path.basename(user_pickle) for user_pickle in glob.glob(os.path.join(self.out_dir, "obj", "*"))]
            )

            # Get the ego
            if user_id in stored_user_list:
                print ("\tgot pickled: ", user_id)
                user = pickle.load(open(os.path.join(self.out_dir, "obj", str(user_id)), "rb"))
            else:
                user = TwitterUser(self.api_hook, user_id=user_id)
                user.populate_tweets_from_api(json_output_directory=os.path.join(self.out_dir, "json"))

                if len(user.tweets) == 0:
                    print "pickling and dumping: ", user.screen_name
                    pickle.dump(user, open(os.path.join(self.out_dir, "obj", user_id), "wb"))
                    continue
                print "populating friends, ", user.screen_name
                user.populate_friends()

                print "pickling and dumping (no tweets): ", user.screen_name
                user.tweets = []
                pickle.dump(user, open(os.path.join(self.out_dir, "obj", user_id), "wb"))

            ##write out their following network and add each id to queue
            # network_fil = codecs.open(os.path.join(self.network_dir,user_id),"w", "utf-8")
            added = 0
            for following_id in user.mentioned.keys():
                if snow_sample_number < self.step_count:
                    added += 1
                    self.queue.put([str(following_id), snow_sample_number + 1])
                # network_fil.write(",".join([user_id,str(following_id)])+"\n")
            # network_fil.close()

            print "finished collecting data for: ", user_id
            print "added: ", added
    def run(self):
        print ("Worker started")

        while True:

            user_id, snow_sample_number = self.queue.get(True)

            print ("Starting: ", user_id, snow_sample_number)

            stored_user_list = set([os.path.basename(user_pickle) for user_pickle in glob.glob(self.pickle_dir + "*")])

            # Get the ego
            if user_id in stored_user_list:
                print ("\tgot pickled: ", user_id)
                user = pickle.load(open(self.pickle_dir + "/" + str(user_id), "rb"))
            else:
                user = TwitterUser(self.api_hook, user_id=user_id)
                print ("\tgetting tweets for: ", user_id)
                user.populate_tweets_from_api()
                print ("\t num tweets received for: ", user_id, " ", len(user.tweets))
                # print '\tgetting followers for: ', screen_name
                # user.populate_followers()

                print ("\tgetting friends for: ", user_id)
                user.populate_friends()

                print ("pickling: ", user_id)
                pickle.dump(user, open(self.pickle_dir + "/" + user_id, "wb"))

            ##write out their following network and add each id to queue
            network_fil = codecs.open(os.path.join(self.network_dir, user_id), "w", "utf-8")
            added = 0
            for following_id in user.friend_ids:
                if snow_sample_number < 2:
                    added += 1
                    self.queue.put([str(following_id), snow_sample_number + 1])
                network_fil.write(",".join([user_id, str(following_id)]) + "\n")
            network_fil.close()

            print "finished collecting data for: ", user_id
            print "added: ", added
# Example #5
# 0
# Script fragment: for a fixed list of screen names, fetch each user's tweets
# and followers and count how many tweets are retweets / carry geocode info.
# `handles`, `TwitterUser` and the imports are defined outside this excerpt.
print 'n authed users: ', len(handles)

# The output directory comes from the second command-line argument.
# os.mkdir raises if the directory already exists.
out_dir = sys.argv[2]
os.mkdir(out_dir)

# The file-based list of screen names is disabled in favour of one fixed name.
#user_sns = [line.strip() for line in open(sys.argv[3]).readlines()]
user_sns = ['Neuro_Skeptic']

print 'num users: ', len(user_sns)

# Summary file; written in the current working directory, not in out_dir.
of = codecs.open("output_fil.tsv", "w", "utf8")
# The i-th handle is paired with the i-th screen name, so `handles` needs at
# least as many entries as `user_sns`.
for i in range(len(user_sns)):
    # Creates a TwitterUser object to fill with information from the API.
    user = TwitterUser(handles[i], screen_name=user_sns[i])
    # NOTE(review): the file name is built by plain concatenation, so out_dir
    # must end with a path separator for the JSON to land inside it.
    user.populate_tweets_from_api(json_output_filename=out_dir + user_sns[i] +
                                  ".json",
                                  sleep_var=False)
    user.populate_followers()
    # rts: tweets whose `retweeted` attribute is set;
    # gt: tweets whose `geocode_info` attribute is set.
    rts = 0
    gt = 0
    for t in user.tweets:
        if t.retweeted is not None:
            rts += 1
        if t.geocode_info is not None:
            gt += 1

    of.write(
        tab_stringify_newline([
            user.screen_name, gt, rts,
            len(user.tweets), user.earliest_tweet_time, user.latest_tweet_time,
            user.name, user.n_total_tweets, user.creation_date,
# Example #6
# 0
# Script fragment: fetch one user's tweets through the API, print the
# `mentions` of every tweet, then print the tweet count.
# `TwitterAPIHook` and `TwitterUser` are imported outside this excerpt.
username_to_collect_data_for = 'Jackie_Pooo'

# Placeholder credentials; replace with real values before running.
consumer_key = "YOUR_CONSUMER_KEY_HERE"
consumer_secret = "YOUR_CONSUMER_SECRET_HERE"
access_token = "YOUR_ACCESS_TOKEN_HERE"
access_token_secret = "YOUR_ACCESS_TOKEN_SECRET_HERE"

# Get a "hook" (connection) to the API using the consumer key/secret and the
# access token/secret.
api_hook = TwitterAPIHook(consumer_key,consumer_secret,
                          access_token=access_token,access_token_secret=access_token_secret)

# Creates a TwitterUser object to fill with information from the API.
user = TwitterUser(api_hook,screen_name=username_to_collect_data_for)


# populate_tweets_from_api goes to the Twitter API and collects the user's
# data; the output is written to the file <username>.json.
# sleep_var=False tells the function not to worry about rate limits (only one
# user is collected, so it does not really matter).
# If the is_gzip argument is removed, the output file will be gzipped.
print 'populating users tweets!'
user.populate_tweets_from_api(json_output_filename=username_to_collect_data_for+".json",
                              sleep_var=False, is_gzip=False, since_id=None)


# Print the `mentions` attribute of each collected tweet, one per line.
for t in user.tweets:
    print t.mentions
print 'user had {n_tweets} tweets'.format(n_tweets=len(user.tweets))


# Script fragment: build API handles from the *.txt files in the directory
# given as the first command-line argument, then, for a fixed list of screen
# names, fetch tweets and followers and count retweets / geocoded tweets.
# `get_handles`, `TwitterUser` and the imports are defined outside this excerpt.
handles = get_handles(glob.glob(os.path.join(sys.argv[1],"*.txt")))
print 'n authed users: ', len(handles)

# The output directory comes from the second command-line argument.
# os.mkdir raises if the directory already exists.
out_dir = sys.argv[2]
os.mkdir(out_dir)

# The file-based list of screen names is disabled in favour of one fixed name.
#user_sns = [line.strip() for line in open(sys.argv[3]).readlines()]
user_sns = ['Neuro_Skeptic']

print 'num users: ', len(user_sns)

# Summary file; written in the current working directory, not in out_dir.
of = codecs.open("output_fil.tsv","w","utf8")
# The i-th handle is paired with the i-th screen name, so `handles` needs at
# least as many entries as `user_sns`.
for i in range(len(user_sns)):
    # Creates a TwitterUser object to fill with information from the API.
    user = TwitterUser(handles[i], screen_name=user_sns[i])
    # NOTE(review): the file name is built by plain concatenation, so out_dir
    # must end with a path separator for the JSON to land inside it.
    user.populate_tweets_from_api(json_output_filename=out_dir+user_sns[i]+".json",
                                  sleep_var=False)
    user.populate_followers()
    # rts: tweets whose `retweeted` attribute is set;
    # gt: tweets whose `geocode_info` attribute is set.
    rts = 0
    gt = 0
    for t in user.tweets:
        if t.retweeted is not None:
            rts+=1
        if t.geocode_info is not None:
            gt +=1

    of.write(tab_stringify_newline([user.screen_name,
                                 gt,
                                 rts,
                                len(user.tweets),
                                user.earliest_tweet_time,
                                user.latest_tweet_time,
# Script fragment: fetch one user's tweets and then their followers, printing
# the counts. `consumer_key`, `consumer_secret`,
# `username_to_collect_data_for`, `TwitterAPIHook` and `TwitterUser` are
# defined outside this excerpt.
# Placeholder credentials; replace with real values before running.
access_token = "YOUR_ACCESS_TOKEN_HERE"
access_token_secret = "YOUR_ACCESS_TOKEN_SECRET_HERE"


# Get a "hook" (connection) to the API using the consumer key/secret and the
# access token/secret.
api_hook = TwitterAPIHook(consumer_key,consumer_secret,
                          access_token=access_token,access_token_secret=access_token_secret)

# Creates a TwitterUser object to fill with information from the API.
user = TwitterUser(api_hook, screen_name=username_to_collect_data_for)


# populate_tweets_from_api goes to the Twitter API and collects the user's
# data; the output is written to the file <username>.json.
# sleep_var=False tells the function not to worry about rate limits (only one
# user is collected, so it does not really matter).
# If the is_gzip argument is removed, the output file will be gzipped.
print 'populating users tweets!'
user.populate_tweets_from_api(json_output_filename=username_to_collect_data_for+".json",
                              sleep_var=False, is_gzip=False)
print 'user had {n_tweets} tweets'.format(n_tweets=len(user.tweets))

# Now collect the user's followers; the count is read from user.follower_ids.
print 'populating user followers!'
user.populate_followers(sleep_var=False)
print 'user had {n_followers} followers!'.format(n_followers=len(user.follower_ids))




# Branch taken when a screen name was supplied (the preceding `if` is outside
# this excerpt): make sure the user's tweets exist on disk at
# OUTPUT_DIR/<screen_name>.json.gz, downloading them if necessary.
elif args.screen_name:
    print 'Running with screen name: ', args.screen_name
    # From here on args.json_file_or_folder points at this user's tweet file.
    args.json_file_or_folder = os.path.join(OUTPUT_DIR,args.screen_name+".json.gz")
    if os.path.exists(args.json_file_or_folder):
        # Existing file is reused; nothing is downloaded.
        print "User's tweets already in the system at: ", args.json_file_or_folder
    else:
        print "Getting user's tweets and saving to: ", args.json_file_or_folder
        # Downloading needs API credentials; without them the script exits
        # with status -1.
        if not args.path_to_twitter_credentials_file:
            print "Can't do anything with a screen name without some API credentials, see the help for this script " \
                  "and this parameter!"
            sys.exit(-1)

        # Only the first API hook of the application handler is used.
        app_handler = TwitterApplicationHandler(pathToConfigFile=args.path_to_twitter_credentials_file)
        user = TwitterUser(screen_name=args.screen_name,
                           api_hook=app_handler.api_hooks[0])
        user.populate_tweets_from_api(json_output_filename=args.json_file_or_folder,sleep_var=False)

########
# load the models and the files
########

print 'LOADING MODEL'
# Both helpers are defined outside this excerpt; their results are unpacked
# into module-level names here.
identity_model,feature_names = get_identity_model_and_features()

word_vector_model, all_dictionaries, ark_clusters, sets, names = get_init_data(GENSIM_MODEL_LOCATION,
                                                                               BROWN_CLUSTER_LOCATION)
print 'MODEL HAS BEEN LOADED'

def gen_json_for_tweets_of_interest(input_filename, output_filename,keep_only_tweets_with_terms=None):
    """
    This function generates a cleaned json file so that the identity
    def run(self):
        print ("Worker started")

        while True:

            try:
                data = self.queue.get(True)
                if data is None:
                    print "ALL DONE, EXITING!"
                    return

                user_id, screen_name = data[0], data[1]
                print ("Starting: ", screen_name, user_id)

                this_user_network_dir_name = os.path.join(self.network_dir, user_id)
                mkdir_no_err(this_user_network_dir_name)

                stored_user_list = set(
                    [os.path.basename(user_pickle) for user_pickle in glob.glob(self.pickle_dir + "*")]
                )

                # Get the ego
                if user_id in stored_user_list:
                    print ("\tgot pickled: ", user_id)
                    user = pickle.load(open(self.pickle_dir + "/" + str(user_id), "rb"))
                else:
                    user = TwitterUser(self.api_hook, user_id=user_id)
                    print ("\tgetting tweets for: ", user_id)
                    user.populate_tweets_from_api()
                    print ("\t num tweets received for: ", user_id, " (", screen_name, "): ", len(user.tweets))
                    if len(user.tweets) > 0:
                        print ("\tgetting lists, friends, followers for: ", user_id)
                        user.populate_lists_member_of()
                        # user.populate_followers()
                        # user.populate_friends()

                    print ("pickling: ", screen_name)
                    pickle.dump(user, open(self.pickle_dir + "/" + user_id, "wb"))

                self.write_user_network(this_user_network_dir_name, user, user_id, None)

                if len(user.tweets) == 0:
                    print ("finished collecting data for: ", user_id, ", no tweets")
                    continue

                # Find the ego network based on retweets, mentions and replies
                user_network_to_pull = user.get_ego_network_actors()

                print ("Starting to get ", user.user_id, "'s network of ", len(user_network_to_pull), " actors")
                restrict_to_users = [u for u in user_network_to_pull]
                restrict_to_users.append(user_id)

                self.get_user_network(
                    this_user_network_dir_name, user_network_to_pull, restrict_to_users, stored_user_list
                )
            except Exception:
                print ("FAILED:: ", data)
                exc_type, exc_value, exc_traceback = sys.exc_info()
                print ("*** print_tb:")
                traceback.print_tb(exc_traceback, limit=50, file=sys.stdout)

            print ("finished collecting data for: ", screen_name)
# Example #11
# 0
    # Body of a branch whose header is outside this excerpt: make sure the
    # tweets for args.screen_name exist on disk at
    # OUTPUT_DIR/<screen_name>.json.gz, downloading them if necessary.
    args.json_file_or_folder = os.path.join(OUTPUT_DIR,
                                            args.screen_name + ".json.gz")
    if os.path.exists(args.json_file_or_folder):
        # Existing file is reused; nothing is downloaded.
        print "User's tweets already in the system at: ", args.json_file_or_folder
    else:
        print "Getting user's tweets and saving to: ", args.json_file_or_folder
        # Downloading needs API credentials; without them the script exits
        # with status -1.
        if not args.path_to_twitter_credentials_file:
            print "Can't do anything with a screen name without some API credentials, see the help for this script " \
                  "and this parameter!"
            sys.exit(-1)

        # Only the first API hook of the application handler is used.
        app_handler = TwitterApplicationHandler(
            pathToConfigFile=args.path_to_twitter_credentials_file)
        user = TwitterUser(screen_name=args.screen_name,
                           api_hook=app_handler.api_hooks[0])
        user.populate_tweets_from_api(
            json_output_filename=args.json_file_or_folder, sleep_var=False)

########
# load the models and the files
########

print 'LOADING MODEL'
# Both helpers are defined outside this excerpt; their results are unpacked
# into module-level names here.
identity_model, feature_names = get_identity_model_and_features()

word_vector_model, all_dictionaries, ark_clusters, sets, names = get_init_data(
    GENSIM_MODEL_LOCATION, BROWN_CLUSTER_LOCATION)
print 'MODEL HAS BEEN LOADED'


def gen_json_for_tweets_of_interest(input_filename,
                                    output_filename,
    def run(self):
        print('Worker started')
        # do some initialization here
        snow_sample_number = None
        since_tweet_id = None
        while True:
            data = self.queue.get(True)

            try:
                if data is None:
                    print 'ALL FINISHED!!!!'
                    break

                if len(data) == 1 or type(data) is str or type(
                        data) is unicode or type(data) is int:
                    user_identifier = data
                elif len(data) == 3:
                    user_identifier, snow_sample_number, since_tweet_id = data
                elif len(data) == 2:
                    if self.step_count:
                        user_identifier, snow_sample_number = data
                    elif self.gets_since_tweet_id:
                        user_identifier, since_tweet_id = data

                user_identifier = str(user_identifier)

                print 'Starting: ', data

                pickle_filename = os.path.join(self.out_dir, "obj",
                                               user_identifier)
                json_filename = os.path.join(self.out_dir, "json",
                                             user_identifier + ".json.gz")

                # Get the user's data
                if os.path.exists(pickle_filename) and os.path.exists(
                        json_filename) and not self.add_to_file:
                    print '\tgot existing data for: ', data
                    user = pickle.load(open(pickle_filename, "rb"))
                    user.populate_tweets_from_file(json_filename)
                else:
                    if self.gets_user_id:
                        user = TwitterUser(self.api_hook,
                                           user_id=user_identifier)
                    else:
                        user = TwitterUser(self.api_hook,
                                           screen_name=user_identifier)

                    print 'populating tweets', user_identifier

                    if self.populate_tweets:
                        if self.save_user_tweets:
                            print 'saving tweets to: ', json_filename
                            of_name, tweet_count = user.populate_tweets_from_api(
                                json_output_filename=json_filename,
                                since_id=since_tweet_id,
                                populate_object_with_tweets=False)
                        else:
                            of_name, tweet_count = user.populate_tweets_from_api(
                                since_id=since_tweet_id,
                                populate_object_with_tweets=False)

                        if self.tweet_count_file:
                            self.tweet_count_file.write(
                                str(user_identifier) + "\t" +
                                str(tweet_count) + "\n")

                    if self.populate_lists:
                        print 'populating lists', user.screen_name
                        user.populate_lists_member_of()

                    if self.populate_friends:
                        print 'populating friends, ', user.screen_name
                        user.populate_friends()

                    if self.populate_followers:
                        print 'populating followers, ', user.screen_name
                        user.populate_followers()

                    if self.save_user_data and \
                        (self.always_pickle or self.populate_lists
                         or self.populate_friends or self.populate_followers):
                        # Pickle and dump user
                        #print 'pickling and dumping (no tweets): ', user.screen_name
                        user.tweets = []
                        pickle.dump(user, open(pickle_filename, "wb"))

                # now add to queue if necessary
                if snow_sample_number is not None and snow_sample_number < self.step_count:
                    for user_identifier in self.add_users_to_queue_function(
                            user):
                        self.queue.put(
                            [str(user_identifier), snow_sample_number + 1])

                if self.post_process_function:
                    self.post_process_function(user)

            except KeyboardInterrupt as e:
                print e
                break
            except Exception:
                print('FAILED:: ', data)
                exc_type, exc_value, exc_traceback = sys.exc_info()
                print("*** print_tb:")
                traceback.print_tb(exc_traceback, limit=30, file=sys.stdout)
                print("*** print_exception:")