Example #1
import glob
import os
import sys

from twitter_dm.multiprocess import multiprocess_setup
# NOTE: this module path is assumed from the Worker* naming convention used by
# the other twitter_dm workers below; check the package for the actual location.
from twitter_dm.multiprocess.WorkerTwitterEgoNetwork import TwitterEgoNetworkWorker
from twitter_dm.utility import general_utils

# assumed argv layout, mirroring the later examples:
# argv[1] - directory of credential files, argv[2] - output directory, argv[3] - input file
handles = general_utils.get_handles(glob.glob(os.path.join(sys.argv[1], "*.txt")))
OUTPUT_DIRECTORY = sys.argv[2]

# user screen names we are interested in: tab-separated (screen_name, user_id) pairs
user_screenname_id_pairs = [line.strip().split("\t") for line in open(sys.argv[3]).readlines()]

print user_screenname_id_pairs[0]

pickle_dir = OUTPUT_DIRECTORY + "/obj/"
network_dir = OUTPUT_DIRECTORY + "/net/"

general_utils.mkdir_no_err(OUTPUT_DIRECTORY)
general_utils.mkdir_no_err(pickle_dir)
general_utils.mkdir_no_err(network_dir)
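
# mkdir_no_err is presumably just mkdir that swallows the "already exists"
# error, something like (an assumed sketch, not twitter_dm's actual code):
#   def mkdir_no_err(path):
#       try:
#           os.makedirs(path)
#       except OSError:
#           pass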

multiprocess_setup.init_good_sync_manager()

# put data on the queue
request_queue = multiprocess_setup.load_request_queue(user_screenname_id_pairs, len(handles))


processes = []
for i in range(len(handles)):
    p = TwitterEgoNetworkWorker(request_queue, handles[i], network_dir, pickle_dir)
    p.start()
    processes.append(p)

try:
    for p in processes:
        p.join()
except KeyboardInterrupt:
    print 'keyboard interrupt'
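
Every example on this page follows the same fan-out pattern: put the work items on a shared queue, add one None sentinel per authenticated handle, start one worker process per handle, and join them. Below is a minimal self-contained sketch of that pattern; it is a stand-in written under assumptions, not twitter_dm's actual load_request_queue or Worker* code.

from multiprocessing import Process, Queue

class EchoWorker(Process):
    # stand-in for the Worker* classes: pull items until the None sentinel
    def __init__(self, request_queue, worker_id):
        Process.__init__(self)
        self.request_queue = request_queue
        self.worker_id = worker_id

    def run(self):
        while True:
            item = self.request_queue.get()
            if item is None:  # one sentinel per worker marks the end of the work
                break
            # a real worker would hit the API with its handle here
            print('worker %d got %r' % (self.worker_id, item))

if __name__ == '__main__':
    queue = Queue()
    for item in ['a', 'b', 'c']:
        queue.put(item)
    n_workers = 2
    for _ in range(n_workers):
        queue.put(None)
    workers = [EchoWorker(queue, i) for i in range(n_workers)]
    for w in workers:
        w.start()
    for w in workers:
        w.join()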




import glob
import os
import sys

from twitter_dm.multiprocess import multiprocess_setup
from twitter_dm.multiprocess.WorkerSimpleUserLookup import SimpleUserLookupWorker
from twitter_dm.utility import general_utils
# NOTE: the import path for get_user_ids_and_sn_data_from_list is assumed;
# check twitter_dm for its actual location.
from twitter_dm.utility.general_utils import get_user_ids_and_sn_data_from_list

# get all the handles we have to the API
handles = general_utils.get_handles(glob.glob(os.path.join(sys.argv[1], "*.txt")))
OUTPUT_DIRECTORY = sys.argv[2]

print 'n authed users: ', len(handles)
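# each credential file yields one authenticated API handle; more handles mean
# more parallel workers and a higher aggregate rate limit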


# user screen names we are interested in
user_sns = ['ManchesterMJC','Aviationeuro','SanteriSanttus','OneworldLover',
            'ENORsquawker','plane_spotters','MANSpotter99','BennyPlanespot','PlanespotterGuy','planespotterWal']

user_screenname_id_pairs = get_user_ids_and_sn_data_from_list(user_sns, handles, True)
print 'got screen names, ', len(user_screenname_id_pairs)

# put data on the queue
request_queue = multiprocess_setup.load_request_queue(
    [(x[1], 0) for x in user_screenname_id_pairs], len(handles), add_nones=False)

pickle_dir = OUTPUT_DIRECTORY + "/obj/"
network_dir = OUTPUT_DIRECTORY + "/json/"

general_utils.mkdir_no_err(OUTPUT_DIRECTORY)
general_utils.mkdir_no_err(pickle_dir)
general_utils.mkdir_no_err(network_dir)

multiprocess_setup.init_good_sync_manager()


# user ids we want to look up, taken from the (screen_name, id) pairs above
user_ids = [x[1] for x in user_screenname_id_pairs]
# output directory for the lookup results (assumption: reuse OUTPUT_DIRECTORY)
out_dir = OUTPUT_DIRECTORY

# chunk the ids into batches of 100 (the users/lookup API takes up to 100 per call)
user_data_chunked = []
i = 0
while i < len(user_ids):
    user_data_chunked.append(user_ids[i:(i + 100)])
    i += 100

print 'len chunked: ', len(user_data_chunked)
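# NOTE: the loop above is chunk_data from twitter_dm.utility.general_utils
# written out by hand (assuming chunk_data's default batch size is 100)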

multiprocess_setup.init_good_sync_manager()

# put data on the queue
request_queue = multiprocess_setup.load_request_queue([x for x in user_data_chunked], len(handles), add_nones=True)

processes = []
for i in range(len(handles)):
    p = SimpleUserLookupWorker(request_queue, handles[i], i, out_dir)
    p.start()
    processes.append(p)

try:
    for p in processes:
        p.join()
except KeyboardInterrupt:
    print 'keyboard interrupt'

import sys

from twitter_dm.multiprocess import multiprocess_setup
# NOTE: the module path for TweetDataWorker is assumed from the Worker* naming
# convention used elsewhere in twitter_dm.
from twitter_dm.multiprocess.WorkerTweetData import TweetDataWorker
from twitter_dm.utility.general_utils import mkdir_no_err, collect_system_arguments, chunk_data

handles, output_dir, tweet_ids, is_ids = collect_system_arguments(sys.argv)
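# assumed argv layout: argv[1] = directory of credential files, argv[2] = output
# directory, argv[3] = file of tweet ids, argv[4] = whether the inputs are
# numeric ids (the is_ids flag)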

# Create the output directory
mkdir_no_err(output_dir)

# chunk tweets into 100s (the API takes them by 100)
tweets_chunked = chunk_data(tweet_ids)
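# chunk_data presumably just slices the list into consecutive batches, roughly:
#   def chunk_data(data, chunk_size=100):
#       return [data[i:i + chunk_size] for i in range(0, len(data), chunk_size)]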

# init a sync manager
multiprocess_setup.init_good_sync_manager()

# put data on the queue
request_queue = multiprocess_setup.load_request_queue(tweets_chunked, len(handles))
# run!
processes = []
for i in range(len(handles)):
    p = TweetDataWorker(request_queue, handles[i], i, output_dir)
    p.start()
    processes.append(p)

try:
    for p in processes:
        p.join()
except KeyboardInterrupt:
    print 'keyboard interrupt'
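
For context, init_good_sync_manager is presumably a manager whose server process ignores SIGINT, the standard recipe for letting Ctrl-C reach the parent's KeyboardInterrupt handler (as above) instead of killing the shared queue. A minimal sketch under that assumption:

import signal
from multiprocessing.managers import SyncManager

def init_good_sync_manager():
    # the manager's server process ignores SIGINT, so a Ctrl-C in the parent
    # does not tear down the shared queue mid-run
    manager = SyncManager()
    manager.start(lambda: signal.signal(signal.SIGINT, signal.SIG_IGN))
    return manager
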
import glob
import os
import sys

from twitter_dm.multiprocess import multiprocess_setup
# NOTE: module path assumed from the Worker* naming convention used elsewhere
# in twitter_dm; check the package for the actual location.
from twitter_dm.multiprocess.WorkerUserData import UserDataWorker
from twitter_dm.utility import general_utils

# assumed argv layout, mirroring the other examples
handles = general_utils.get_handles(glob.glob(os.path.join(sys.argv[1], "*.txt")))
out_dir = sys.argv[2]

# one user id per line: the first comma-separated field
user_ids = [line.strip().split(",")[0] for line in open(sys.argv[3]).readlines()]

print "num users: ", len(user_ids)

general_utils.mkdir_no_err(out_dir)
general_utils.mkdir_no_err(os.path.join(out_dir, "obj"))
general_utils.mkdir_no_err(os.path.join(out_dir, "json"))
multiprocess_setup.init_good_sync_manager()

# optionally skip users whose output already exists:
# already_done = set([os.path.basename(f) for f in glob.glob(out_dir + "/*")])
# user_ids = [u for u in user_ids if u not in already_done]

# put data on the queue
request_queue = multiprocess_setup.load_request_queue(user_ids, len(handles))

processes = []
for i in range(len(handles)):
    p = UserDataWorker(
        request_queue,
        handles[i],
        out_dir,
        always_pickle=True,
        gets_user_id=False,
        populate_lists=False,
        populate_friends=True,
        populate_followers=True,
    )
    p.start()
    processes.append(p)

try:
    for p in processes:
        p.join()
except KeyboardInterrupt:
    print 'keyboard interrupt'

import glob
import os
import sys

from twitter_dm.multiprocess import multiprocess_setup
# NOTE: module path assumed, as above.
from twitter_dm.multiprocess.WorkerUserData import UserDataWorker
from twitter_dm.utility import general_utils

# assumed argv layout, mirroring the other examples
handles = general_utils.get_handles(glob.glob(os.path.join(sys.argv[1], "*.txt")))
out_dir = sys.argv[2]

# input file: comma-separated "user_id,since_id" rows; rows with no since_id are skipped
user_id_and_since_id = []
for line in open(sys.argv[3]).readlines():
    line_spl = line.strip().split(",")
    if line_spl[1] == 'None':
        continue
    user_id_and_since_id.append((line_spl[0], line_spl[1]))
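# since_id semantics: the Twitter API returns only tweets with ids strictly
# greater than since_id, so each run resumes where the previous one stopped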

print 'num users: ', len(user_id_and_since_id)

general_utils.mkdir_no_err(out_dir)
general_utils.mkdir_no_err(os.path.join(out_dir,"obj"))
general_utils.mkdir_no_err(os.path.join(out_dir,"json"))
multiprocess_setup.init_good_sync_manager()

##put data on the queue
request_queue = multiprocess_setup.load_request_queue(user_id_and_since_id, len(handles))

processes = []
for i in range(len(handles)):
    p = UserDataWorker(request_queue, handles[i], out_dir,
                       always_pickle=True,
                       gets_user_id=False,
                       populate_lists=False,
                       populate_friends=True,
                       populate_followers=True,
                       gets_since_tweet_id=True)
    p.start()
    processes.append(p)

try:
    for p in processes:
        p.join()
except KeyboardInterrupt:
    print 'keyboard interrupt'

Example #9
import sys

from twitter_dm.multiprocess import multiprocess_setup
from twitter_dm.multiprocess.WorkerSimpleUserLookup import SimpleUserLookupWorker
from twitter_dm.utility import general_utils
from twitter_dm.utility.general_utils import collect_system_arguments, chunk_data

handles, out_dir, data_to_collect, is_ids = collect_system_arguments(sys.argv)

general_utils.mkdir_no_err(out_dir)

user_data_chunked = chunk_data(data_to_collect)
print 'len chunked: ', len(user_data_chunked)

# initialize a better sync manager
multiprocess_setup.init_good_sync_manager()

# put data on the queue
request_queue = multiprocess_setup.load_request_queue(
    [x for x in user_data_chunked], len(handles), add_nones=True)

processes = []
for i in range(len(handles)):
    p = SimpleUserLookupWorker(request_queue,
                               handles[i],
                               i,
                               out_dir,
                               gets_user_id=is_ids)
    p.start()
    processes.append(p)

try:
    for p in processes:
        p.join()
except KeyboardInterrupt:
    print 'keyboard interrupt'

import sys

from twitter_dm.utility.general_utils import collect_system_arguments
from twitter_dm.multiprocess import multiprocess_setup
from twitter_dm.multiprocess.WorkerBotOMeter import BotOMeterWorker
from twitter_dm.utility import general_utils

handles, out_dir, data_to_collect, is_ids, mashape_key = collect_system_arguments(
    sys.argv, ['mashape_key'])
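
# Botometer (formerly BotOrNot) scores accounts on how bot-like they behave; at
# the time this was written the API was served through Mashape, hence the extra
# mashape_key argument collected above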

general_utils.mkdir_no_err(out_dir)

# initialize a better sync manager
multiprocess_setup.init_good_sync_manager()

# put data on the queue
request_queue = multiprocess_setup.load_request_queue(
    [x.strip() for x in data_to_collect], len(handles), add_nones=True)

processes = []
for i in range(len(handles)):
    p = BotOMeterWorker(request_queue, handles[i], i, out_dir, mashape_key)
    p.start()
    processes.append(p)

try:
    for p in processes:
        p.join()
except KeyboardInterrupt:
    print 'keyboard interrupt'
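
A typical invocation of this last script, assuming it is saved as collect_botometer.py (a hypothetical name) and that collect_system_arguments reads positional arguments in the order used throughout these examples:

python collect_botometer.py /path/to/credential_dir /path/to/output_dir user_ids.txt True <mashape_key>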