def main():
    """Keep a pool of child extraction processes busy, then shut down when idle.

    Repeatedly pulls batches of wiki IDs from ``iterate_wids_from_args`` and
    launches one ``wikia_dstk.pipeline.wiki_data_extraction.child`` process
    per ID, keeping at most 8 children alive at a time.  When ten consecutive
    polls find neither queued work nor running children, the EC2 instance
    this script runs on is asked to terminate itself.

    The loop never returns on its own; presumably instance termination is
    what ends the process -- confirm against the deployment setup.
    """
    args, extras = get_args()
    running = []
    idle_polls = 0

    while True:
        for wids in iterate_wids_from_args(args):
            # Any batch of work resets the idle countdown.
            idle_polls = 0
            print("Working on %d wids" % len(wids))

            while len(wids) > 0:
                # Top the pool up to 8 children, or until this batch is empty.
                while len(running) < 8:
                    if not wids:
                        print('No more wiki IDs to iterate over')
                        break
                    wid = wids.pop()
                    print('Launching child to process %s' % wid)
                    command = (
                        '/usr/bin/python -m '
                        'wikia_dstk.pipeline.wiki_data_extraction.child'
                        ' --wiki-id=%s %s'
                    ) % (str(wid), argstring_from_namespace(args, extras))
                    running.append(Popen(command, shell=True))

                # Forget children that have exited; poll() is None while alive.
                running = [child for child in running if child.poll() is None]
                sleep(5)

        if len(running) > 0:
            print("%d processes still running" % len(running))
            running = [child for child in running if child.poll() is None]
            sleep(30)
            continue

        # Nothing queued and nothing running: count one idle poll.
        idle_polls += 1
        if idle_polls == 10:
            # 10 polls * 30 s sleep each = the five minutes in the message.
            print("Waited five minutes with nothing in the queue, shutting down")
            instance_id = get_instance_metadata()['instance-id']
            connection = connect_to_region(args.region)
            connection.terminate_instances([instance_id])
        sleep(30)
def main():
    """Drive child extraction processes and self-terminate once the queue is dry.

    Batches of wiki IDs come from ``iterate_wids_from_args``.  Each ID is
    handed to a freshly spawned
    ``wikia_dstk.pipeline.wiki_data_extraction.child`` process, with no more
    than 8 children alive at once.  After ten consecutive polls with no
    queued work and no running children, the current EC2 instance is asked
    to terminate.

    There is no explicit exit from the outer loop; presumably the instance
    going away is what stops the script -- confirm against the deployment.
    """
    args, extras = get_args()
    children = []
    empty_polls = 0

    while True:
        for wids in iterate_wids_from_args(args):
            # Fresh work arrived, so restart the idle countdown.
            empty_polls = 0
            print("Working on %d wids" % len(wids))

            while len(wids) > 0:
                # Fill free slots (pool size 8) until this batch runs out.
                while len(children) < 8:
                    if not wids:
                        print('No more wiki IDs to iterate over')
                        break
                    wid = wids.pop()
                    print('Launching child to process %s' % wid)
                    child_args = (str(wid), argstring_from_namespace(args, extras))
                    cmdline = (
                        '/usr/bin/python -m '
                        'wikia_dstk.pipeline.wiki_data_extraction.child'
                        ' --wiki-id=%s %s'
                    ) % child_args
                    children.append(Popen(cmdline, shell=True))

                # Keep only children still alive (poll() returns None for those).
                children = [proc for proc in children if proc.poll() is None]
                sleep(5)

        if len(children) > 0:
            print("%d processes still running" % len(children))
            children = [proc for proc in children if proc.poll() is None]
            sleep(30)
            continue

        # Idle poll: no batches were yielded and no child is alive.
        empty_polls += 1
        if empty_polls == 10:
            # Ten idle polls, 30 s apart, make up the five minutes reported.
            print("Waited five minutes with nothing in the queue, shutting down")
            this_instance = get_instance_metadata()['instance-id']
            ec2 = connect_to_region(args.region)
            ec2.terminate_instances([this_instance])
        sleep(30)
def main():
    """Launch and tag the EC2 instances that run wiki data extraction.

    Picks the work set with ``execute_all`` or ``execute_old`` depending on
    ``args.all``, builds the boot-time shell script for the instances, starts
    them through ``run_instances_lb`` and tags the results in ``us-west-2``.
    """
    args, extras = get_args()
    articles, wids = execute_all(args) if args.all else execute_old(args)

    def callback(wid):
        # Weight handed to run_instances_lb; wikis missing from `articles`
        # count as 0.
        return articles.get(wid, 0)

    num_instances = config['max_size']

    # Boot script template.  `{{key}}` is escaped so it survives .format() as
    # a literal `{key}` -- presumably filled in later by run_instances_lb;
    # confirm against that helper.
    user_data_template = """#!/bin/sh
cd /home/ubuntu/nlp_services
git fetch origin
git checkout master
git pull origin master && sudo python setup.py install
cd /home/ubuntu/data-science-toolkit
git fetch origin
git checkout {git_ref}
git pull origin {git_ref} && sudo python setup.py install
cd /home/ubuntu
python -m wikia_dstk.pipeline.wiki_data_extraction.run --s3path={{key}} {argstring} 2>&1 | tee -a /home/ubuntu/wiki_data_extraction.log"""
    user_data = user_data_template.format(
        git_ref=args.git_ref,
        argstring=argstring_from_namespace(args, extras))

    instances = run_instances_lb(
        wids, callback, num_instances, user_data, config)

    # instances.get() yields groups of IDs; flatten them one level.
    instance_ids = [
        instance_id
        for group in instances.get()
        for instance_id in group
    ]

    tags = {'Name': args.tag, 'type': 'wiki_data_extraction'}
    conn = connect_to_region('us-west-2')
    conn.create_tags(instance_ids, tags)
    print('The following instances have been launched: %s' % str(instance_ids))