Пример #1
0
def main():
    """Run the wiki data extraction worker loop; never returns.

    Batches of wiki IDs are pulled from ``iterate_wids_from_args(args)``.
    For each wiki ID a ``wikia_dstk.pipeline.wiki_data_extraction.child``
    process is launched, with at most eight children alive at once.

    When no batch is available and no child is running, an idle counter is
    incremented every 30 seconds. On the tenth idle check since the last
    batch (about five minutes) the instance ID is read from the instance
    metadata and passed to ``terminate_instances`` for ``args.region``.
    """
    args, extras = get_args()
    processes = []
    shutdown_counter = 0
    while True:
        for wids in iterate_wids_from_args(args):
            # Any batch of work resets the idle countdown.
            shutdown_counter = 0
            print("Working on %d wids" % len(wids))
            while wids:
                # Top up the pool of children to eight.
                while len(processes) < 8:
                    if not wids:
                        print('No more wiki IDs to iterate over')
                        break
                    wid = wids.pop()
                    print('Launching child to process %s' % wid)
                    # NOTE(review): this is run through the shell and neither
                    # wid nor the argstring is escaped here -- confirm both
                    # come from trusted input.
                    cmdstring = (
                        '/usr/bin/python -m '
                        'wikia_dstk.pipeline.wiki_data_extraction.child'
                        ' --wiki-id=%s %s'
                    ) % (str(wid), argstring_from_namespace(args, extras))
                    processes.append(Popen(cmdstring, shell=True))

                # Keep only children that have not exited. A list (rather
                # than filter()) keeps len() valid on Python 3 as well.
                processes = [p for p in processes if p.poll() is None]
                sleep(5)

        # Reap before reporting so the logged count reflects children that
        # are actually still alive.
        processes = [p for p in processes if p.poll() is None]
        if processes:
            print("%d processes still running" % len(processes))
            sleep(30)
            continue

        shutdown_counter += 1
        if shutdown_counter == 10:
            print("Waited five minutes with nothing in the queue, " +
                  "shutting down")
            current_id = get_instance_metadata()['instance-id']
            ec2_conn = connect_to_region(args.region)
            ec2_conn.terminate_instances([current_id])
        sleep(30)
Пример #2
0
def main():
    """Launch child extraction processes for incoming wiki IDs, forever.

    Each batch yielded by ``iterate_wids_from_args(args)`` is drained by
    spawning one ``wikia_dstk.pipeline.wiki_data_extraction.child`` process
    per wiki ID, capped at ``max_children`` concurrent children.

    With no batch available and no child alive, the loop counts idle checks
    spaced 30 seconds apart. On reaching ``idle_checks_before_shutdown`` it
    looks up this machine's instance ID and calls ``terminate_instances`` on
    a connection to ``args.region``. The function does not return.
    """
    max_children = 8
    idle_checks_before_shutdown = 10  # 10 checks * 30 s = five minutes

    args, extras = get_args()
    processes = []
    shutdown_counter = 0
    while True:
        for wids in iterate_wids_from_args(args):
            shutdown_counter = 0  # new work arrived, restart the countdown
            print("Working on %d wids" % len(wids))
            while len(wids) > 0:
                while len(processes) < max_children:
                    if wids:
                        wid = wids.pop()
                        print('Launching child to process %s' % wid)
                        # NOTE(review): executed via the shell without any
                        # quoting of wid or the argstring -- verify that
                        # both are trusted.
                        child_args = (
                            str(wid), argstring_from_namespace(args, extras))
                        cmdstring = (
                            '/usr/bin/python -m ' +
                            'wikia_dstk.pipeline.wiki_data_extraction.child' +
                            ' --wiki-id=%s %s') % child_args
                        processes.append(Popen(cmdstring, shell=True))
                    else:
                        print('No more wiki IDs to iterate over')
                        break

                # Drop children that have exited. Building a real list keeps
                # len(processes) working where filter() is lazy (Python 3).
                processes = [proc for proc in processes
                             if proc.poll() is None]
                sleep(5)

        # Drop exited children first so the count printed below is accurate.
        processes = [proc for proc in processes if proc.poll() is None]
        if len(processes) > 0:
            print("%d processes still running" % len(processes))
        else:
            shutdown_counter += 1
            if shutdown_counter == idle_checks_before_shutdown:
                print("Waited five minutes with nothing in the queue, " +
                      "shutting down")
                current_id = get_instance_metadata()['instance-id']
                ec2_conn = connect_to_region(args.region)
                ec2_conn.terminate_instances([current_id])
        sleep(30)
Пример #3
0
def main():
    """Launch instances that run the wiki data extraction pipeline.

    Gets ``articles`` and ``wids`` from ``execute_all`` or ``execute_old``
    depending on ``args.all``, builds a shell script to hand to the
    instances as user data, starts the instances through
    ``run_instances_lb``, tags them, and prints their IDs.
    """
    args, extras = get_args()

    execute = execute_all if args.all else execute_old
    articles, wids = execute(args)

    # Launch EC2 instances with appropriate shell scripts
    callback = lambda wid: articles.get(wid, 0)
    num_instances = config['max_size']

    # "{{key}}" survives .format() as a literal "{key}" placeholder;
    # presumably it is filled in later by run_instances_lb -- TODO confirm.
    script_template = """#!/bin/sh
cd /home/ubuntu/nlp_services
git fetch origin
git checkout master
git pull origin master && sudo python setup.py install
cd /home/ubuntu/data-science-toolkit
git fetch origin
git checkout {git_ref}
git pull origin {git_ref} && sudo python setup.py install
cd /home/ubuntu
python -m wikia_dstk.pipeline.wiki_data_extraction.run --s3path={{key}} {argstring} 2>&1 | tee -a /home/ubuntu/wiki_data_extraction.log"""
    user_data = script_template.format(
        git_ref=args.git_ref,
        argstring=argstring_from_namespace(args, extras))

    instances = run_instances_lb(
        wids, callback, num_instances, user_data, config)

    # instances.get() yields groups of instance IDs; flatten them.
    instance_ids = []
    for id_group in instances.get():
        instance_ids.extend(id_group)

    conn = connect_to_region('us-west-2')
    tags = {'Name': args.tag, 'type': 'wiki_data_extraction'}
    conn.create_tags(instance_ids, tags)
    print('The following instances have been launched: %s'
          % str(instance_ids))