Example #1
    def __init__(self, shockurl, arasturl, config, threads, queue, kill_queue,
                 job_list, ctrl_conf):
        self.parser = SafeConfigParser()
        self.parser.read(config)
        self.job_list = job_list
        # Load plugins
        self.pmanager = ModuleManager(threads, kill_queue, job_list)

        # Set up environment
        self.shockurl = shockurl
        self.arasturl = arasturl
        self.datapath = self.parser.get('compute', 'datapath')
        if queue:
            self.queue = queue
            print('Using queue:{}'.format(self.queue))
        else:
            self.queue = self.parser.get('rabbitmq', 'default_routing_key')
        self.min_free_space = float(
            self.parser.get('compute', 'min_free_space'))
        m = ctrl_conf['meta']
        a = ctrl_conf['assembly']

        self.metadata = meta.MetadataConnection(arasturl, int(a['mongo_port']),
                                                m['mongo.db'],
                                                m['mongo.collection'],
                                                m['mongo.collection.auth'])
        self.gc_lock = multiprocessing.Lock()
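The config file handed to SafeConfigParser in these constructors only needs a couple of sections. A minimal sketch with placeholder values (min_free_space is later read back as a float and compared against free space in GB, data_expiration_days as a number of days; the paths and numbers below are illustrative assumptions):

[compute]
datapath = /mnt/arast/data
min_free_space = 10
data_expiration_days = 30

[rabbitmq]
default_routing_key = jobs.default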
Example #2
    def __init__(self, shockurl, arasturl, config, threads, queue, kill_queue, job_list, ctrl_conf, datapath, binpath):
        self.parser = SafeConfigParser()
        self.parser.read(config)
        self.job_list = job_list
        # Load plugins
        self.pmanager = ModuleManager(threads, kill_queue, job_list, binpath)

        # Set up environment
        self.shockurl = shockurl
        self.arasturl = arasturl
        self.datapath = datapath
        if queue:
            self.queue = queue
            logging.info('Using queue:{}'.format(self.queue))
        else:
            self.queue = self.parser.get('rabbitmq','default_routing_key')
        self.min_free_space = float(self.parser.get('compute','min_free_space'))
        m = ctrl_conf['meta']        
        a = ctrl_conf['assembly']
        

        ###### TODO Use REST API
        self.metadata = meta.MetadataConnection(arasturl, int(a['mongo_port']), m['mongo.db'],
                                                m['mongo.collection'], m['mongo.collection.auth'], m['mongo.collection.data'] )
        self.gc_lock = multiprocessing.Lock()
Example #3
    def __init__(self, shockurl, rmq_host, rmq_port, mongo_host, mongo_port, config, threads, queue,
                 kill_list, kill_list_lock, job_list, job_list_lock, ctrl_conf, datapath, binpath):
        self.parser = SafeConfigParser()
        self.parser.read(config)
        self.kill_list = kill_list
        self.kill_list_lock = kill_list_lock
        self.job_list = job_list
        self.job_list_lock = job_list_lock
        # Load plugins
        self.pmanager = ModuleManager(threads, kill_list, kill_list_lock, job_list, binpath)

        # Set up environment
        self.shockurl = shockurl
        self.datapath = datapath
        self.rmq_host = rmq_host
        self.rmq_port = rmq_port
        self.mongo_host = mongo_host
        self.mongo_port = mongo_port
        self.queue = queue
        self.min_free_space = float(self.parser.get('compute','min_free_space'))
        self.data_expiration_days = float(self.parser.get('compute','data_expiration_days'))
        m = ctrl_conf['meta']
        a = ctrl_conf['assembly']

        collections = {'jobs': m.get('mongo.collection', 'jobs'),
                       'auth': m.get('mongo.collection.auth', 'auth'),
                       'data': m.get('mongo.collection.data', 'data'),
                       'running': m.get('mongo.collection.running', 'running_jobs')}

        ###### TODO Use REST API
        self.metadata = meta.MetadataConnection(self.mongo_host, self.mongo_port, m['mongo.db'],
                                                collections)
        self.gc_lock = multiprocessing.Lock()
Example #4
    def __init__(self, threads, datapath, binpath, modulebin):

        self.threads = threads
        self.binpath = binpath
        self.modulebin = modulebin
        self.pmanager = ModuleManager(threads, None, None, None, binpath, modulebin)

        self.datapath = datapath
Example #5
    def __init__(self, shockurl, rmq_host, rmq_port, mongo_host, mongo_port,
                 config, threads, queues, kill_list, kill_list_lock, job_list,
                 job_list_lock, ctrl_conf, datapath, binpath, modulebin):
        self.parser = SafeConfigParser()
        self.parser.read(config)
        self.kill_list = kill_list
        self.kill_list_lock = kill_list_lock
        self.job_list = job_list
        self.job_list_lock = job_list_lock
        # Load plugins
        self.threads = threads
        self.binpath = binpath
        self.modulebin = modulebin
        self.pmanager = ModuleManager(threads, kill_list, kill_list_lock,
                                      job_list, binpath, modulebin)

        # Set up environment
        self.shockurl = shockurl
        self.datapath = datapath
        self.rmq_host = rmq_host
        self.rmq_port = rmq_port
        self.mongo_host = mongo_host
        self.mongo_port = mongo_port
        self.queues = queues
        self.min_free_space = float(
            self.parser.get('compute', 'min_free_space'))
        self.data_expiration_days = float(
            self.parser.get('compute', 'data_expiration_days'))
        m = ctrl_conf['meta']
        a = ctrl_conf['assembly']

        collections = {
            'jobs': m.get('mongo.collection', 'jobs'),
            'auth': m.get('mongo.collection.auth', 'auth'),
            'data': m.get('mongo.collection.data', 'data'),
            'running': m.get('mongo.collection.running', 'running_jobs')
        }

        ###### TODO Use REST API
        self.metadata = meta.MetadataConnection(self.mongo_host,
                                                self.mongo_port, m['mongo.db'],
                                                collections)
        self.gc_lock = multiprocessing.Lock()
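Only a handful of ctrl_conf keys are consumed by these constructors: the 'meta' section supplies the Mongo database and collection names (with defaults for the collections), and 'assembly' carries the mongo_port used by the older constructors. An illustrative shape, with assumed values:

ctrl_conf = {
    'meta': {
        'mongo.db': 'arast',
        'mongo.collection': 'jobs',
        'mongo.collection.auth': 'auth',
        'mongo.collection.data': 'data',
        'mongo.collection.running': 'running_jobs',
    },
    'assembly': {
        'mongo_port': 27017,   # only read by constructors that take arasturl
    },
}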
Example #6
class ArastStandalone:
    def __init__(self, threads, datapath, binpath, modulebin):

        self.threads = threads
        self.binpath = binpath
        self.modulebin = modulebin
        self.pmanager = ModuleManager(threads, None, None, None, binpath, modulebin)

        self.datapath = datapath

    def compute(self, jobpath, input_description):

        try:
            os.makedirs(jobpath)
        except OSError as e:
            if e.errno != errno.EEXIST:
                raise

        pipelines = input_description['pipelines']
        recipe = input_description['recipe']
        wasp_in = input_description['wasp_in']

        ### Create job log
        self.out_report_name = '{}/{}_report.txt'.format(jobpath, str(input_description['job_id']))
        self.out_report = open(self.out_report_name, 'w')

        job_id = input_description['job_id']

        # create job data (ArastJob object)

        #
        # input_description is dictionary containing three
        # input sets: reads, reference, and contigs.
        # Each contains a list of fileinfo objects.
        #
        # It also contains fields user, containing the end system's username,
        # and job_id, a job_id allocated by the end system.
        # 

        uid = str(uuid.uuid4())

        #
        # We need to populate the files list in each of the filesets.
        #
        print input_description
        for sub in ['reads', 'reference', 'contigs']:
            l = input_description[sub]
            for fs in l:
                print sub, fs
                fs['files'] = []
                for x in fs['fileinfos']:
                    fs['files'].append(x['local_file'])
        print input_description

        job_data = ArastJob({'job_id' : job_id,
                             'uid': uid,
                             'user' : input_description['user'],
                             'reads' : input_description['reads'],
                             'logfiles': [],
                             'reference': input_description['reference'],
                             'contigs': input_description['contigs'],
                             'initial_reads': list(input_description['reads']),
                             'raw_reads': copy.deepcopy(input_description['reads']),
                             'params' : [],
                             'exceptions' : [],
                             'pipeline_data' : {},
                             'out_report': self.out_report,
                             'datapath': self.datapath
                             })
                             
        
        status = ''
        logger.debug('job_data = {}'.format(job_data))

        self.start_time = time.time()

        #### Parse pipeline to wasp exp
        reload(recipes)
        if recipe:
            try: wasp_exp = recipes.get(recipe[0], job_id)
            except AttributeError: raise Exception('"{}" recipe not found.'.format(recipe[0]))
        elif wasp_in:
            wasp_exp = wasp_in[0]
        elif not pipelines:
            wasp_exp = recipes.get('auto', job_id)
        elif pipelines:
            ## Legacy client
            if pipelines[0] == 'auto':
                wasp_exp = recipes.get('auto', job_id)
            ##########
            else:
                if type(pipelines[0]) is not list: # --assemblers
                    pipelines = [pipelines]
                all_pipes = []
                for p in pipelines:
                    all_pipes += self.pmanager.parse_input(p)
                logger.debug("pipelines = {}".format(all_pipes))
                wasp_exp = wasp.pipelines_to_exp(all_pipes, job_id)
        else:
            raise asmtypes.ArastClientRequestError('Malformed job request.')
        logger.debug('Wasp Expression: {}'.format(wasp_exp))
        w_engine = wasp.WaspEngine(self.pmanager, job_data)

        ###### Run Job
        try:
            w_engine.run_expression(wasp_exp, job_data)
            ###### Upload all result files and place them into appropriate tags

            print "Done - job data: " , pprint.pformat(job_data)
            # uploaded_fsets = job_data.upload_results(url, token)

            # Format report
            new_report = open('{}.tmp'.format(self.out_report_name), 'w')

            ### Log errors
            if len(job_data['errors']) > 0:
                new_report.write('PIPELINE ERRORS\n')
                for i,e in enumerate(job_data['errors']):
                    new_report.write('{}: {}\n'.format(i, e))
            try: ## Get Quast output
                quast_report = job_data['wasp_chain'].find_module('quast')['data'].find_type('report')[0].files[0]
                with open(quast_report) as q:
                    new_report.write(q.read())
            except:
                new_report.write('No Summary File Generated!\n\n\n')
            self.out_report.close()
            with open(self.out_report_name) as old:
                new_report.write(old.read())

            for log in job_data['logfiles']:
                new_report.write('\n{1} {0} {1}\n'.format(os.path.basename(log), '='*20))
                with open(log) as l:
                    new_report.write(l.read())

            ### Log tracebacks
            if len(job_data['tracebacks']) > 0:
                new_report.write('EXCEPTION TRACEBACKS\n')
                for i,e in enumerate(job_data['tracebacks']):
                    new_report.write('{}: {}\n'.format(i, e))

            new_report.close()
            os.remove(self.out_report_name)
            shutil.move(new_report.name, self.out_report_name)
            # res = self.upload(url, user, token, self.out_report_name)
            print "Would upload ", self.out_report_name
            # report_info = asmtypes.FileInfo(self.out_report_name, shock_url=url, shock_id=res['data']['id'])

            status = 'Complete with errors' if job_data.get('errors') else 'Complete'

            ## Make compatible with JSON dumps()
            del job_data['out_report']
            del job_data['initial_reads']
            del job_data['raw_reads']
            #
            # Write this somewhere
            # self.metadata.update_job(uid, 'data', job_data)
            # self.metadata.update_job(uid, 'result_data', uploaded_fsets)

            sys.stdout.flush()
            touch(os.path.join(jobpath, "_DONE_"))
            logger.info('============== JOB COMPLETE ===============')

        except asmtypes.ArastUserInterrupt:
            status = 'Terminated by user'
            sys.stdout.flush()
            touch(os.path.join(jobpath, "_CANCELLED__"))
            logger.info('============== JOB KILLED ===============')
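A minimal sketch of driving the standalone path above; the paths, job id, and fileset layout are assumptions based on the fields compute() reads, not values from a real deployment:

consumer = ArastStandalone(threads=4,
                           datapath='/mnt/arast/data',
                           binpath='/opt/arast/bin',
                           modulebin='/opt/arast/module_bin')

input_description = {
    'job_id': 1,
    'user': 'localuser',
    'pipelines': None,   # with recipe and wasp_in unset, the 'auto' recipe is used
    'recipe': None,
    'wasp_in': None,
    'reads': [{'type': 'paired',
               'fileinfos': [{'local_file': '/mnt/arast/reads_1.fastq'},
                             {'local_file': '/mnt/arast/reads_2.fastq'}]}],
    'reference': [],
    'contigs': [],
}

consumer.compute('/mnt/arast/data/localuser/1/1', input_description)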
Example #7
class ArastConsumer:
    def __init__(self, shockurl, rmq_host, rmq_port, mongo_host, mongo_port, config, threads, queues,
                 kill_list, kill_list_lock, job_list, job_list_lock, ctrl_conf, datapath, binpath, modulebin):
        self.parser = SafeConfigParser()
        self.parser.read(config)
        self.kill_list = kill_list
        self.kill_list_lock = kill_list_lock
        self.job_list = job_list
        self.job_list_lock = job_list_lock
        # Load plugins
        self.threads = threads
        self.binpath = binpath
        self.modulebin = modulebin
        self.pmanager = ModuleManager(threads, kill_list, kill_list_lock, job_list, binpath, modulebin)

        # Set up environment
        self.shockurl = shockurl
        self.datapath = datapath
        self.rmq_host = rmq_host
        self.rmq_port = rmq_port
        self.mongo_host = mongo_host
        self.mongo_port = mongo_port
        self.queues = queues
        self.min_free_space = float(self.parser.get('compute','min_free_space'))
        self.data_expiration_days = float(self.parser.get('compute','data_expiration_days'))
        m = ctrl_conf['meta']
        a = ctrl_conf['assembly']

        collections = {'jobs': m.get('mongo.collection', 'jobs'),
                       'auth': m.get('mongo.collection.auth', 'auth'),
                       'data': m.get('mongo.collection.data', 'data'),
                       'running': m.get('mongo.collection.running', 'running_jobs')}

        ###### TODO Use REST API
        self.metadata = meta.MetadataConnection(self.mongo_host, self.mongo_port, m['mongo.db'],
                                                collections)
        self.gc_lock = multiprocessing.Lock()

    def garbage_collect(self, datapath, required_space, user, job_id, data_id):
        """ Monitor space of disk containing DATAPATH and delete files if necessary."""
        datapath = self.datapath
        required_space = self.min_free_space
        expiration = self.data_expiration_days

        ### Remove expired directories
        def can_remove(d, user, job_id, data_id):
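            # Assumes d is of the form <datapath>/<user>/<data_id>/<job_id>/, so the
            # slice below yields the user, data id, and job id components; the current
            # user's job and the current data set's raw directory are never removed.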
            u, data, j = d.split('/')[-4:-1]
            if u == user and j == str(job_id):
                return False
            if data == str(data_id) and j == 'raw':
                return False
            if os.path.isdir(d):
                return True
            return False

        dir_depth = 3
        dirs = filter(lambda f: can_remove(f, user, job_id, data_id), glob.glob(datapath + '/' + '*/' * dir_depth))
        removed = []
        logger.info('Searching for directories older than {} days'.format(expiration))
        for d in dirs:
            file_modified = None
            try:
                file_modified = datetime.datetime.fromtimestamp(os.path.getmtime(d))
            except os.error as e:
                logger.warning('GC ignored "{}": could not get timestamp: {}'.format(d, e))
                continue
            tdiff = datetime.datetime.now() - file_modified
            if tdiff > datetime.timedelta(days=expiration):
                logger.info('GC: removing expired directory: {} (modified {} ago)'.format(d, tdiff))
                removed.append(d)
                shutil.rmtree(d, ignore_errors=True)
            else:
                logger.debug('GC: not removing: {} (modified {} ago)'.format(d, tdiff))
        for r in removed:
            dirs.remove(r)

        ### Check free space and remove old directories
        free_space = free_space_in_path(datapath)
        logger.info("Required space in GB: {} (free = {})".format(required_space, free_space))

        times = []
        for d in dirs:
            try:
                t = os.path.getmtime(d)
                times.append([t, d])
            except:
                pass
        times.sort()
        logger.debug("Directories sorted by time: {}".format(times))
        dirs = [x[1] for x in times]

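        # Evict the oldest directories first; anything still in use by a running
        # job is set aside in busy_dirs and retried below once it frees up.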
        busy_dirs = []
        while free_space < self.min_free_space and len(dirs) > 0:
            d = dirs.pop(0)
            if is_dir_busy(d):
                busy_dirs.append(d)
            else:
                free_space = self.remove_dir(d)

        while free_space < self.min_free_space:
            if len(busy_dirs) == 0:
                logger.error("GC: free space {} < {} GB; waiting for system space to be available...".format(free_space, self.min_free_space))
                time.sleep(60)
            else:
                logger.warning("GC: free space {} < {} GB; waiting for jobs to complete to reclaim space: {} busy directories..."
                               .format(free_space, self.min_free_space, len(busy_dirs)))
                checked_dirs = []
                while free_space < self.min_free_space and len(busy_dirs) > 0:
                    bd = busy_dirs.pop(0)
                    if is_dir_busy(bd):
                        checked_dirs.append(bd)
                        continue
                    free_space = self.remove_dir(bd)
                    # self.remove_empty_dirs()
                if free_space < self.min_free_space:
                    busy_dirs = checked_dirs
                    time.sleep(20)
            free_space = free_space_in_path(self.datapath)

        self.remove_empty_dirs()


    def remove_dir(self, d):
        shutil.rmtree(d, ignore_errors=True)
        logger.info("GC: space required; %s removed." % d)
        return free_space_in_path(self.datapath)

    def remove_empty_dirs(self):
        data_dirs = filter(lambda f: os.path.isdir(f), glob.glob(self.datapath + '/' + '*/' * 2))
        for dd in data_dirs:
            if not os.listdir(dd):
                logger.info('GC: removing empty directory: {}'.format(dd))
                try:
                    os.rmdir(dd)
                except os.error as e:
                    logger.warning('GC: could not remove empty dir "{}": {}'.format(dd, e))

    def get_data(self, body):
        """Get data from cache or Shock server."""
        params = json.loads(body)
        logger.debug('New Data Format')
        return self._get_data(body)

    def _get_data(self, body):
        params = json.loads(body)
        filepath = os.path.join(self.datapath, params['ARASTUSER'],
                                str(params['data_id']))
        datapath = filepath
        filepath += "/raw/"
        all_files = []
        user = params['ARASTUSER']
        job_id = params['job_id']
        data_id = params['data_id']
        token = params['oauth_token']
        uid = params['_id']

        self.gc_lock.acquire()
        try:
            self.garbage_collect(self.datapath, self.min_free_space, user, job_id, data_id)
        except:
            logger.error('Unexpected error in GC.')
            raise
        finally:
            self.gc_lock.release()

        ##### Get data from ID #####
        data_doc = self.metadata.get_data_docs(params['ARASTUSER'], params['data_id'])
        if not data_doc:
            raise Exception('Invalid Data ID: {}'.format(params['data_id']))
        logger.debug('data_doc = {}'.format(data_doc))
        if 'kbase_assembly_input' in data_doc:
            params['assembly_data'] = kb_to_asm(data_doc['kbase_assembly_input'])
        elif 'assembly_data' in data_doc:
            params['assembly_data'] = data_doc['assembly_data']

        ##### Get data from assembly_data #####
        self.metadata.update_job(uid, 'status', 'Data transfer')
        with ignored(OSError):
            os.makedirs(filepath)
            touch(filepath)

        file_sets = params['assembly_data']['file_sets']
        for file_set in file_sets:
            if file_set['type'] == 'paired_url':
                file_set['type'] = 'paired'
            elif file_set['type'] == 'single_url':
                file_set['type'] = 'single'
            elif file_set['type'] == 'reference_url':
                file_set['type'] = 'reference'
            file_set['files'] = [] #legacy
            for file_info in file_set['file_infos']:
                #### File is stored on Shock
                if file_info['filename']:
                    local_file = os.path.join(filepath, file_info['filename'])
                    if os.path.exists(local_file):
                        local_file = self.extract_file(local_file)
                        logger.info("Requested data exists on node: {}".format(local_file))
                    else:
                        local_file = self.download_shock(file_info['shock_url'], user, token,
                                                   file_info['shock_id'], filepath)

                elif file_info['direct_url']:
                    local_file = os.path.join(filepath, os.path.basename(file_info['direct_url']))
                    if os.path.exists(local_file):
                        local_file = self.extract_file(local_file)
                        logger.info("Requested data exists on node: {}".format(local_file))
                    else:
                        local_file = self.download_url(file_info['direct_url'], filepath, token=token)
                file_info['local_file'] = local_file
                if file_set['type'] == 'single' and asm.is_long_read_file(local_file):
                    if not 'tags' in file_set:
                        file_set['tags'] = []
                    if not 'long_read' in file_set['tags']:
                        file_set['tags'].append('long_read') # pacbio or nanopore reads
                file_set['files'].append(local_file) #legacy
            all_files.append(file_set)
        return datapath, all_files


    def prepare_job_data(self, body):
        params = json.loads(body)
        job_id = params['job_id']

        ### Download files (if necessary)
        datapath, all_files = self.get_data(body)
        rawpath = datapath + '/raw/'
        jobpath = os.path.join(datapath, str(job_id))

        try:
            os.makedirs(jobpath)
        except OSError as e:
            if e.errno != errno.EEXIST:
                raise

        ### Protect data directory from GC before any job starts
        touch(os.path.join(rawpath, "_READY_"))

        ### Create job log
        self.out_report_name = '{}/{}_report.txt'.format(jobpath, str(job_id))
        self.out_report = open(self.out_report_name, 'w')

        ### Create data to pass to pipeline
        reads = []
        reference = []
        contigs = []
        for fileset in all_files:
            if len(fileset['files']) != 0:
                if (fileset['type'] == 'single' or
                    fileset['type'] == 'paired'):
                    reads.append(fileset)
                elif fileset['type'] == 'reference':
                    reference.append(fileset)
                elif fileset['type'] == 'contigs':
                    contigs.append(fileset)
                else:
                    raise Exception('fileset error')

        job_data = ArastJob({'job_id' : params['job_id'],
                    'uid' : params['_id'],
                    'user' : params['ARASTUSER'],
                    'reads': reads,
                    'logfiles': [],
                    'reference': reference,
                    'contigs': contigs,
                    'initial_reads': list(reads),
                    'raw_reads': copy.deepcopy(reads),
                    'params': [],
                    'exceptions': [],
                    'pipeline_data': {},
                    'datapath': datapath,
                    'out_report' : self.out_report})

        self.out_report.write("Arast Pipeline: Job {}\n".format(job_id))

        return job_data


    def compute(self, body):
        self.job_list_lock.acquire()
        try:
            job_data = self.prepare_job_data(body)
            self.job_list.append(job_data)
        except:
            logger.error("Error in adding new job to job_list")
            raise
        finally:
            self.job_list_lock.release()

        status = ''
        logger.debug('job_data = {}'.format(job_data))

        params = json.loads(body)
        job_id = params['job_id']
        data_id = params['data_id']
        uid = params['_id']
        user = params['ARASTUSER']
        token = params['oauth_token']
        pipelines = params.get('pipeline')
        recipe = params.get('recipe')
        wasp_in = params.get('wasp')
        jobpath = os.path.join(self.datapath, user, str(data_id), str(job_id))

        url = shock.verify_shock_url(self.shockurl)

        self.start_time = time.time()

        timer_thread = UpdateTimer(self.metadata, 29, time.time(), uid, self.done_flag)
        timer_thread.start()

        #### Parse pipeline to wasp exp
        reload(recipes)
        if recipe:
            try: wasp_exp = recipes.get(recipe[0], job_id)
            except AttributeError: raise Exception('"{}" recipe not found.'.format(recipe[0]))
        elif wasp_in:
            wasp_exp = wasp_in[0]
        elif not pipelines:
            wasp_exp = recipes.get('auto', job_id)
        elif pipelines:
            ## Legacy client
            if pipelines[0] == 'auto':
                wasp_exp = recipes.get('auto', job_id)
            ##########
            else:
                if type(pipelines[0]) is not list: # --assemblers
                    pipelines = [pipelines]
                all_pipes = []
                for p in pipelines:
                    all_pipes += self.pmanager.parse_input(p)
                logger.debug("pipelines = {}".format(all_pipes))
                wasp_exp = wasp.pipelines_to_exp(all_pipes, params['job_id'])
        else:
            raise asmtypes.ArastClientRequestError('Malformed job request.')
        logger.debug('Wasp Expression: {}'.format(wasp_exp))
        w_engine = wasp.WaspEngine(self.pmanager, job_data, self.metadata)

        ###### Run Job
        try:
            w_engine.run_expression(wasp_exp, job_data)
            ###### Upload all result files and place them into appropriate tags
            uploaded_fsets = job_data.upload_results(url, token)

            # Format report
            new_report = open('{}.tmp'.format(self.out_report_name), 'w')

            ### Log errors
            if len(job_data['errors']) > 0:
                new_report.write('PIPELINE ERRORS\n')
                for i,e in enumerate(job_data['errors']):
                    new_report.write('{}: {}\n'.format(i, e))
            try: ## Get Quast output
                quast_report = job_data['wasp_chain'].find_module('quast')['data'].find_type('report')[0].files[0]
                with open(quast_report) as q:
                    new_report.write(q.read())
            except:
                new_report.write('No Summary File Generated!\n\n\n')
            self.out_report.close()
            with open(self.out_report_name) as old:
                new_report.write(old.read())

            for log in job_data['logfiles']:
                new_report.write('\n{1} {0} {1}\n'.format(os.path.basename(log), '='*20))
                with open(log) as l:
                    new_report.write(l.read())

            ### Log tracebacks
            if len(job_data['tracebacks']) > 0:
                new_report.write('EXCEPTION TRACEBACKS\n')
                for i,e in enumerate(job_data['tracebacks']):
                    new_report.write('{}: {}\n'.format(i, e))

            new_report.close()
            os.remove(self.out_report_name)
            shutil.move(new_report.name, self.out_report_name)
            res = self.upload(url, user, token, self.out_report_name)
            report_info = asmtypes.FileInfo(self.out_report_name, shock_url=url, shock_id=res['data']['id'])

            self.metadata.update_job(uid, 'report', [asmtypes.set_factory('report', [report_info])])
            status = 'Complete with errors' if job_data.get('errors') else 'Complete'

            ## Make compatible with JSON dumps()
            del job_data['out_report']
            del job_data['initial_reads']
            del job_data['raw_reads']
            self.metadata.update_job(uid, 'data', job_data)
            self.metadata.update_job(uid, 'result_data', uploaded_fsets)
            ###### Legacy Support #######
            uploaded_fsets.append(asmtypes.set_factory('report', [report_info]))
            contigsets = [fset for fset in uploaded_fsets if fset.type == 'contigs' or fset.type == 'scaffolds']
            download_ids = {fi['filename']: fi['shock_id'] for fset in uploaded_fsets for fi in fset['file_infos']}
            contig_ids = {fi['filename']: fi['shock_id'] for fset in contigsets for fi in fset['file_infos']}
            self.metadata.update_job(uid, 'result_data_legacy', [download_ids])
            self.metadata.update_job(uid, 'contig_ids', [contig_ids])
            ###################

            sys.stdout.flush()
            touch(os.path.join(jobpath, "_DONE_"))
            logger.info('============== JOB COMPLETE ===============')

        except asmtypes.ArastUserInterrupt:
            status = 'Terminated by user'
            sys.stdout.flush()
            touch(os.path.join(jobpath, "_CANCELLED__"))
            logger.info('============== JOB KILLED ===============')

        finally:
            self.remove_job_from_lists(job_data)
            logger.debug('Reinitialize plugin manager...') # Reinitialize to get live changes
            self.pmanager = ModuleManager(self.threads, self.kill_list, self.kill_list_lock, self.job_list, self.binpath, self.modulebin)

        self.metadata.update_job(uid, 'status', status)


    def remove_job_from_lists(self, job_data):
        self.job_list_lock.acquire()
        try:
            for i, job in enumerate(self.job_list):
                if job['user'] == job_data['user'] and job['job_id'] == job_data['job_id']:
                    self.job_list.pop(i)
        except:
            logger.error("Unexpected error in removing executed jobs from job_list")
            raise
        finally:
            self.job_list_lock.release()

        # kill_list cleanup for cases where a kill request is enqueued right before the corresponding job gets popped
        self.kill_list_lock.acquire()
        try:
            for i, kill_request in enumerate(self.kill_list):
                if kill_request['user'] == job_data['user'] and kill_request['job_id'] == job_data['job_id']:
                    self.kill_list.pop(i)
        except:
            logger.error("Unexpected error in removing executed jobs from kill_list")
            raise
        finally:
            self.kill_list_lock.release()


    def upload(self, url, user, token, file, filetype='default'):
        files = {}
        files["file"] = (os.path.basename(file), open(file, 'rb'))
        logger.debug("Message sent to shock on upload: %s" % files)
        sclient = shock.Shock(url, user, token)
        if filetype == 'contigs' or filetype == 'scaffolds':
            res = sclient.upload_contigs(file)
        else:
            res = sclient.upload_file(file, filetype, curl=True)
        return res

    def download_shock(self, url, user, token, node_id, outdir):
        sclient = shock.Shock(url, user, token)
        downloaded = sclient.curl_download_file(node_id, outdir=outdir)
        return self.extract_file(downloaded)

    def download_url(self, url, outdir, token=None):
        downloaded = shock.curl_download_url(url, outdir=outdir, token=token)
        return self.extract_file(downloaded)

    def fetch_job(self):
        connection = pika.BlockingConnection(pika.ConnectionParameters(
                host=self.rmq_host, port=self.rmq_port))
        channel = connection.channel()
        channel.basic_qos(prefetch_count=1)
        result = channel.queue_declare(exclusive=False,
                                       auto_delete=False,
                                       durable=True)
        logger.info('Fetching job...')

        channel.basic_qos(prefetch_count=1)
        for queue in self.queues:
            print 'Using queue: {}'.format(queue)
            channel.basic_consume(self.callback,
                              queue=queue)

        channel.start_consuming()

    def callback(self, ch, method, properties, body):
        params = json.loads(body)
        display = ['ARASTUSER', 'job_id', 'message', 'recipe', 'pipeline', 'wasp']
        logger.info('Incoming job: ' + ', '.join(['{}: {}'.format(k, params[k]) for k in display if params.get(k)]))
        logger.debug(params)
        job_doc = self.metadata.get_job(params['ARASTUSER'], params['job_id'])

        ## Check if job was not killed
        if job_doc is None:
            logger.error('Error: no job_doc found for {}'.format(params.get('job_id')))
            return

        if job_doc.get('status') == 'Terminated by user':
            logger.warn('Job {} was killed, skipping'.format(params.get('job_id')))
        else:
            self.done_flag = threading.Event()
            uid = None
            try:
                uid = job_doc['_id']
                self.compute(body)
            except Exception as e:
                tb = format_exc()
                status = "[FAIL] {}".format(e)
                logger.error("{}\n{}".format(status, tb))
                self.metadata.update_job(uid, 'status', status)
        ch.basic_ack(delivery_tag=method.delivery_tag)
        self.done_flag.set()

    def start(self):
        self.fetch_job()

    def extract_file(self, filename):
        """ Decompress files if necessary """
        unp_bin = os.path.join(self.modulebin, 'unp')

        filepath = os.path.dirname(filename)
        uncompressed = ['fasta', 'fa', 'fastq', 'fq', 'fna', 'h5' ]
        supported = ['tar.gz', 'tar.bz2', 'bz2', 'gz', 'lz',
                     'rar', 'tar', 'tgz','zip']
        for ext in uncompressed:
            if filename.endswith('.'+ext):
                return filename
        for ext in supported:
            if filename.endswith('.'+ext):
                extracted_file = filename[:filename.index(ext)-1]
                if os.path.exists(extracted_file): # Check extracted already
                    return extracted_file
                logger.info("Extracting {}...".format(filename))
                # p = subprocess.Popen([unp_bin, filename],
                #                      cwd=filepath, stderr=subprocess.STDOUT)
                # p.wait()
                # Hide the "broken pipe" message from unp
                out = subprocess.Popen([unp_bin, filename],
                                       cwd=filepath,
                                       stdout=subprocess.PIPE,
                                       stderr=subprocess.STDOUT).communicate()[0]
                if os.path.exists(extracted_file):
                    return extracted_file
                else:
                    logger.error("Extraction of {} failed: {}".format(filename, out))
                    raise Exception('Archive structure error')
        logger.error("Could not extract {}".format(filename))
        return filename
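The body consumed by callback() and compute() above is a JSON document; the sketch below reconstructs it as a Python dict from the keys the consumer reads. Every concrete value is a placeholder, and 'velvet' is only an example assembler name:

job_message = {
    '_id': 'mongo-job-document-id',      # uid passed to metadata.update_job()
    'job_id': 42,
    'data_id': 7,
    'ARASTUSER': 'someuser',
    'oauth_token': 'placeholder-token',
    'message': 'assembly request',
    'recipe': None,                      # e.g. ['auto'] to force a recipe
    'wasp': None,                        # raw wasp expression, if supplied
    'pipeline': [['velvet']],            # legacy pipeline form
    'assembly_data': {
        'file_sets': [
            {'type': 'paired',
             'file_infos': [
                 {'filename': 'reads_1.fq.gz',
                  'shock_url': 'http://shock.example.org',
                  'shock_id': 'abc123',
                  'direct_url': None},
                 {'filename': 'reads_2.fq.gz',
                  'shock_url': 'http://shock.example.org',
                  'shock_id': 'def456',
                  'direct_url': None},
             ]},
        ],
    },
}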
Example #8
    def compute(self, body):
        self.job_list_lock.acquire()
        try:
            job_data = self.prepare_job_data(body)
            self.job_list.append(job_data)
        except:
            logger.error("Error in adding new job to job_list")
            raise
        finally:
            self.job_list_lock.release()

        status = ''
        logger.debug('job_data = {}'.format(job_data))

        params = json.loads(body)
        job_id = params['job_id']
        data_id = params['data_id']
        uid = params['_id']
        user = params['ARASTUSER']
        token = params['oauth_token']
        pipelines = params.get('pipeline')
        recipe = params.get('recipe')
        wasp_in = params.get('wasp')
        jobpath = os.path.join(self.datapath, user, str(data_id), str(job_id))

        url = shock.verify_shock_url(self.shockurl)

        self.start_time = time.time()

        timer_thread = UpdateTimer(self.metadata, 29, time.time(), uid, self.done_flag)
        timer_thread.start()

        #### Parse pipeline to wasp exp
        reload(recipes)
        if recipe:
            try: wasp_exp = recipes.get(recipe[0], job_id)
            except AttributeError: raise Exception('"{}" recipe not found.'.format(recipe[0]))
        elif wasp_in:
            wasp_exp = wasp_in[0]
        elif not pipelines:
            wasp_exp = recipes.get('auto', job_id)
        elif pipelines:
            ## Legacy client
            if pipelines[0] == 'auto':
                wasp_exp = recipes.get('auto', job_id)
            ##########
            else:
                if type(pipelines[0]) is not list: # --assemblers
                    pipelines = [pipelines]
                all_pipes = []
                for p in pipelines:
                    all_pipes += self.pmanager.parse_input(p)
                logger.debug("pipelines = {}".format(all_pipes))
                wasp_exp = wasp.pipelines_to_exp(all_pipes, params['job_id'])
        else:
            raise asmtypes.ArastClientRequestError('Malformed job request.')
        logger.debug('Wasp Expression: {}'.format(wasp_exp))
        w_engine = wasp.WaspEngine(self.pmanager, job_data, self.metadata)

        ###### Run Job
        try:
            w_engine.run_expression(wasp_exp, job_data)
            ###### Upload all result files and place them into appropriate tags
            uploaded_fsets = job_data.upload_results(url, token)

            # Format report
            new_report = open('{}.tmp'.format(self.out_report_name), 'w')

            ### Log errors
            if len(job_data['errors']) > 0:
                new_report.write('PIPELINE ERRORS\n')
                for i,e in enumerate(job_data['errors']):
                    new_report.write('{}: {}\n'.format(i, e))
            try: ## Get Quast output
                quast_report = job_data['wasp_chain'].find_module('quast')['data'].find_type('report')[0].files[0]
                with open(quast_report) as q:
                    new_report.write(q.read())
            except:
                new_report.write('No Summary File Generated!\n\n\n')
            self.out_report.close()
            with open(self.out_report_name) as old:
                new_report.write(old.read())

            for log in job_data['logfiles']:
                new_report.write('\n{1} {0} {1}\n'.format(os.path.basename(log), '='*20))
                with open(log) as l:
                    new_report.write(l.read())

            ### Log tracebacks
            if len(job_data['tracebacks']) > 0:
                new_report.write('EXCEPTION TRACEBACKS\n')
                for i,e in enumerate(job_data['tracebacks']):
                    new_report.write('{}: {}\n'.format(i, e))

            new_report.close()
            os.remove(self.out_report_name)
            shutil.move(new_report.name, self.out_report_name)
            res = self.upload(url, user, token, self.out_report_name)
            report_info = asmtypes.FileInfo(self.out_report_name, shock_url=url, shock_id=res['data']['id'])

            self.metadata.update_job(uid, 'report', [asmtypes.set_factory('report', [report_info])])
            status = 'Complete with errors' if job_data.get('errors') else 'Complete'

            ## Make compatible with JSON dumps()
            del job_data['out_report']
            del job_data['initial_reads']
            del job_data['raw_reads']
            self.metadata.update_job(uid, 'data', job_data)
            self.metadata.update_job(uid, 'result_data', uploaded_fsets)
            ###### Legacy Support #######
            uploaded_fsets.append(asmtypes.set_factory('report', [report_info]))
            contigsets = [fset for fset in uploaded_fsets if fset.type == 'contigs' or fset.type == 'scaffolds']
            download_ids = {fi['filename']: fi['shock_id'] for fset in uploaded_fsets for fi in fset['file_infos']}
            contig_ids = {fi['filename']: fi['shock_id'] for fset in contigsets for fi in fset['file_infos']}
            self.metadata.update_job(uid, 'result_data_legacy', [download_ids])
            self.metadata.update_job(uid, 'contig_ids', [contig_ids])
            ###################

            sys.stdout.flush()
            touch(os.path.join(jobpath, "_DONE_"))
            logger.info('============== JOB COMPLETE ===============')

        except asmtypes.ArastUserInterrupt:
            status = 'Terminated by user'
            sys.stdout.flush()
            touch(os.path.join(jobpath, "_CANCELLED__"))
            logger.info('============== JOB KILLED ===============')

        finally:
            self.remove_job_from_lists(job_data)
            logger.debug('Reinitialize plugin manager...') # Reinitialize to get live changes
            self.pmanager = ModuleManager(self.threads, self.kill_list, self.kill_list_lock, self.job_list, self.binpath, self.modulebin)

        self.metadata.update_job(uid, 'status', status)
Example #9
class ArastConsumer:
    def __init__(self, shockurl, arasturl, config, threads, queue, kill_queue, job_list, ctrl_conf):
        self.parser = SafeConfigParser()
        self.parser.read(config)
        self.job_list = job_list
        # Load plugins
        self.pmanager = ModuleManager(threads, kill_queue, job_list)

        # Set up environment
        self.shockurl = shockurl
        self.arasturl = arasturl
        self.datapath = self.parser.get('compute','datapath')
        if queue:
            self.queue = queue
            print('Using queue:{}'.format(self.queue))
        else:
            self.queue = self.parser.get('rabbitmq','default_routing_key')
        self.min_free_space = float(self.parser.get('compute','min_free_space'))
        m = ctrl_conf['meta']        
        a = ctrl_conf['assembly']
        
        self.metadata = meta.MetadataConnection(arasturl, int(a['mongo_port']), m['mongo.db'],
                                                m['mongo.collection'], m['mongo.collection.auth'])
        self.gc_lock = multiprocessing.Lock()

    def garbage_collect(self, datapath, user, required_space):
        """ Monitor space of disk containing DATAPATH and delete files if necessary."""
        self.gc_lock.acquire()
        s = os.statvfs(datapath)
        free_space = float(s.f_bsize * s.f_bavail)
        logging.debug("Free space in bytes: %s" % free_space)
        logging.debug("Required space in bytes: %s" % required_space)
        while ((free_space - self.min_free_space) < required_space):
            #Delete old data
            dirs = os.listdir(os.path.join(datapath, user))
            times = []
            for dir in dirs:
                times.append(os.path.getmtime(os.path.join(datapath, user, dir)))
            if len(dirs) > 0:
                old_dir = os.path.join(datapath, user, dirs[times.index(min(times))])
                shutil.rmtree(old_dir, ignore_errors=True)
            else:
                logging.error("No more directories to remove")
                break
            logging.info("Space required.  %s removed." % old_dir)
            s = os.statvfs(datapath)
            free_space = float(s.f_bsize * s.f_bavail)
            logging.debug("Free space in bytes: %s" % free_space)
        self.gc_lock.release()


    def get_data(self, body):
        """Get data from cache or Shock server."""
        params = json.loads(body)
        if 'assembly_data' in params:
            logging.info('New Data Format')
            return self._get_data(body)
        else:
            return self._get_data_old(body)

    def _get_data(self, body):
        params = json.loads(body)
        filepath = os.path.join(self.datapath, params['ARASTUSER'],
                                str(params['data_id']))
        datapath = filepath
        filepath += "/raw/"
        all_files = []
        user = params['ARASTUSER']
        token = params['oauth_token']
        uid = params['_id']

        ##### Get data from ID #####
        data_doc = self.metadata.get_doc_by_data_id(params['data_id'], params['ARASTUSER'])
        if not data_doc:
            raise Exception('Invalid Data ID: {}'.format(params['data_id']))

        if 'kbase_assembly_input' in data_doc:
            params['assembly_data'] = kb_to_asm(data_doc['kbase_assembly_input'])
        elif 'assembly_data' in data_doc:
            params['assembly_data'] = data_doc['assembly_data']

        ##### Get data from assembly_data #####
        self.metadata.update_job(uid, 'status', 'Data transfer')
        try:
            os.makedirs(filepath)
        except OSError:
            pass

        ### TODO Garbage collect ###
        download_url = 'http://{}'.format(self.shockurl)
        file_sets = params['assembly_data']['file_sets']
        for file_set in file_sets:
            file_set['files'] = [] #legacy
            for file_info in file_set['file_infos']:
                local_file = os.path.join(filepath, file_info['filename'])
                if os.path.exists(local_file):
                    logging.info("Requested data exists on node: {}".format(local_file))
                else:
                    local_file = self.download(download_url, user, token, 
                                               file_info['shock_id'], filepath)
                file_info['local_file'] = local_file
                file_set['files'].append(local_file) #legacy
            all_files.append(file_set)
        return datapath, all_files                    

    def _get_data_old(self, body):
        params = json.loads(body)
        #filepath = self.datapath + str(params['data_id'])
        filepath = os.path.join(self.datapath, params['ARASTUSER'],
                                str(params['data_id']))
        datapath = filepath
        filepath += "/raw/"
        all_files = []

        uid = params['_id']
        job_id = params['job_id']
        user = params['ARASTUSER']

        data_doc = self.metadata.get_doc_by_data_id(params['data_id'], params['ARASTUSER'])
        if data_doc:
            paired = data_doc['pair']
            single = data_doc['single']
            files = data_doc['filename']
            ids = data_doc['ids']
            token = params['oauth_token']
            try:
                ref = data_doc['reference']
            except:
                pass
        else:
            self.metadata.update_job(uid, 'status', 'Invalid Data ID')
            raise Exception('Data {} does not exist on Shock Server'.format(
                    params['data_id']))

        all_files = []
        if os.path.isdir(filepath):
            logging.info("Requested data exists on node")
            try:
                for l in paired:
                    filedict = {'type':'paired', 'files':[]}
                    for word in l:
                        if is_filename(word):
                            baseword = os.path.basename(word)
                            filedict['files'].append(
                                extract_file(os.path.join(filepath,  baseword)))
                        else:
                            kv = word.split('=')
                            filedict[kv[0]] = kv[1]
                    all_files.append(filedict)
            except:
                logging.info('No paired files submitted')

            try:
                for seqfiles in single:
                    for wordpath in seqfiles:
                        filedict = {'type':'single', 'files':[]}    
                        if is_filename(wordpath):
                            baseword = os.path.basename(wordpath)
                            filedict['files'].append(
                                extract_file(os.path.join(filepath, baseword)))
                        else:
                            kv = wordpath.split('=')
                            filedict[kv[0]] = kv[1]
                        all_files.append(filedict)
            except:
                logging.info(format_tb(sys.exc_info()[2]))
                logging.info('No single files submitted!')
            
            try:
                for r in ref:
                    for wordpath in r:
                        filedict = {'type':'reference', 'files':[]}    
                        if is_filename(wordpath):
                            baseword = os.path.basename(wordpath)
                            filedict['files'].append(
                                extract_file(os.path.join(filepath, baseword)))
                        else:
                            kv = wordpath.split('=')
                            filedict[kv[0]] = kv[1]
                        all_files.append(filedict)
            except:
                logging.info(format_tb(sys.exc_info()[2]))
                logging.info('No reference files submitted!')
            
    
            touch(datapath)

        ## Data does not exist on current compute node
        else:
            self.metadata.update_job(uid, 'status', 'Data transfer')
            os.makedirs(filepath)

            # Get required space and garbage collect
            try:
                req_space = 0
                for file_size in data_doc['file_sizes']:
                    req_space += file_size
                self.garbage_collect(self.datapath, user, req_space)
            except:
                pass 
            url = "http://%s" % (self.shockurl)

            try:
                for l in paired:
                    #FILEDICT contains a single read library's info
                    filedict = {'type':'paired', 'files':[]}
                    for word in l:
                        if is_filename(word):
                            baseword = os.path.basename(word)
                            dl = self.download(url, user, token, 
                                               ids[files.index(baseword)], filepath)
                            if shock.parse_handle(dl): #Shock handle, get real data
                                logging.info('Found shock handle, getting real data...')
                                s_addr, s_id = shock.parse_handle(dl)
                                s_url = 'http://{}'.format(s_addr)
                                real_file = self.download(s_url, user, token, 
                                                          s_id, filepath)
                                filedict['files'].append(real_file)
                            else:
                                filedict['files'].append(dl)
                        elif re.search('=', word):
                            kv = word.split('=')
                            filedict[kv[0]] = kv[1]
                    all_files.append(filedict)
            except:
                logging.info(format_exc())
                logging.info('No paired files submitted')

            try:
                for seqfiles in single:
                    for wordpath in seqfiles:
                        filedict = {'type':'single', 'files':[]}
                        # Parse user directories
                        try:
                            path, word = wordpath.rsplit('/', 1)
                            path += '/'
                        except:
                            word = wordpath
                            path = ''

                        if is_filename(word):
                            baseword = os.path.basename(word)
                            dl = self.download(url, user, token, 
                                               ids[files.index(baseword)], filepath)
                            if shock.parse_handle(dl): #Shock handle, get real data
                                logging.info('Found shock handle, getting real data...')
                                s_addr, s_id = shock.parse_handle(dl)
                                s_url = 'http://{}'.format(s_addr)
                                real_file = self.download(s_url, user, token, 
                                                          s_id, filepath)
                                filedict['files'].append(real_file)
                            else:
                                filedict['files'].append(dl)
                        elif re.search('=', word):
                            kv = word.split('=')
                            filedict[kv[0]] = kv[1]
                        all_files.append(filedict)
            except:
                logging.info(format_exc())
                logging.info('No single end files submitted')

            try:
                for r in ref:
                    for wordpath in r:
                        filedict = {'type':'reference', 'files':[]}
                        # Parse user directories
                        try:
                            path, word = wordpath.rsplit('/', 1)
                            path += '/'
                        except:
                            word = wordpath
                            path = ''

                        if is_filename(word):
                            baseword = os.path.basename(word)
                            dl = self.download(url, user, token, 
                                               ids[files.index(baseword)], filepath)
                            if shock.parse_handle(dl): #Shock handle, get real data
                                logging.info('Found shock handle, getting real data...')
                                s_addr, s_id = shock.parse_handle(dl)
                                s_url = 'http://{}'.format(s_addr)
                                real_file = self.download(s_url, user, token, 
                                                          s_id, filepath)
                                filedict['files'].append(real_file)
                            else:
                                filedict['files'].append(dl)
                        elif re.search('=', word):
                            kv = word.split('=')
                            filedict[kv[0]] = kv[1]
                        all_files.append(filedict)
            except:
                #logging.info(format_exc(sys.exc_info()))
                logging.info('No reference files submitted')

        print all_files
        return datapath, all_files


    def compute(self, body):
        error = False
        params = json.loads(body)
        job_id = params['job_id']
        uid = params['_id']
        user = params['ARASTUSER']
        token = params['oauth_token']
        pipelines = params['pipeline']

        #support legacy arast client
        if len(pipelines) > 0:
            if type(pipelines[0]) is not list:
                pipelines = [pipelines]
                
        ### Download files (if necessary)
        datapath, all_files = self.get_data(body)
        rawpath = datapath + '/raw/'
        jobpath = os.path.join(datapath, str(job_id))
        try:
            os.makedirs(jobpath)
        except:
            raise Exception ('Data Error')

        ### Create job log
        self.out_report_name = '{}/{}_report.txt'.format(jobpath, str(job_id))
        self.out_report = open(self.out_report_name, 'w')

        ### Create data to pass to pipeline
        reads = []
        reference = []
        for fileset in all_files:
            if len(fileset['files']) != 0:
                if (fileset['type'] == 'single' or 
                    fileset['type'] == 'paired'):
                    reads.append(fileset)
                elif fileset['type'] == 'reference':
                    reference.append(fileset)
                else:
                    raise Exception('fileset error')

        job_data = ArastJob({'job_id' : params['job_id'], 
                    'uid' : params['_id'],
                    'user' : params['ARASTUSER'],
                    'reads': reads,
                    'reference': reference,
                    'initial_reads': list(reads),
                    'raw_reads': copy.deepcopy(reads),
                    'processed_reads': list(reads),
                    'pipeline_data': {},
                    'datapath': datapath,
                    'out_report' : self.out_report,
                    'logfiles': []})

        self.out_report.write("Arast Pipeline: Job {}\n".format(job_id))
        self.job_list.append(job_data)
        self.start_time = time.time()
        self.done_flag = threading.Event()
        timer_thread = UpdateTimer(self.metadata, 29, time.time(), uid, self.done_flag)
        timer_thread.start()
        
        download_ids = {}
        contig_ids = {}

        url = "http://%s" % (self.shockurl)
#        url += '/node'
        try:
            include_all_data = params['all_data']
        except:
            include_all_data = False
        contigs = not include_all_data
        status = ''

        ## TODO CHANGE: default pipeline
        default_pipe = ['velvet']
        exceptions = []

        if pipelines:
            try:
                if pipelines == ['auto']:
                    pipelines = [default_pipe,]
                for p in pipelines:
                    self.pmanager.validate_pipe(p)

                result_files, summary, contig_files, exceptions = self.run_pipeline(pipelines, job_data, contigs_only=contigs)
                for i, f in enumerate(result_files):
                    #fname = os.path.basename(f).split('.')[0]
                    fname = str(i)
                    res = self.upload(url, user, token, f)
                    download_ids[fname] = res['data']['id']
                    
                for c in contig_files:
                    fname = os.path.basename(c).split('.')[0]
                    res = self.upload(url, user, token, c, filetype='contigs')
                    contig_ids[fname] = res['data']['id']

                # Check if job completed with no errors
                if exceptions:
                    status = 'Complete with errors'
                elif not summary:
                    status = 'Complete: No valid contigs'
                else:
                    status += "Complete"
                self.out_report.write("Pipeline completed successfully\n")
            except:
                traceback = format_exc(sys.exc_info())
                status = "[FAIL] {}".format(sys.exc_info()[1])
                print traceback
                self.out_report.write("ERROR TRACE:\n{}\n".
                                      format(format_tb(sys.exc_info()[2])))

        # Format report
        for i, job in enumerate(self.job_list):
            if job['user'] == job_data['user'] and job['job_id'] == job_data['job_id']:
                self.job_list.pop(i)
        self.done_flag.set()
        new_report = open('{}.tmp'.format(self.out_report_name), 'w')

        ### Log exceptions
        if len(exceptions) > 0:
            new_report.write('PIPELINE ERRORS\n')
            for i,e in enumerate(exceptions):
                new_report.write('{}: {}\n'.format(i, e))
        try:
            for summary_file in summary:
                with open(summary_file) as s:
                    new_report.write(s.read())
        except:
            new_report.write('No Summary File Generated!\n\n\n')
        self.out_report.close()
        with open(self.out_report_name) as old:
            new_report.write(old.read())
        new_report.close()
        os.remove(self.out_report_name)
        shutil.move(new_report.name, self.out_report_name)
        res = self.upload(url, user, token, self.out_report_name)
        download_ids['report'] = res['data']['id']

        # Get location
        self.metadata.update_job(uid, 'result_data', download_ids)
        self.metadata.update_job(uid, 'contig_ids', contig_ids)
        self.metadata.update_job(uid, 'status', status)

        print '=========== JOB COMPLETE ============'

    def update_time_record(self):
        elapsed_time = time.time() - self.start_time
        ftime = str(datetime.timedelta(seconds=int(elapsed_time)))
        self.metadata.update_job(uid, 'computation_time', ftime)
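
update_time_record formats the elapsed wall-clock time as H:MM:SS with datetime.timedelta; note that uid is not defined inside this method in this excerpt, so it presumably relies on the surrounding job context. A minimal sketch of the formatting itself, with hypothetical timings:

import datetime
import time

start_time = time.time() - 83                                     # pretend the job started 83 s ago
ftime = str(datetime.timedelta(seconds=int(time.time() - start_time)))
print(ftime)                                                      # -> '0:01:23'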

    def run_pipeline(self, pipes, job_data, contigs_only=True):
        """
        Runs all pipelines in list PIPES
        """
        all_pipes = []
        for p in pipes:
            all_pipes += self.pmanager.parse_input(p)
        logging.info('{} pipelines:'.format(len(all_pipes)))
        for p in all_pipes:
            print '->'.join(p)
        #include_reads = self.pmanager.output_type(pipeline[-1]) == 'reads'
        include_reads = False
        pipeline_num = 1
        all_files = []
        pipe_outputs = []
        logfiles = []
        ale_reports = {}
        final_contigs = []
        final_scaffolds = []
        output_types = []
        exceptions = []
        num_pipes = len(all_pipes)
        for pipe in all_pipes:
            try:
                #job_data = copy.deepcopy(job_data_global)
                #job_data['out_report'] = job_data_global['out_report'] 
                pipeline, overrides = self.pmanager.parse_pipe(pipe)
                job_data.add_pipeline(pipeline_num, pipeline)
                num_stages = len(pipeline)
                pipeline_stage = 1
                pipeline_results = []
                cur_outputs = []

                # Reset job data 
                job_data['reads'] = copy.deepcopy(job_data['raw_reads'])
                job_data['processed_reads'] = []
                print job_data

                self.out_report.write('\n{0} Pipeline {1}: {2} {0}\n'.format('='*15, pipeline_num, pipe))
                pipe_suffix = '' # filename code for indiv pipes
                pipe_start_time = time.time()
                pipe_alive = True

                # Store data record for pipeline

                for module_name in pipeline:
                    if not pipe_alive:
                        self.out_report.write('\n{0} Module Failure, Killing Pipe {0}'.format(
                                'X'*10))
                        break
                    module_code = '' # unique code for data reuse
                    print '\n\n{0} Running module: {1} {2}'.format(
                        '='*20, module_name, '='*(35-len(module_name)))
                    self.garbage_collect(self.datapath, job_data['user'], 2147483648) # 2GB

                    ## PROGRESS CALCULATION
                    pipes_complete = (pipeline_num - 1) / float(num_pipes)
                    stage_complete = (pipeline_stage - 1) / float(num_stages)
                    pct_segment = 1.0 / num_pipes
                    stage_complete *= pct_segment
                    total_complete = pipes_complete + stage_complete
                    cur_state = 'Running:[{}%|P:{}/{}|S:{}/{}|{}]'.format(
                        int(total_complete * 100), pipeline_num, num_pipes,
                        pipeline_stage, num_stages, module_name)
                    self.metadata.update_job(job_data['uid'], 'status', cur_state)
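                    ## Worked example of the progress calculation above: with
                    ## num_pipes=2 and num_stages=3, at pipeline 2 / stage 2 we get
                    ## pipes_complete = 0.50, stage_complete = (1/3) * 0.50 ~= 0.17,
                    ## total_complete ~= 0.67 -> 'Running:[66%|P:2/2|S:2/3|<module>]'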

                    ## LOG REPORT For now, module code is 1st and last letter
                    short_name = self.pmanager.get_short_name(module_name)
                    if short_name:
                        #pipe_suffix += short_name.capitalize()
                        module_code += short_name.capitalize()
                    else:
                        #pipe_suffix += module_name[0].upper() + module_name[-1]
                        module_code += module_name[0].upper() + module_name[-1]
                    mod_overrides = overrides[pipeline_stage - 1]
                    for k in mod_overrides.keys():
                        #pipe_suffix += '_{}{}'.format(k[0], par[k])
                        module_code += '_{}{}'.format(k[0], mod_overrides[k])
                    pipe_suffix += module_code
                    self.out_report.write('PIPELINE {} -- STAGE {}: {}\n'.format(
                            pipeline_num, pipeline_stage, module_name))
                    logging.debug('New job_data for stage {}: {}'.format(
                            pipeline_stage, job_data))
                    job_data['params'] = overrides[pipeline_stage-1].items()
                    module_start_time = time.time()
                    ## RUN MODULE
                    # Check if output data exists
                    reuse_data = False
                    enable_reuse = True # KILL SWITCH
                    if enable_reuse:
                        for k, pipe in enumerate(pipe_outputs):
                            if reuse_data:
                                break
                            if not pipe:
                                continue
                            # Check that all previous pipes match
                            for i in range(pipeline_stage):
                                try:
                                    if not pipe[i][0] == cur_outputs[i][0]:
                                        break
                                except:
                                    pass
                                try:
                                    if (pipe[i][0] == module_code and i == pipeline_stage - 1):
                                        #and overrides[i].items() == job_data['params']): #copy!
                                        print('Found previously computed data, reusing {}.'.format(
                                                module_code))
                                        output = [] + pipe[i][1]
                                        pfix = (k+1, i+1)
                                        alldata = [] + pipe[i][2]
                                        reuse_data = True
                                        # Reuse the elapsed time recorded for this stage by the
                                        # previously computed pipeline (time.time() takes no arguments)
                                        job_data.get_pipeline(pipeline_num).get_module(
                                            pipeline_stage)['elapsed_time'] = job_data.get_pipeline(
                                                k + 1).get_module(pipeline_stage)['elapsed_time']

                                        break
                                except: # Previous pipes may be shorter
                                    pass

                    output_type = self.pmanager.output_type(module_name)

                    if not reuse_data:
                        output, alldata, mod_log = self.pmanager.run_module(
                            module_name, job_data, all_data=True, reads=include_reads)

                        ##### Module produced no output, attach log and proceed to next #####
                        if not output:
                            pipe_alive = False
                            try:
                                print mod_log
                                logfiles.append(mod_log)
                            except:
                                print 'error attaching ', mod_log
                            break


                        ##### Prefix outfiles with pipe stage (only assembler modules) #####
                        alldata = [asm.prefix_file_move(
                                file, "P{}_S{}_{}".format(pipeline_num, pipeline_stage, module_name)) 
                                    for file in alldata]
                        module_elapsed_time = time.time() - module_start_time
                        job_data.get_pipeline(pipeline_num).get_module(
                            pipeline_stage)['elapsed_time'] = module_elapsed_time


                        if alldata: #If log was renamed
                            mod_log = asm.prefix_file(mod_log, "P{}_S{}_{}".format(
                                    pipeline_num, pipeline_stage, module_name))

                    if output_type == 'contigs' or output_type == 'scaffolds': #Assume assembly contigs
                        if reuse_data:
                            p_num, p_stage = pfix
                        else:
                            p_num, p_stage = pipeline_num, pipeline_stage

                        # If plugin returned scaffolds
                        if type(output) is tuple and len(output) == 2:
                            out_contigs = output[0]
                            out_scaffolds = output[1]
                            cur_scaffolds = [asm.prefix_file(
                                    file, "P{}_S{}_{}".format(p_num, p_stage, module_name)) 
                                        for file in out_scaffolds]
                        else:
                            out_contigs = output
                        cur_contigs = [asm.prefix_file(
                                file, "P{}_S{}_{}".format(p_num, p_stage, module_name)) 
                                    for file in out_contigs]

                        #job_data['reads'] = asm.arast_reads(alldata)
                        job_data['contigs'] = cur_contigs

                    elif output_type == 'reads': #Assume preprocessing
                        if include_reads and reuse_data: # data was prefixed and moved
                            for d in output:
                                files = [asm.prefix_file(f, "P{}_S{}_{}".format(
                                            pipeline_num, pipeline_stage, module_name)) for f in d['files']]
                                d['files'] = files
                                d['short_reads'] = [] + files
                        job_data['reads'] = output
                        job_data['processed_reads'] = list(job_data['reads'])
                        
                    else: # Generic return, don't use in further stages
                        pipeline_results += output
                        logging.info('Generic plugin output: {}'.format(output))
   

                    if pipeline_stage == num_stages: # Last stage, add contig for assessment
                        if output and (output_type == 'contigs' or output_type == 'scaffolds'): #If a contig was produced
                            fcontigs = cur_contigs
                            rcontigs = [asm.rename_file_symlink(f, 'P{}_{}'.format(
                                        pipeline_num, pipe_suffix)) for f in fcontigs]
                            try:
                                rscaffolds = [asm.rename_file_symlink(f, 'P{}_{}_{}'.format(
                                            pipeline_num, pipe_suffix, 'scaff')) for f in cur_scaffolds]
                                if rscaffolds:
                                    scaffold_data = {'files': rscaffolds, 'name': pipe_suffix}
                                    final_scaffolds.append(scaffold_data)
                                    output_types.append(output_type)
                            except:
                                pass
                            if rcontigs:
                                contig_data = {'files': rcontigs, 'name': pipe_suffix, 'alignment_bam': []}
                                final_contigs.append(contig_data)
                                output_types.append(output_type)
                    try:
                        logfiles.append(mod_log)
                    except:
                        print 'error attaching ', mod_log
                    pipeline_stage += 1
                    cur_contigs = []
                    cur_scaffolds = []

                    cur_outputs.append([module_code, output, alldata])
                pipe_elapsed_time = time.time() - pipe_start_time
                pipe_ftime = str(datetime.timedelta(seconds=int(pipe_elapsed_time)))
                job_data.get_pipeline(pipeline_num)['elapsed_time'] = pipe_elapsed_time



                if not output:
                    self.out_report.write('ERROR: No contigs produced. See module log\n')
                else:

                    ## Assessment
                    #self.pmanager.run_module('reapr', job_data)
                    #print job_data
                    # TODO reapr break may be diff from final reapr align!
                    # ale_out, _, _ = self.pmanager.run_module('ale', job_data)
                    # if ale_out:
                    #     job_data.get_pipeline(pipeline_num).import_ale(ale_out)
                    #     ale_reports[pipe_suffix] = ale_out
                    pipeline_datapath = '{}/{}/pipeline{}/'.format(job_data['datapath'], 
                                                                   job_data['job_id'],
                                                                   pipeline_num)
                    try:
                        os.makedirs(pipeline_datapath)
                    except:
                        logging.info("{} exists, skipping mkdir".format(pipeline_datapath))

                    # all_files.append(asm.tar_list(pipeline_datapath, pipeline_results, 
                    #                     'pipe{}_{}.tar.gz'.format(pipeline_num, pipe_suffix)))

                    all_files += pipeline_results

                self.out_report.write('Pipeline {} total time: {}\n\n'.format(pipeline_num, pipe_ftime))
                job_data.get_pipeline(pipeline_num)['name'] = pipe_suffix
                pipe_outputs.append(cur_outputs)
                pipeline_num += 1

            except:
                print "ERROR: Pipeline #{} Failed".format(pipeline_num)
                print format_exc(sys.exc_info())
                e = str(sys.exc_info()[1])
                if e.find('Terminated') != -1:
                    raise Exception(e)
                exceptions.append(module_name + ':\n' + str(sys.exc_info()[1]))
                pipeline_num += 1

        ## ANALYSIS: Quast
        job_data['final_contigs'] = final_contigs
        job_data['final_scaffolds'] = final_scaffolds
        job_data['params'] = [] #clear overrides from last stage

        summary = []  # Quast reports for contigs and scaffolds
        try: #Try to assess, otherwise report pipeline errors
            if job_data['final_contigs']:
                job_data['contig_type'] = 'contigs'
                quast_report, quast_tar, z1, q_log = self.pmanager.run_module('quast', job_data,
                                                                              tar=True, meta=True)
                if quast_report:
                    summary.append(quast_report[0])
                with open(q_log) as infile:
                    self.out_report.write(infile.read())
            else:
                quast_report, quast_tar = '', ''

            if job_data['final_scaffolds']:
                scaff_data = dict(job_data)
                scaff_data['final_contigs'] = job_data['final_scaffolds']
                scaff_data['contig_type'] = 'scaffolds'
                scaff_report, scaff_tar, _, scaff_log = self.pmanager.run_module('quast', scaff_data, 
                                                                          tar=True, meta=True)
                scaffold_quast = True
                if scaff_report:
                    summary.append(scaff_report[0])
                with open(scaff_log) as infile:
                    self.out_report.write('\n Quast Report - Scaffold Mode \n')
                    self.out_report.write(infile.read())
            else:
                scaffold_quast = False
        except:
            if exceptions:
                if len(exceptions) > 1:
                    raise Exception('Multiple Errors')
                else:
                    raise Exception(exceptions[0])
            else:
                raise Exception(str(sys.exc_info()[1]))


        ## CONCAT MODULE LOG FILES
        self.out_report.write("\n\n{0} Begin Module Logs {0}\n".format("="*10))
        for log in logfiles:
            self.out_report.write("\n\n{0} Begin Module {0}\n".format("="*10))
            try:
                with open(log) as infile:
                    self.out_report.write(infile.read())
            except:
                self.out_report.write("Error writing log file")



        ## Format Returns
        ctg_analysis = quast_tar.rsplit('/', 1)[0] + '/{}_ctg_qst.tar.gz'.format(job_data['job_id'])
        try:
            os.rename(quast_tar, ctg_analysis)
            return_files = [ctg_analysis]
        except:
            #summary = ''
            return_files = []

        if scaffold_quast:
            scf_analysis = scaff_tar.rsplit('/', 1)[0] + '/{}_scf_qst.tar.gz'.format(job_data['job_id'])
            #summary = quast_report[0]
            os.rename(scaff_tar, scf_analysis)
            return_files.append(scf_analysis)

        contig_files = []
        for data in final_contigs + final_scaffolds:
            for f in data['files']:
                contig_files.append(os.path.realpath(f))

        return_files += all_files

        ## Deduplicate, preserving the original order
        seen = set()
        deduped = []
        for f in return_files:
            if f not in seen:
                seen.add(f)
                deduped.append(f)
        return_files = deduped

        #if exceptions:        
            # if len(exceptions) > 1:
            #     raise Exception('Multiple Errors')
            # else:
            #     raise Exception(exceptions[0])

        if contig_files:
            return_files.append(asm.tar_list('{}/{}'.format(job_data['datapath'], job_data['job_id']),
                                             contig_files, '{}_assemblies.tar.gz'.format(
                        job_data['job_id'])))
        print "return files: {}".format(return_files)

        return return_files, summary, contig_files, exceptions


    def upload(self, url, user, token, file, filetype='default'):
        files = {}
        files["file"] = (os.path.basename(file), open(file, 'rb'))
        logging.debug("Message sent to shock on upload: %s" % files)
        sclient = shock.Shock(url, user, token)
        if filetype == 'default':
            res = sclient.upload_misc(file, 'default')
        elif filetype == 'contigs':
            res = sclient.upload_contigs(file)
        return res

    def download(self, url, user, token, node_id, outdir):
        sclient = shock.Shock(url, user, token)
        downloaded = sclient.curl_download_file(node_id, outdir=outdir)
        return extract_file(downloaded)

    def fetch_job(self):
        connection = pika.BlockingConnection(pika.ConnectionParameters(
                host = self.arasturl))
        channel = connection.channel()
        channel.basic_qos(prefetch_count=1)
        result = channel.queue_declare(queue=self.queue,
                                       exclusive=False,
                                       auto_delete=False,
                                       durable=True)

        logging.basicConfig(format="%(asctime)s {} %(levelname)-8s %(message)s".format(proc().name))
        print proc().name, ' [*] Fetching job...'

        channel.basic_qos(prefetch_count=1)
        channel.basic_consume(self.callback,
                              queue=self.queue)


        channel.start_consuming()

    def callback(self, ch, method, properties, body):
        print " [*] %r:%r" % (method.routing_key, body)
        params = json.loads(body)
        job_doc = self.metadata.get_job(params['ARASTUSER'], params['job_id'])
        uid = job_doc['_id']
        ## Check if job was not killed
        if job_doc['status'] == 'Terminated':
            print 'Job {} was killed, skipping'.format(params['job_id'])
        else:
            try:
                self.compute(body)
            except:
                print sys.exc_info()
                status = "[FAIL] {}".format(format_tb(sys.exc_info()[2]))
                logging.error(status)
                self.metadata.update_job(uid, 'status', status)
        ch.basic_ack(delivery_tag=method.delivery_tag)

    def start(self):
        self.fetch_job()
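
The consumer above follows the usual RabbitMQ work-queue pattern: prefetch_count=1 keeps a single unacknowledged message per worker, and the delivery is acknowledged only after compute() returns, so the broker can redeliver a job if the worker dies mid-run. A minimal standalone sketch of that pattern, using the pre-1.0 pika call signature seen here (queue name and handler are hypothetical):

import pika

def handle(ch, method, properties, body):
    print('got job: %r' % body)                          # hypothetical job handler
    ch.basic_ack(delivery_tag=method.delivery_tag)       # ack only after the work is done

connection = pika.BlockingConnection(pika.ConnectionParameters(host='localhost'))
channel = connection.channel()
channel.queue_declare(queue='arast.jobs', durable=True)  # hypothetical queue
channel.basic_qos(prefetch_count=1)                      # at most one in-flight job per worker
channel.basic_consume(handle, queue='arast.jobs')        # pika < 1.0 signature
channel.start_consuming()
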
Example #10
0
class ArastConsumer:
    def __init__(self, shockurl, rmq_host, rmq_port, mongo_host, mongo_port,
                 config, threads, queues, kill_list, kill_list_lock, job_list,
                 job_list_lock, ctrl_conf, datapath, binpath, modulebin):
        self.parser = SafeConfigParser()
        self.parser.read(config)
        self.kill_list = kill_list
        self.kill_list_lock = kill_list_lock
        self.job_list = job_list
        self.job_list_lock = job_list_lock
        # Load plugins
        self.threads = threads
        self.binpath = binpath
        self.modulebin = modulebin
        self.pmanager = ModuleManager(threads, kill_list, kill_list_lock,
                                      job_list, binpath, modulebin)

        # Set up environment
        self.shockurl = shockurl
        self.datapath = datapath
        self.rmq_host = rmq_host
        self.rmq_port = rmq_port
        self.mongo_host = mongo_host
        self.mongo_port = mongo_port
        self.queues = queues
        self.min_free_space = float(
            self.parser.get('compute', 'min_free_space'))
        self.data_expiration_days = float(
            self.parser.get('compute', 'data_expiration_days'))
        m = ctrl_conf['meta']
        a = ctrl_conf['assembly']

        collections = {
            'jobs': m.get('mongo.collection', 'jobs'),
            'auth': m.get('mongo.collection.auth', 'auth'),
            'data': m.get('mongo.collection.data', 'data'),
            'running': m.get('mongo.collection.running', 'running_jobs')
        }

        ###### TODO Use REST API
        self.metadata = meta.MetadataConnection(self.mongo_host,
                                                self.mongo_port, m['mongo.db'],
                                                collections)
        self.gc_lock = multiprocessing.Lock()

    def garbage_collect(self, datapath, required_space, user, job_id, data_id):
        """ Monitor space of disk containing DATAPATH and delete files if necessary."""
        datapath = self.datapath
        required_space = self.min_free_space
        expiration = self.data_expiration_days

        ### Remove expired directories
        def can_remove(d, user, job_id, data_id):
            u, data, j = d.split('/')[-4:-1]
            if u == user and j == str(job_id):
                return False
            if data == str(data_id) and j == 'raw':
                return False
            if os.path.isdir(d):
                return True
            return False

        dir_depth = 3
        dirs = filter(lambda f: can_remove(f, user, job_id, data_id),
                      glob.glob(datapath + '/' + '*/' * dir_depth))
        removed = []
        logger.info(
            'Searching for directories older than {} days'.format(expiration))
        for d in dirs:
            file_modified = None
            try:
                file_modified = datetime.datetime.fromtimestamp(
                    os.path.getmtime(d))
            except os.error as e:
                logger.warning(
                    'GC ignored "{}": could not get timestamp: {}'.format(
                        d, e))
                continue
            tdiff = datetime.datetime.now() - file_modified
            if tdiff > datetime.timedelta(days=expiration):
                logger.info(
                    'GC: removing expired directory: {} (modified {} ago)'.
                    format(d, tdiff))
                removed.append(d)
                shutil.rmtree(d, ignore_errors=True)
            else:
                logger.debug('GC: not removing: {} (modified {} ago)'.format(
                    d, tdiff))
        for r in removed:
            dirs.remove(r)

        ### Check free space and remove old directories
        free_space = free_space_in_path(datapath)
        logger.info("Required space in GB: {} (free = {})".format(
            required_space, free_space))

        times = []
        for d in dirs:
            try:
                t = os.path.getmtime(d)
                times.append([t, d])
            except:
                pass
        times.sort()
        logger.debug("Directories sorted by time: {}".format(times))
        dirs = [x[1] for x in times]

        busy_dirs = []
        while free_space < self.min_free_space and len(dirs) > 0:
            d = dirs.pop(0)
            if is_dir_busy(d):
                busy_dirs.append(d)
            else:
                free_space = self.remove_dir(d)

        while free_space < self.min_free_space:
            if len(busy_dirs) == 0:
                logger.error(
                    "GC: free space {} < {} GB; waiting for system space to be available..."
                    .format(free_space, self.min_free_space))
                time.sleep(60)
            else:
                logger.warning(
                    "GC: free space {} < {} GB; waiting for jobs to complete to reclaim space: {} busy directories..."
                    .format(free_space, self.min_free_space, len(busy_dirs)))
                checked_dirs = []
                while free_space < self.min_free_space and len(busy_dirs) > 0:
                    bd = busy_dirs.pop(0)
                    if is_dir_busy(bd):
                        checked_dirs.append(bd)
                        continue
                    free_space = self.remove_dir(bd)
                    # self.remove_empty_dirs()
                if free_space < self.min_free_space:
                    busy_dirs = checked_dirs
                    time.sleep(20)
            free_space = free_space_in_path(self.datapath)

        self.remove_empty_dirs()

    def remove_dir(self, d):
        shutil.rmtree(d, ignore_errors=True)
        logger.info("GC: space required; %s removed." % d)
        return free_space_in_path(self.datapath)
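
remove_dir and the free-space checks above call a free_space_in_path helper that is not shown in this listing. Judging from the statvfs calculation in the older garbage collector (Example #12 below) and the "Required space in GB" log message, a typical implementation would look roughly like this sketch, with GB units assumed:

import os

def free_space_in_path(path):
    """Free space of the filesystem containing path, in GB (assumed units)."""
    s = os.statvfs(path)
    return float(s.f_bsize * s.f_bavail) / (1024 ** 3)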

    def remove_empty_dirs(self):
        data_dirs = filter(lambda f: os.path.isdir(f),
                           glob.glob(self.datapath + '/' + '*/' * 2))
        for dd in data_dirs:
            if not os.listdir(dd):
                logger.info('GC: removing empty directory: {}'.format(dd))
                try:
                    os.rmdir(dd)
                except os.error as e:
                    logger.warning(
                        'GC: could not remove empty dir "{}": {}'.format(
                            dd, e))

    def get_data(self, body):
        """Get data from cache or Shock server."""
        params = json.loads(body)
        logger.debug('New Data Format')
        return self._get_data(body)

    def _get_data(self, body):
        params = json.loads(body)
        filepath = os.path.join(self.datapath, params['ARASTUSER'],
                                str(params['data_id']))
        datapath = filepath
        filepath += "/raw/"
        all_files = []
        user = params['ARASTUSER']
        job_id = params['job_id']
        data_id = params['data_id']
        token = params['oauth_token']
        uid = params['_id']

        self.gc_lock.acquire()
        try:
            self.garbage_collect(self.datapath, self.min_free_space, user,
                                 job_id, data_id)
        except:
            logger.error('Unexpected error in GC.')
            raise
        finally:
            self.gc_lock.release()

        ##### Get data from ID #####
        data_doc = self.metadata.get_data_docs(params['ARASTUSER'],
                                               params['data_id'])
        if not data_doc:
            raise Exception('Invalid Data ID: {}'.format(params['data_id']))
        logger.debug('data_doc = {}'.format(data_doc))
        if 'kbase_assembly_input' in data_doc:
            params['assembly_data'] = kb_to_asm(
                data_doc['kbase_assembly_input'])
        elif 'assembly_data' in data_doc:
            params['assembly_data'] = data_doc['assembly_data']

        ##### Get data from assembly_data #####
        self.metadata.update_job(uid, 'status', 'Data transfer')
        with ignored(OSError):
            os.makedirs(filepath)
            touch(filepath)

        file_sets = params['assembly_data']['file_sets']
        for file_set in file_sets:
            if file_set['type'] == 'paired_url':
                file_set['type'] = 'paired'
            elif file_set['type'] == 'single_url':
                file_set['type'] = 'single'
            elif file_set['type'] == 'reference_url':
                file_set['type'] = 'reference'
            file_set['files'] = []  #legacy
            for file_info in file_set['file_infos']:
                #### File is stored on Shock
                if file_info['filename']:
                    local_file = os.path.join(filepath, file_info['filename'])
                    if os.path.exists(local_file):
                        local_file = self.extract_file(local_file)
                        logger.info("Requested data exists on node: {}".format(
                            local_file))
                    else:
                        local_file = self.download_shock(
                            file_info['shock_url'], user, token,
                            file_info['shock_id'], filepath)

                elif file_info['direct_url']:
                    local_file = os.path.join(
                        filepath, os.path.basename(file_info['direct_url']))
                    if os.path.exists(local_file):
                        local_file = self.extract_file(local_file)
                        logger.info("Requested data exists on node: {}".format(
                            local_file))
                    else:
                        local_file = self.download_url(file_info['direct_url'],
                                                       filepath,
                                                       token=token)
                file_info['local_file'] = local_file
                if file_set['type'] == 'single' and asm.is_long_read_file(
                        local_file):
                    if not 'tags' in file_set:
                        file_set['tags'] = []
                    if not 'long_read' in file_set['tags']:
                        file_set['tags'].append(
                            'long_read')  # pacbio or nanopore reads
                file_set['files'].append(local_file)  #legacy
            all_files.append(file_set)
        return datapath, all_files
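
The *_url normalization at the top of the loop above maps upload-style fileset types onto the internal ones; a dictionary lookup is an equivalent, slightly terser way to express the same mapping (sketch only, with a hypothetical fileset):

TYPE_MAP = {'paired_url': 'paired', 'single_url': 'single', 'reference_url': 'reference'}

file_set = {'type': 'paired_url', 'file_infos': []}          # hypothetical input
file_set['type'] = TYPE_MAP.get(file_set['type'], file_set['type'])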

    def prepare_job_data(self, body):
        params = json.loads(body)
        job_id = params['job_id']

        ### Download files (if necessary)
        datapath, all_files = self.get_data(body)
        rawpath = datapath + '/raw/'
        jobpath = os.path.join(datapath, str(job_id))

        try:
            os.makedirs(jobpath)
        except OSError as e:
            if e.errno != errno.EEXIST:
                raise

        ### Protect data directory from GC before any job starts
        touch(os.path.join(rawpath, "_READY_"))

        ### Create job log
        self.out_report_name = '{}/{}_report.txt'.format(jobpath, str(job_id))
        self.out_report = open(self.out_report_name, 'w')

        ### Create data to pass to pipeline
        reads = []
        reference = []
        contigs = []
        for fileset in all_files:
            if len(fileset['files']) != 0:
                if (fileset['type'] == 'single'
                        or fileset['type'] == 'paired'):
                    reads.append(fileset)
                elif fileset['type'] == 'reference':
                    reference.append(fileset)
                elif fileset['type'] == 'contigs':
                    contigs.append(fileset)
                else:
                    raise Exception('fileset error')

        job_data = ArastJob({
            'job_id': params['job_id'],
            'uid': params['_id'],
            'user': params['ARASTUSER'],
            'reads': reads,
            'logfiles': [],
            'reference': reference,
            'contigs': contigs,
            'initial_reads': list(reads),
            'raw_reads': copy.deepcopy(reads),
            'params': [],
            'exceptions': [],
            'pipeline_data': {},
            'datapath': datapath,
            'out_report': self.out_report
        })

        self.out_report.write("Arast Pipeline: Job {}\n".format(job_id))

        return job_data
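
prepare_job_data and compute rely on a touch helper (used for the _READY_ and _DONE_ markers) that is not part of this listing; since it is also called on directories in _get_data, a typical implementation that simply updates the mtime, creating an empty file when the path is missing, might look like this sketch:

import os

def touch(path):
    """Update path's mtime, creating it as an empty file if it does not exist
    (sketch of a typical helper; the original is not shown here)."""
    if not os.path.exists(path):
        open(path, 'a').close()
    os.utime(path, None)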

    def compute(self, body):
        self.job_list_lock.acquire()
        try:
            job_data = self.prepare_job_data(body)
            self.job_list.append(job_data)
        except:
            logger.error("Error in adding new job to job_list")
            raise
        finally:
            self.job_list_lock.release()

        status = ''
        logger.debug('job_data = {}'.format(job_data))

        params = json.loads(body)
        job_id = params['job_id']
        data_id = params['data_id']
        uid = params['_id']
        user = params['ARASTUSER']
        token = params['oauth_token']
        pipelines = params.get('pipeline')
        recipe = params.get('recipe')
        wasp_in = params.get('wasp')
        jobpath = os.path.join(self.datapath, user, str(data_id), str(job_id))

        url = shock.verify_shock_url(self.shockurl)

        self.start_time = time.time()

        timer_thread = UpdateTimer(self.metadata, 29, time.time(), uid,
                                   self.done_flag)
        timer_thread.start()

        #### Parse pipeline to wasp exp
        reload(recipes)
        if recipe:
            try:
                wasp_exp = recipes.get(recipe[0], job_id)
            except AttributeError:
                raise Exception('"{}" recipe not found.'.format(recipe[0]))
        elif wasp_in:
            wasp_exp = wasp_in[0]
        elif not pipelines:
            wasp_exp = recipes.get('auto', job_id)
        elif pipelines:
            ## Legacy client
            if pipelines[0] == 'auto':
                wasp_exp = recipes.get('auto', job_id)
            ##########
            else:
                if type(pipelines[0]) is not list:  # --assemblers
                    pipelines = [pipelines]
                all_pipes = []
                for p in pipelines:
                    all_pipes += self.pmanager.parse_input(p)
                logger.debug("pipelines = {}".format(all_pipes))
                wasp_exp = wasp.pipelines_to_exp(all_pipes, params['job_id'])
        else:
            raise asmtypes.ArastClientRequestError('Malformed job request.')
        logger.debug('Wasp Expression: {}'.format(wasp_exp))
        w_engine = wasp.WaspEngine(self.pmanager, job_data, self.metadata)

        ###### Run Job
        try:
            w_engine.run_expression(wasp_exp, job_data)
            ###### Upload all result files and place them into appropriate tags
            uploaded_fsets = job_data.upload_results(url, token)

            # Format report
            new_report = open('{}.tmp'.format(self.out_report_name), 'w')

            ### Log errors
            if len(job_data['errors']) > 0:
                new_report.write('PIPELINE ERRORS\n')
                for i, e in enumerate(job_data['errors']):
                    new_report.write('{}: {}\n'.format(i, e))
            try:  ## Get Quast output
                quast_report = job_data['wasp_chain'].find_module(
                    'quast')['data'].find_type('report')[0].files[0]
                with open(quast_report) as q:
                    new_report.write(q.read())
            except:
                new_report.write('No Summary File Generated!\n\n\n')
            self.out_report.close()
            with open(self.out_report_name) as old:
                new_report.write(old.read())

            for log in job_data['logfiles']:
                new_report.write('\n{1} {0} {1}\n'.format(
                    os.path.basename(log), '=' * 20))
                with open(log) as l:
                    new_report.write(l.read())

            ### Log tracebacks
            if len(job_data['tracebacks']) > 0:
                new_report.write('EXCEPTION TRACEBACKS\n')
                for i, e in enumerate(job_data['tracebacks']):
                    new_report.write('{}: {}\n'.format(i, e))

            new_report.close()
            os.remove(self.out_report_name)
            shutil.move(new_report.name, self.out_report_name)
            res = self.upload(url, user, token, self.out_report_name)
            report_info = asmtypes.FileInfo(self.out_report_name,
                                            shock_url=url,
                                            shock_id=res['data']['id'])

            self.metadata.update_job(
                uid, 'report', [asmtypes.set_factory('report', [report_info])])
            status = 'Complete with errors' if job_data.get(
                'errors') else 'Complete'

            ## Make compatible with JSON dumps()
            del job_data['out_report']
            del job_data['initial_reads']
            del job_data['raw_reads']
            self.metadata.update_job(uid, 'data', job_data)
            self.metadata.update_job(uid, 'result_data', uploaded_fsets)
            ###### Legacy Support #######
            uploaded_fsets.append(
                asmtypes.set_factory('report', [report_info]))
            contigsets = [
                fset for fset in uploaded_fsets
                if fset.type == 'contigs' or fset.type == 'scaffolds'
            ]
            download_ids = {
                fi['filename']: fi['shock_id']
                for fset in uploaded_fsets for fi in fset['file_infos']
            }
            contig_ids = {
                fi['filename']: fi['shock_id']
                for fset in contigsets for fi in fset['file_infos']
            }
            self.metadata.update_job(uid, 'result_data_legacy', [download_ids])
            self.metadata.update_job(uid, 'contig_ids', [contig_ids])
            ###################

            sys.stdout.flush()
            touch(os.path.join(jobpath, "_DONE_"))
            logger.info('============== JOB COMPLETE ===============')

        except asmtypes.ArastUserInterrupt:
            status = 'Terminated by user'
            sys.stdout.flush()
            touch(os.path.join(jobpath, "_CANCELLED__"))
            logger.info('============== JOB KILLED ===============')

        finally:
            self.remove_job_from_lists(job_data)
            logger.debug('Reinitialize plugin manager...'
                         )  # Reinitialize to get live changes
            self.pmanager = ModuleManager(self.threads, self.kill_list,
                                          self.kill_list_lock, self.job_list,
                                          self.binpath, self.modulebin)

        self.metadata.update_job(uid, 'status', status)

    def remove_job_from_lists(self, job_data):
        self.job_list_lock.acquire()
        try:
            for i, job in enumerate(self.job_list):
                if job['user'] == job_data['user'] and job[
                        'job_id'] == job_data['job_id']:
                    self.job_list.pop(i)
        except:
            logger.error(
                "Unexpected error in removing executed jobs from job_list")
            raise
        finally:
            self.job_list_lock.release()

        # kill_list cleanup for cases where a kill request is enqueued right before the corresponding job gets popped
        self.kill_list_lock.acquire()
        try:
            for i, kill_request in enumerate(self.kill_list):
                if kill_request['user'] == job_data['user'] and kill_request[
                        'job_id'] == job_data['job_id']:
                    self.kill_list.pop(i)
        except:
            logger.error(
                "Unexpected error in removing executed jobs from kill_list")
            raise
        finally:
            self.kill_list_lock.release()
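
The acquire/try/finally pattern used for job_list_lock and kill_list_lock above guarantees the lock is released even if the body raises. multiprocessing locks also support the equivalent with-statement form, shown here as a small sketch:

import multiprocessing

lock = multiprocessing.Lock()
with lock:      # acquired here; released on exit even if an exception is raised
    pass        # critical section, e.g. mutating a shared job list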

    def upload(self, url, user, token, file, filetype='default'):
        files = {}
        files["file"] = (os.path.basename(file), open(file, 'rb'))
        logger.debug("Message sent to shock on upload: %s" % files)
        sclient = shock.Shock(url, user, token)
        if filetype == 'contigs' or filetype == 'scaffolds':
            res = sclient.upload_contigs(file)
        else:
            res = sclient.upload_file(file, filetype, curl=True)
        return res

    def download_shock(self, url, user, token, node_id, outdir):
        sclient = shock.Shock(url, user, token)
        downloaded = sclient.curl_download_file(node_id, outdir=outdir)
        return self.extract_file(downloaded)

    def download_url(self, url, outdir, token=None):
        downloaded = shock.curl_download_url(url, outdir=outdir, token=token)
        return self.extract_file(downloaded)

    def fetch_job(self):
        connection = pika.BlockingConnection(
            pika.ConnectionParameters(host=self.rmq_host, port=self.rmq_port))
        channel = connection.channel()
        channel.basic_qos(prefetch_count=1)
        result = channel.queue_declare(exclusive=False,
                                       auto_delete=False,
                                       durable=True)
        logger.info('Fetching job...')

        channel.basic_qos(prefetch_count=1)
        for queue in self.queues:
            print 'Using queue: {}'.format(queue)
            channel.basic_consume(self.callback, queue=queue)

        channel.start_consuming()

    def callback(self, ch, method, properties, body):
        params = json.loads(body)
        display = [
            'ARASTUSER', 'job_id', 'message', 'recipe', 'pipeline', 'wasp'
        ]
        logger.info('Incoming job: ' + ', '.join(
            ['{}: {}'.format(k, params[k]) for k in display if params[k]]))
        logger.debug(params)
        job_doc = self.metadata.get_job(params['ARASTUSER'], params['job_id'])

        ## Check if job was not killed
        if job_doc is None:
            logger.error('Error: no job_doc found for {}'.format(
                params.get('job_id')))
            return

        if job_doc.get('status') == 'Terminated by user':
            logger.warn('Job {} was killed, skipping'.format(
                params.get('job_id')))
        else:
            self.done_flag = threading.Event()
            uid = None
            try:
                uid = job_doc['_id']
                self.compute(body)
            except Exception as e:
                tb = format_exc()
                status = "[FAIL] {}".format(e)
                logger.error("{}\n{}".format(status, tb))
                self.metadata.update_job(uid, 'status', status)
        ch.basic_ack(delivery_tag=method.delivery_tag)
        self.done_flag.set()

    def start(self):
        self.fetch_job()

    def extract_file(self, filename):
        """ Decompress files if necessary """
        unp_bin = os.path.join(self.modulebin, 'unp')

        filepath = os.path.dirname(filename)
        uncompressed = ['fasta', 'fa', 'fastq', 'fq', 'fna', 'h5']
        supported = [
            'tar.gz', 'tar.bz2', 'bz2', 'gz', 'lz', 'rar', 'tar', 'tgz', 'zip'
        ]
        for ext in uncompressed:
            if filename.endswith('.' + ext):
                return filename
        for ext in supported:
            if filename.endswith('.' + ext):
                extracted_file = filename[:filename.index(ext) - 1]
                if os.path.exists(extracted_file):  # Check extracted already
                    return extracted_file
                logger.info("Extracting {}...".format(filename))
                # p = subprocess.Popen([unp_bin, filename],
                #                      cwd=filepath, stderr=subprocess.STDOUT)
                # p.wait()
                # Hide the "broken pipe" message from unp
                out = subprocess.Popen(
                    [unp_bin, filename],
                    cwd=filepath,
                    stdout=subprocess.PIPE,
                    stderr=subprocess.STDOUT).communicate()[0]
                if os.path.exists(extracted_file):
                    return extracted_file
                else:
                    logger.error("Extraction of {} failed: {}".format(
                        filename, out))
                    raise Exception('Archive structure error')
        logger.error("Could not extract {}".format(filename))
        return filename
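
extract_file derives the expected output name by slicing off the archive extension with filename.index(ext); because the name has already passed the endswith check, a negative slice such as filename[:-(len(ext) + 1)] is an equivalent form that cannot be confused by the extension string appearing earlier in the name. A small sketch of the derivation with a hypothetical path:

filename = '/data/user/reads.fastq.tar.gz'       # hypothetical path
ext = 'tar.gz'
assert filename.endswith('.' + ext)
extracted_file = filename[:-(len(ext) + 1)]      # -> '/data/user/reads.fastq'
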
Example #11
0
    def compute(self, body):
        self.job_list_lock.acquire()
        try:
            job_data = self.prepare_job_data(body)
            self.job_list.append(job_data)
        except:
            logger.error("Error in adding new job to job_list")
            raise
        finally:
            self.job_list_lock.release()

        status = ''
        logger.debug('job_data = {}'.format(job_data))

        params = json.loads(body)
        job_id = params['job_id']
        data_id = params['data_id']
        uid = params['_id']
        user = params['ARASTUSER']
        token = params['oauth_token']
        pipelines = params.get('pipeline')
        recipe = params.get('recipe')
        wasp_in = params.get('wasp')
        jobpath = os.path.join(self.datapath, user, str(data_id), str(job_id))

        url = shock.verify_shock_url(self.shockurl)

        self.start_time = time.time()

        timer_thread = UpdateTimer(self.metadata, 29, time.time(), uid,
                                   self.done_flag)
        timer_thread.start()

        #### Parse pipeline to wasp exp
        reload(recipes)
        if recipe:
            try:
                wasp_exp = recipes.get(recipe[0], job_id)
            except AttributeError:
                raise Exception('"{}" recipe not found.'.format(recipe[0]))
        elif wasp_in:
            wasp_exp = wasp_in[0]
        elif not pipelines:
            wasp_exp = recipes.get('auto', job_id)
        elif pipelines:
            ## Legacy client
            if pipelines[0] == 'auto':
                wasp_exp = recipes.get('auto', job_id)
            ##########
            else:
                if type(pipelines[0]) is not list:  # --assemblers
                    pipelines = [pipelines]
                all_pipes = []
                for p in pipelines:
                    all_pipes += self.pmanager.parse_input(p)
                logger.debug("pipelines = {}".format(all_pipes))
                wasp_exp = wasp.pipelines_to_exp(all_pipes, params['job_id'])
        else:
            raise asmtypes.ArastClientRequestError('Malformed job request.')
        logger.debug('Wasp Expression: {}'.format(wasp_exp))
        w_engine = wasp.WaspEngine(self.pmanager, job_data, self.metadata)

        ###### Run Job
        try:
            w_engine.run_expression(wasp_exp, job_data)
            ###### Upload all result files and place them into appropriate tags
            uploaded_fsets = job_data.upload_results(url, token)

            # Format report
            new_report = open('{}.tmp'.format(self.out_report_name), 'w')

            ### Log errors
            if len(job_data['errors']) > 0:
                new_report.write('PIPELINE ERRORS\n')
                for i, e in enumerate(job_data['errors']):
                    new_report.write('{}: {}\n'.format(i, e))
            try:  ## Get Quast output
                quast_report = job_data['wasp_chain'].find_module(
                    'quast')['data'].find_type('report')[0].files[0]
                with open(quast_report) as q:
                    new_report.write(q.read())
            except:
                new_report.write('No Summary File Generated!\n\n\n')
            self.out_report.close()
            with open(self.out_report_name) as old:
                new_report.write(old.read())

            for log in job_data['logfiles']:
                new_report.write('\n{1} {0} {1}\n'.format(
                    os.path.basename(log), '=' * 20))
                with open(log) as l:
                    new_report.write(l.read())

            ### Log tracebacks
            if len(job_data['tracebacks']) > 0:
                new_report.write('EXCEPTION TRACEBACKS\n')
                for i, e in enumerate(job_data['tracebacks']):
                    new_report.write('{}: {}\n'.format(i, e))

            new_report.close()
            os.remove(self.out_report_name)
            shutil.move(new_report.name, self.out_report_name)
            res = self.upload(url, user, token, self.out_report_name)
            report_info = asmtypes.FileInfo(self.out_report_name,
                                            shock_url=url,
                                            shock_id=res['data']['id'])

            self.metadata.update_job(
                uid, 'report', [asmtypes.set_factory('report', [report_info])])
            status = 'Complete with errors' if job_data.get(
                'errors') else 'Complete'

            ## Make compatible with JSON dumps()
            del job_data['out_report']
            del job_data['initial_reads']
            del job_data['raw_reads']
            self.metadata.update_job(uid, 'data', job_data)
            self.metadata.update_job(uid, 'result_data', uploaded_fsets)
            ###### Legacy Support #######
            uploaded_fsets.append(
                asmtypes.set_factory('report', [report_info]))
            contigsets = [
                fset for fset in uploaded_fsets
                if fset.type == 'contigs' or fset.type == 'scaffolds'
            ]
            download_ids = {
                fi['filename']: fi['shock_id']
                for fset in uploaded_fsets for fi in fset['file_infos']
            }
            contig_ids = {
                fi['filename']: fi['shock_id']
                for fset in contigsets for fi in fset['file_infos']
            }
            self.metadata.update_job(uid, 'result_data_legacy', [download_ids])
            self.metadata.update_job(uid, 'contig_ids', [contig_ids])
            ###################

            sys.stdout.flush()
            touch(os.path.join(jobpath, "_DONE_"))
            logger.info('============== JOB COMPLETE ===============')

        except asmtypes.ArastUserInterrupt:
            status = 'Terminated by user'
            sys.stdout.flush()
            touch(os.path.join(jobpath, "_CANCELLED__"))
            logger.info('============== JOB KILLED ===============')

        finally:
            self.remove_job_from_lists(job_data)
            logger.debug('Reinitialize plugin manager...'
                         )  # Reinitialize to get live changes
            self.pmanager = ModuleManager(self.threads, self.kill_list,
                                          self.kill_list_lock, self.job_list,
                                          self.binpath, self.modulebin)

        self.metadata.update_job(uid, 'status', status)
Example #12
0
class ArastConsumer:
    def __init__(self, shockurl, arasturl, config, threads, queue, kill_queue, job_list, ctrl_conf, datapath, binpath):
        self.parser = SafeConfigParser()
        self.parser.read(config)
        self.job_list = job_list
        # Load plugins
        self.pmanager = ModuleManager(threads, kill_queue, job_list, binpath)

    # Set up environment
        self.shockurl = shockurl
        self.arasturl = arasturl
        self.datapath = datapath
        if queue:
            self.queue = queue
            logging.info('Using queue:{}'.format(self.queue))
        else:
            self.queue = self.parser.get('rabbitmq','default_routing_key')
        self.min_free_space = float(self.parser.get('compute','min_free_space'))
        m = ctrl_conf['meta']        
        a = ctrl_conf['assembly']
        

        ###### TODO Use REST API
        self.metadata = meta.MetadataConnection(arasturl, int(a['mongo_port']), m['mongo.db'],
                                                m['mongo.collection'], m['mongo.collection.auth'], m['mongo.collection.data'] )
        self.gc_lock = multiprocessing.Lock()

    def garbage_collect(self, datapath, user, required_space):
        """ Monitor space of disk containing DATAPATH and delete files if necessary."""
        self.gc_lock.acquire()
        s = os.statvfs(datapath)
        free_space = float(s.f_bsize * s.f_bavail)
        logging.debug("Free space in bytes: %s" % free_space)
        logging.debug("Required space in bytes: %s" % required_space)
        while ((free_space - self.min_free_space) < required_space):
            #Delete old data
            dirs = os.listdir(os.path.join(datapath, user))
            times = []
            for dir in dirs:
                times.append(os.path.getmtime(os.path.join(datapath, user, dir)))
            if len(dirs) > 0:
                old_dir = os.path.join(datapath, user, dirs[times.index(min(times))])
                shutil.rmtree(old_dir, ignore_errors=True)
            else:
                logging.error("No more directories to remove")
                break
            logging.info("Space required.  %s removed." % old_dir)
            s = os.statvfs(datapath)
            free_space = float(s.f_bsize * s.f_bavail)
            logging.debug("Free space in bytes: %s" % free_space)
        self.gc_lock.release()
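
The loop above locates the least-recently-modified data directory by building a parallel list of mtimes; min() with a key function expresses the same selection more directly (sketch, assuming the same datapath/user layout):

import os

def oldest_dir(datapath, user):
    userpath = os.path.join(datapath, user)
    dirs = os.listdir(userpath)
    return os.path.join(userpath,
                        min(dirs, key=lambda d: os.path.getmtime(os.path.join(userpath, d))))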


    def get_data(self, body):
        """Get data from cache or Shock server."""
        params = json.loads(body)
        if ('assembly_data' in params or
            params.get('version') == 'widget'):
            logging.info('New Data Format')
            return self._get_data(body)

        else:
            return self._get_data_old(body)

    def _get_data(self, body):
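        # Resolve the request's data_id to a data document, normalize any
        # KBase assembly input into 'assembly_data', and stage every file
        # under <datapath>/<user>/<data_id>/raw/, downloading from Shock or a
        # direct URL only when it is not already cached on this node.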
        params = json.loads(body)
        filepath = os.path.join(self.datapath, params['ARASTUSER'],
                                str(params['data_id']))
        datapath = filepath
        filepath += "/raw/"
        all_files = []
        user = params['ARASTUSER']
        token = params['oauth_token']
        uid = params['_id']

        ##### Get data from ID #####
        data_doc = self.metadata.get_data_docs(params['ARASTUSER'], params['data_id'])
        if not data_doc:
            raise Exception('Invalid Data ID: {}'.format(params['data_id']))

        if 'kbase_assembly_input' in data_doc:
            params['assembly_data'] = kb_to_asm(data_doc['kbase_assembly_input'])
        elif 'assembly_data' in data_doc:
            params['assembly_data'] = data_doc['assembly_data']

        ##### Get data from assembly_data #####
        self.metadata.update_job(uid, 'status', 'Data transfer')
        try:
            os.makedirs(filepath)
        except:
            pass

        ### TODO Garbage collect ###
        download_url = 'http://{}'.format(self.shockurl)
        file_sets = params['assembly_data']['file_sets']
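        # Normalize *_url file set types to their plain equivalents and fetch
        # each file, skipping downloads for files already present locally.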
        for file_set in file_sets:
            if file_set['type'] == 'paired_url':
                file_set['type'] = 'paired'
            elif file_set['type'] == 'single_url':
                file_set['type'] = 'single'
            elif file_set['type'] == 'reference_url':
                file_set['type'] = 'reference'
            file_set['files'] = [] #legacy
            for file_info in file_set['file_infos']:
                #### File is stored on Shock
                if file_info['filename']:
                    local_file = os.path.join(filepath, file_info['filename'])
                    if os.path.exists(local_file):
                        logging.info("Requested data exists on node: {}".format(local_file))
                    else:
                        local_file = self.download_shock(download_url, user, token, 
                                                   file_info['shock_id'], filepath)
                elif file_info['direct_url']:
                    local_file = os.path.join(filepath, os.path.basename(file_info['direct_url']))
                    if os.path.exists(local_file):
                        logging.info("Requested data exists on node: {}".format(local_file))
                    else:
                        local_file = self.download_url(file_info['direct_url'], filepath)
                file_info['local_file'] = local_file
                file_set['files'].append(local_file) #legacy
            all_files.append(file_set)
        return datapath, all_files                    

    def compute(self, body):
        error = False
        params = json.loads(body)
        job_id = params['job_id']
        uid = params['_id']
        user = params['ARASTUSER']
        token = params['oauth_token']
        pipelines = params['pipeline']
        recipe = None
        wasp_in = None
        try:  ## In case legacy
            recipe = params['recipe']
            wasp_in = params['wasp']
        except:
            pass

        #support legacy arast client
        if len(pipelines) > 0:
            if type(pipelines[0]) is not list:
                pipelines = [pipelines]
                
        ### Download files (if necessary)
        datapath, all_files = self.get_data(body)
        rawpath = datapath + '/raw/'
        jobpath = os.path.join(datapath, str(job_id))
        try:
            os.makedirs(jobpath)
        except Exception as e:
            print e
            raise Exception ('Data Error')

        ### Create job log
        self.out_report_name = '{}/{}_report.txt'.format(jobpath, str(job_id))
        self.out_report = open(self.out_report_name, 'w')

        ### Create data to pass to pipeline
        reads = []
        reference = []
        for fileset in all_files:
            if len(fileset['files']) != 0:
                if (fileset['type'] == 'single' or 
                    fileset['type'] == 'paired'):
                    reads.append(fileset)
                elif fileset['type'] == 'reference':
                    reference.append(fileset)
                else:
                    raise Exception('fileset error')

        job_data = ArastJob({'job_id' : params['job_id'], 
                    'uid' : params['_id'],
                    'user' : params['ARASTUSER'],
                    'reads': reads,
                    'logfiles': [],
                    'reference': reference,
                    'initial_reads': list(reads),
                    'raw_reads': copy.deepcopy(reads),
                    'params': [],
                    'exceptions': [],
                    'pipeline_data': {},
                    'datapath': datapath,
                    'out_report' : self.out_report})
                    
        self.out_report.write("Arast Pipeline: Job {}\n".format(job_id))
        self.job_list.append(job_data)
        self.start_time = time.time()

        timer_thread = UpdateTimer(self.metadata, 29, time.time(), uid, self.done_flag)
        timer_thread.start()
        
        url = "http://%s" % (self.shockurl)
        status = ''

        #### Parse pipeline to wasp exp
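        # Precedence for building the Wasp expression: a named recipe wins,
        # then a raw Wasp expression from the client, then the 'auto' recipe,
        # and finally the explicit pipeline list is converted with
        # wasp.pipelines_to_exp().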
        wasp_exp = pipelines[0][0]
        reload(recipes)
        if recipe:
            try:
                wasp_exp = recipes.get(recipe[0])
            except AttributeError:
                raise Exception('"{}" recipe not found.'.format(recipe[0]))
        elif wasp_in:
            wasp_exp = wasp_in[0]
        elif pipelines[0] == 'auto':
            wasp_exp = recipes.get('auto')
        else:
            all_pipes = []
            for p in pipelines:
                all_pipes += self.pmanager.parse_input(p)
            print all_pipes
            wasp_exp = wasp.pipelines_to_exp(all_pipes, params['job_id'])
            logging.info('Wasp Expression: {}'.format(wasp_exp))
        print('Wasp Expression: {}'.format(wasp_exp))
        w_engine = wasp.WaspEngine(self.pmanager, job_data, self.metadata)
        w_engine.run_expression(wasp_exp, job_data)

        ###### Upload all result files and place them into appropriate tags
        uploaded_fsets = job_data.upload_results(url, token)
        
        for i, job in enumerate(self.job_list):
            if job['user'] == job_data['user'] and job['job_id'] == job_data['job_id']:
                self.job_list.pop(i)


        # Format report
        new_report = open('{}.tmp'.format(self.out_report_name), 'w')

        ### Log exceptions
        if len(job_data['exceptions']) > 0:
            new_report.write('PIPELINE ERRORS\n')
            for i,e in enumerate(job_data['exceptions']):
                new_report.write('{}: {}\n'.format(i, e))
        try: ## Get Quast output
            quast_report = job_data['wasp_chain'].find_module('quast')['data'].find_type('report')[0].files[0]
            with open(quast_report) as q:
                new_report.write(q.read())
        except:
            new_report.write('No Summary File Generated!\n\n\n')
        self.out_report.close()
        with open(self.out_report_name) as old:
            new_report.write(old.read())

        for log in job_data['logfiles']:
            new_report.write('\n{1} {0} {1}\n'.format(os.path.basename(log), '='*20))
            with open(log) as l:
                new_report.write(l.read())
        new_report.close()
        os.remove(self.out_report_name)
        shutil.move(new_report.name, self.out_report_name)
        res = self.upload(url, user, token, self.out_report_name)
        report_info = asmtypes.FileInfo(self.out_report_name, shock_url=url, shock_id=res['data']['id'])

        self.metadata.update_job(uid, 'report', [asmtypes.set_factory('report', [report_info])])
        status = 'Complete with errors' if job_data['exceptions'] else 'Complete'

        ## Make compatible with JSON dumps()
        del job_data['out_report']
        del job_data['initial_reads']
        del job_data['raw_reads']
        self.metadata.update_job(uid, 'data', job_data)
        self.metadata.update_job(uid, 'result_data', uploaded_fsets)
        self.metadata.update_job(uid, 'status', status)

        ###### Legacy Support #######
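        # Older arast clients expect flat {filename: shock_id} maps rather than
        # file sets, so mirror the uploads into the legacy job fields as well.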
        uploaded_fsets.append(asmtypes.set_factory('report', [report_info]))  # list.append() mutates in place and returns None
        contigsets = [fset for fset in uploaded_fsets if fset.type == 'contigs' or fset.type == 'scaffolds']
        download_ids = {fi['filename']: fi['shock_id'] for fset in uploaded_fsets for fi in fset['file_infos']}
        contig_ids = {fi['filename']: fi['shock_id'] for fset in contigsets for fi in fset['file_infos']}
        self.metadata.update_job(uid, 'result_data_legacy', [download_ids])
        self.metadata.update_job(uid, 'contig_ids', [contig_ids])
        ###################

        print '============== JOB COMPLETE ==============='

    def upload(self, url, user, token, file, filetype='default'):
        files = {}
        files["file"] = (os.path.basename(file), open(file, 'rb'))
        logging.debug("Message sent to shock on upload: %s" % files)
        sclient = shock.Shock(url, user, token)
        if filetype == 'contigs' or filetype == 'scaffolds':
            res = sclient.upload_contigs(file)
        else:
            res = sclient.upload_misc(file, filetype)
        return res

    def download_shock(self, url, user, token, node_id, outdir):
        sclient = shock.Shock(url, user, token)
        downloaded = sclient.curl_download_file(node_id, outdir=outdir)
        return extract_file(downloaded)

    def download_url(self, url, outdir):
        downloaded = asm.curl_download_url(url, outdir=outdir)
        return extract_file(downloaded)

    def fetch_job(self):
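        # Declare a durable work queue on the AMQP broker, limit prefetch to
        # one unacknowledged message per worker, and block consuming jobs with
        # self.callback.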
        connection = pika.BlockingConnection(pika.ConnectionParameters(
                host = self.arasturl))
        channel = connection.channel()
        channel.basic_qos(prefetch_count=1)
        result = channel.queue_declare(queue=self.queue,
                                       exclusive=False,
                                       auto_delete=False,
                                       durable=True)
        logging.basicConfig(format="%(asctime)s {} %(levelname)-8s %(message)s".format(proc().name))
        print proc().name, ' [*] Fetching job...'

        channel.basic_qos(prefetch_count=1)
        channel.basic_consume(self.callback,
                              queue=self.queue)

        channel.start_consuming()

    def callback(self, ch, method, properties, body):
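        # Look up the job record; if it was already terminated, skip it,
        # otherwise run the full compute() pipeline. The delivery is always
        # acknowledged so the message is not redelivered on failure.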
        params = json.loads(body)
        display = ['ARASTUSER', 'job_id', 'message']
        print ' [+] Incoming:', ', '.join(['{}: {}'.format(k, params[k]) for k in display])
        logging.info(params)
        job_doc = self.metadata.get_job(params['ARASTUSER'], params['job_id'])
        uid = job_doc['_id']
        ## Check if job was not killed
        if job_doc['status'] == 'Terminated':
            print 'Job {} was killed, skipping'.format(params['job_id'])
        else:
            self.done_flag = threading.Event()
            try:
                self.compute(body)
            except Exception as e:
                tb = format_exc()
                status = "[FAIL] {}".format(e)
                print e
                logging.error(tb)
                self.metadata.update_job(uid, 'status', status)
        ch.basic_ack(delivery_tag=method.delivery_tag)
        self.done_flag.set()

    def start(self):
        self.fetch_job()

###### Legacy Support ######

    def _get_data_old(self, body):
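        # Legacy data path: look up the data document by data_id and either
        # reuse files already staged on this node or download each referenced
        # Shock node, resolving Shock handle files to the real data when needed.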
        params = json.loads(body)
        #filepath = self.datapath + str(params['data_id'])
        filepath = os.path.join(self.datapath, params['ARASTUSER'],
                                str(params['data_id']))
        datapath = filepath
        filepath += "/raw/"
        all_files = []

        uid = params['_id']
        job_id = params['job_id']
        user = params['ARASTUSER']

        data_doc = self.metadata.get_doc_by_data_id(params['data_id'], params['ARASTUSER'])
        if data_doc:
            paired = data_doc['pair']
            single = data_doc['single']
            files = data_doc['filename']
            ids = data_doc['ids']
            token = params['oauth_token']
            try:
                ref = data_doc['reference']
            except:
                pass
        else:
            self.metadata.update_job(uid, 'status', 'Invalid Data ID')
            raise Exception('Data {} does not exist on Shock Server'.format(
                    params['data_id']))

        all_files = []
        if os.path.isdir(filepath):
            logging.info("Requested data exists on node")
            try:
                for l in paired:
                    filedict = {'type':'paired', 'files':[]}
                    for word in l:
                        if is_filename(word):
                            baseword = os.path.basename(word)
                            filedict['files'].append(
                                extract_file(os.path.join(filepath,  baseword)))
                        else:
                            kv = word.split('=')
                            filedict[kv[0]] = kv[1]
                    all_files.append(filedict)
            except:
                logging.info('No paired files submitted')

            try:
                for seqfiles in single:
                    for wordpath in seqfiles:
                        filedict = {'type':'single', 'files':[]}    
                        if is_filename(wordpath):
                            baseword = os.path.basename(wordpath)
                            filedict['files'].append(
                                extract_file(os.path.join(filepath, baseword)))
                        else:
                            kv = wordpath.split('=')
                            filedict[kv[0]] = kv[1]
                        all_files.append(filedict)
            except:
                logging.info(format_tb(sys.exc_info()[2]))
                logging.info('No single files submitted!')
            
            try:
                for r in ref:
                    for wordpath in r:
                        filedict = {'type':'reference', 'files':[]}    
                        if is_filename(wordpath):
                            baseword = os.path.basename(wordpath)
                            filedict['files'].append(
                                extract_file(os.path.join(filepath, baseword)))
                        else:
                            kv = wordpath.split('=')
                            filedict[kv[0]] = kv[1]
                        all_files.append(filedict)
            except:
                logging.info(format_tb(sys.exc_info()[2]))
                logging.info('No reference files submitted!')
            
    
            touch(datapath)

        ## Data does not exist on current compute node
        else:
            self.metadata.update_job(uid, 'status', 'Data transfer')
            os.makedirs(filepath)

            # Get required space and garbage collect
            try:
                req_space = 0
                for file_size in data_doc['file_sizes']:
                    req_space += file_size
                self.garbage_collect(self.datapath, user, req_space)
            except:
                pass 
            url = "http://%s" % (self.shockurl)

            try:
                for l in paired:
                    #FILEDICT contains a single read library's info
                    filedict = {'type':'paired', 'files':[]}
                    for word in l:
                        if is_filename(word):
                            baseword = os.path.basename(word)
                            dl = self.download_shock(url, user, token, 
                                               ids[files.index(baseword)], filepath)
                            if shock.parse_handle(dl): #Shock handle, get real data
                                logging.info('Found shock handle, getting real data...')
                                s_addr, s_id = shock.parse_handle(dl)
                                s_url = 'http://{}'.format(s_addr)
                                real_file = self.download_shock(s_url, user, token, 
                                                          s_id, filepath)
                                filedict['files'].append(real_file)
                            else:
                                filedict['files'].append(dl)
                        elif re.search('=', word):
                            kv = word.split('=')
                            filedict[kv[0]] = kv[1]
                    all_files.append(filedict)
            except:
                logging.info(format_exc(sys.exc_info()))
                logging.info('No paired files submitted')

            try:
                for seqfiles in single:
                    for wordpath in seqfiles:
                        filedict = {'type':'single', 'files':[]}
                        # Parse user directories
                        try:
                            path, word = wordpath.rsplit('/', 1)
                            path += '/'
                        except:
                            word = wordpath
                            path = ''

                        if is_filename(word):
                            baseword = os.path.basename(word)
                            dl = self.download_shock(url, user, token, 
                                               ids[files.index(baseword)], filepath)
                            if shock.parse_handle(dl): #Shock handle, get real data
                                logging.info('Found shock handle, getting real data...')
                                s_addr, s_id = shock.parse_handle(dl)
                                s_url = 'http://{}'.format(s_addr)
                                real_file = self.download_shock(s_url, user, token, 
                                                          s_id, filepath)
                                filedict['files'].append(real_file)
                            else:
                                filedict['files'].append(dl)
                        elif re.search('=', word):
                            kv = word.split('=')
                            filedict[kv[0]] = kv[1]
                        all_files.append(filedict)
            except:
                logging.info(format_exc(sys.exc_info()))
                logging.info('No single end files submitted')

            try:
                for r in ref:
                    for wordpath in r:
                        filedict = {'type':'reference', 'files':[]}
                        # Parse user directories
                        try:
                            path, word = wordpath.rsplit('/', 1)
                            path += '/'
                        except:
                            word = wordpath
                            path = ''

                        if is_filename(word):
                            baseword = os.path.basename(word)
                            dl = self.download_shock(url, user, token, 
                                               ids[files.index(baseword)], filepath)
                            if shock.parse_handle(dl): #Shock handle, get real data
                                logging.info('Found shock handle, getting real data...')
                                s_addr, s_id = shock.parse_handle(dl)
                                s_url = 'http://{}'.format(s_addr)
                                real_file = self.download_shock(s_url, user, token, 
                                                          s_id, filepath)
                                filedict['files'].append(real_file)
                            else:
                                filedict['files'].append(dl)
                        elif re.search('=', word):
                            kv = word.split('=')
                            filedict[kv[0]] = kv[1]
                        all_files.append(filedict)
            except:
                #logging.info(format_exc(sys.exc_info()))
                logging.info('No single end files submitted')

        return datapath, all_files
Example #13
0
class ArastConsumer:
    def __init__(self, shockurl, arasturl, config, threads, queue, kill_queue,
                 job_list, ctrl_conf):
        self.parser = SafeConfigParser()
        self.parser.read(config)
        self.job_list = job_list
        # Load plugins
        self.pmanager = ModuleManager(threads, kill_queue, job_list)

        # Set up environment
        self.shockurl = shockurl
        self.arasturl = arasturl
        self.datapath = self.parser.get('compute', 'datapath')
        if queue:
            self.queue = queue
            print('Using queue:{}'.format(self.queue))
        else:
            self.queue = self.parser.get('rabbitmq', 'default_routing_key')
        self.min_free_space = float(
            self.parser.get('compute', 'min_free_space'))
        m = ctrl_conf['meta']
        a = ctrl_conf['assembly']

        self.metadata = meta.MetadataConnection(arasturl, int(a['mongo_port']),
                                                m['mongo.db'],
                                                m['mongo.collection'],
                                                m['mongo.collection.auth'])
        self.gc_lock = multiprocessing.Lock()

    def garbage_collect(self, datapath, user, required_space):
        """ Monitor space of disk containing DATAPATH and delete files if necessary."""
        self.gc_lock.acquire()
        s = os.statvfs(datapath)
        free_space = float(s.f_bsize * s.f_bavail)
        logging.debug("Free space in bytes: %s" % free_space)
        logging.debug("Required space in bytes: %s" % required_space)
        while ((free_space - self.min_free_space) < required_space):
            #Delete old data
            dirs = os.listdir(os.path.join(datapath, user))
            times = []
            for dir in dirs:
                times.append(
                    os.path.getmtime(os.path.join(datapath, user, dir)))
            if len(dirs) > 0:
                old_dir = os.path.join(datapath, user,
                                       dirs[times.index(min(times))])
                shutil.rmtree(old_dir, ignore_errors=True)
            else:
                logging.error("No more directories to remove")
                break
            logging.info("Space required.  %s removed." % old_dir)
            s = os.statvfs(datapath)
            free_space = float(s.f_bsize * s.f_bavail)
            logging.debug("Free space in bytes: %s" % free_space)
        self.gc_lock.release()

    def get_data(self, body):
        """Get data from cache or Shock server."""
        params = json.loads(body)
        if 'assembly_data' in params:
            logging.info('New Data Format')
            return self._get_data(body)
        else:
            return self._get_data_old(body)

    def _get_data(self, body):
        params = json.loads(body)
        filepath = os.path.join(self.datapath, params['ARASTUSER'],
                                str(params['data_id']))
        datapath = filepath
        filepath += "/raw/"
        all_files = []
        user = params['ARASTUSER']
        token = params['oauth_token']
        uid = params['_id']

        ##### Get data from ID #####
        data_doc = self.metadata.get_doc_by_data_id(params['data_id'],
                                                    params['ARASTUSER'])
        if not data_doc:
            raise Exception('Invalid Data ID: {}'.format(params['data_id']))

        if 'kbase_assembly_input' in data_doc:
            params['assembly_data'] = kb_to_asm(
                data_doc['kbase_assembly_input'])
        elif 'assembly_data' in data_doc:
            params['assembly_data'] = data_doc['assembly_data']

        ##### Get data from assembly_data #####
        self.metadata.update_job(uid, 'status', 'Data transfer')
        try:
            os.makedirs(filepath)
        except:
            pass

        ### TODO Garbage collect ###
        download_url = 'http://{}'.format(self.shockurl)
        file_sets = params['assembly_data']['file_sets']
        for file_set in file_sets:
            file_set['files'] = []  #legacy
            for file_info in file_set['file_infos']:
                local_file = os.path.join(filepath, file_info['filename'])
                if os.path.exists(local_file):
                    logging.info(
                        "Requested data exists on node: {}".format(local_file))
                else:
                    local_file = self.download(download_url, user, token,
                                               file_info['shock_id'], filepath)
                file_info['local_file'] = local_file
                file_set['files'].append(local_file)  #legacy
            all_files.append(file_set)
        return datapath, all_files

    def _get_data_old(self, body):
        params = json.loads(body)
        #filepath = self.datapath + str(params['data_id'])
        filepath = os.path.join(self.datapath, params['ARASTUSER'],
                                str(params['data_id']))
        datapath = filepath
        filepath += "/raw/"
        all_files = []

        uid = params['_id']
        job_id = params['job_id']
        user = params['ARASTUSER']

        data_doc = self.metadata.get_doc_by_data_id(params['data_id'],
                                                    params['ARASTUSER'])
        if data_doc:
            paired = data_doc['pair']
            single = data_doc['single']
            files = data_doc['filename']
            ids = data_doc['ids']
            token = params['oauth_token']
            try:
                ref = data_doc['reference']
            except:
                pass
        else:
            self.metadata.update_job(uid, 'status', 'Invalid Data ID')
            raise Exception('Data {} does not exist on Shock Server'.format(
                params['data_id']))

        all_files = []
        if os.path.isdir(filepath):
            logging.info("Requested data exists on node")
            try:
                for l in paired:
                    filedict = {'type': 'paired', 'files': []}
                    for word in l:
                        if is_filename(word):
                            baseword = os.path.basename(word)
                            filedict['files'].append(
                                extract_file(os.path.join(filepath, baseword)))
                        else:
                            kv = word.split('=')
                            filedict[kv[0]] = kv[1]
                    all_files.append(filedict)
            except:
                logging.info('No paired files submitted')

            try:
                for seqfiles in single:
                    for wordpath in seqfiles:
                        filedict = {'type': 'single', 'files': []}
                        if is_filename(wordpath):
                            baseword = os.path.basename(wordpath)
                            filedict['files'].append(
                                extract_file(os.path.join(filepath, baseword)))
                        else:
                            kv = wordpath.split('=')
                            filedict[kv[0]] = kv[1]
                        all_files.append(filedict)
            except:
                logging.info(format_tb(sys.exc_info()[2]))
                logging.info('No single files submitted!')

            try:
                for r in ref:
                    for wordpath in r:
                        filedict = {'type': 'reference', 'files': []}
                        if is_filename(wordpath):
                            baseword = os.path.basename(wordpath)
                            filedict['files'].append(
                                extract_file(os.path.join(filepath, baseword)))
                        else:
                            kv = wordpath.split('=')
                            filedict[kv[0]] = kv[1]
                        all_files.append(filedict)
            except:
                logging.info(format_tb(sys.exc_info()[2]))
                logging.info('No reference files submitted!')

            touch(datapath)

        ## Data does not exist on current compute node
        else:
            self.metadata.update_job(uid, 'status', 'Data transfer')
            os.makedirs(filepath)

            # Get required space and garbage collect
            try:
                req_space = 0
                for file_size in data_doc['file_sizes']:
                    req_space += file_size
                self.garbage_collect(self.datapath, user, req_space)
            except:
                pass
            url = "http://%s" % (self.shockurl)

            try:
                for l in paired:
                    #FILEDICT contains a single read library's info
                    filedict = {'type': 'paired', 'files': []}
                    for word in l:
                        if is_filename(word):
                            baseword = os.path.basename(word)
                            dl = self.download(url, user, token,
                                               ids[files.index(baseword)],
                                               filepath)
                            if shock.parse_handle(
                                    dl):  #Shock handle, get real data
                                logging.info(
                                    'Found shock handle, getting real data...')
                                s_addr, s_id = shock.parse_handle(dl)
                                s_url = 'http://{}'.format(s_addr)
                                real_file = self.download(
                                    s_url, user, token, s_id, filepath)
                                filedict['files'].append(real_file)
                            else:
                                filedict['files'].append(dl)
                        elif re.search('=', word):
                            kv = word.split('=')
                            filedict[kv[0]] = kv[1]
                    all_files.append(filedict)
            except:
                logging.info(format_exc(sys.exc_info()))
                logging.info('No paired files submitted')

            try:
                for seqfiles in single:
                    for wordpath in seqfiles:
                        filedict = {'type': 'single', 'files': []}
                        # Parse user directories
                        try:
                            path, word = wordpath.rsplit('/', 1)
                            path += '/'
                        except:
                            word = wordpath
                            path = ''

                        if is_filename(word):
                            baseword = os.path.basename(word)
                            dl = self.download(url, user, token,
                                               ids[files.index(baseword)],
                                               filepath)
                            if shock.parse_handle(
                                    dl):  #Shock handle, get real data
                                logging.info(
                                    'Found shock handle, getting real data...')
                                s_addr, s_id = shock.parse_handle(dl)
                                s_url = 'http://{}'.format(s_addr)
                                real_file = self.download(
                                    s_url, user, token, s_id, filepath)
                                filedict['files'].append(real_file)
                            else:
                                filedict['files'].append(dl)
                        elif re.search('=', word):
                            kv = word.split('=')
                            filedict[kv[0]] = kv[1]
                        all_files.append(filedict)
            except:
                logging.info(format_exc(sys.exc_info()))
                logging.info('No single end files submitted')

            try:
                for r in ref:
                    for wordpath in r:
                        filedict = {'type': 'reference', 'files': []}
                        # Parse user directories
                        try:
                            path, word = wordpath.rsplit('/', 1)
                            path += '/'
                        except:
                            word = wordpath
                            path = ''

                        if is_filename(word):
                            baseword = os.path.basename(word)
                            dl = self.download(url, user, token,
                                               ids[files.index(baseword)],
                                               filepath)
                            if shock.parse_handle(
                                    dl):  #Shock handle, get real data
                                logging.info(
                                    'Found shock handle, getting real data...')
                                s_addr, s_id = shock.parse_handle(dl)
                                s_url = 'http://{}'.format(s_addr)
                                real_file = self.download(
                                    s_url, user, token, s_id, filepath)
                                filedict['files'].append(real_file)
                            else:
                                filedict['files'].append(dl)
                        elif re.search('=', word):
                            kv = word.split('=')
                            filedict[kv[0]] = kv[1]
                        all_files.append(filedict)
            except:
                #logging.info(format_exc(sys.exc_info()))
                logging.info('No single end files submitted')

        print all_files
        return datapath, all_files

    def compute(self, body):
        error = False
        params = json.loads(body)
        job_id = params['job_id']
        uid = params['_id']
        user = params['ARASTUSER']
        token = params['oauth_token']
        pipelines = params['pipeline']

        #support legacy arast client
        if len(pipelines) > 0:
            if type(pipelines[0]) is not list:
                pipelines = [pipelines]

        ### Download files (if necessary)
        datapath, all_files = self.get_data(body)
        rawpath = datapath + '/raw/'
        jobpath = os.path.join(datapath, str(job_id))
        try:
            os.makedirs(jobpath)
        except:
            raise Exception('Data Error')

        ### Create job log
        self.out_report_name = '{}/{}_report.txt'.format(jobpath, str(job_id))
        self.out_report = open(self.out_report_name, 'w')

        ### Create data to pass to pipeline
        reads = []
        reference = []
        for fileset in all_files:
            if len(fileset['files']) != 0:
                if (fileset['type'] == 'single'
                        or fileset['type'] == 'paired'):
                    reads.append(fileset)
                elif fileset['type'] == 'reference':
                    reference.append(fileset)
                else:
                    raise Exception('fileset error')

        job_data = ArastJob({
            'job_id': params['job_id'],
            'uid': params['_id'],
            'user': params['ARASTUSER'],
            'reads': reads,
            'reference': reference,
            'initial_reads': list(reads),
            'raw_reads': copy.deepcopy(reads),
            'processed_reads': list(reads),
            'pipeline_data': {},
            'datapath': datapath,
            'out_report': self.out_report,
            'logfiles': []
        })

        self.out_report.write("Arast Pipeline: Job {}\n".format(job_id))
        self.job_list.append(job_data)
        self.start_time = time.time()
        self.done_flag = threading.Event()
        timer_thread = UpdateTimer(self.metadata, 29, time.time(), uid,
                                   self.done_flag)
        timer_thread.start()

        download_ids = {}
        contig_ids = {}

        url = "http://%s" % (self.shockurl)
        #        url += '/node'
        try:
            include_all_data = params['all_data']
        except:
            include_all_data = False
        contigs = not include_all_data
        status = ''

        ## TODO CHANGE: default pipeline
        default_pipe = ['velvet']
        exceptions = []

        if pipelines:
            try:
                if pipelines == ['auto']:
                    pipelines = [
                        default_pipe,
                    ]
                for p in pipelines:
                    self.pmanager.validate_pipe(p)

                result_files, summary, contig_files, exceptions = self.run_pipeline(
                    pipelines, job_data, contigs_only=contigs)
                for i, f in enumerate(result_files):
                    #fname = os.path.basename(f).split('.')[0]
                    fname = str(i)
                    res = self.upload(url, user, token, f)
                    download_ids[fname] = res['data']['id']

                for c in contig_files:
                    fname = os.path.basename(c).split('.')[0]
                    res = self.upload(url, user, token, c, filetype='contigs')
                    contig_ids[fname] = res['data']['id']

                # Check if job completed with no errors
                if exceptions:
                    status = 'Complete with errors'
                elif not summary:
                    status = 'Complete: No valid contigs'
                else:
                    status += "Complete"
                self.out_report.write("Pipeline completed successfully\n")
            except:
                traceback = format_exc(sys.exc_info())
                status = "[FAIL] {}".format(sys.exc_info()[1])
                print traceback
                self.out_report.write("ERROR TRACE:\n{}\n".format(
                    format_tb(sys.exc_info()[2])))

        # Format report
        for i, job in enumerate(self.job_list):
            if job['user'] == job_data['user'] and job['job_id'] == job_data[
                    'job_id']:
                self.job_list.pop(i)
        self.done_flag.set()
        new_report = open('{}.tmp'.format(self.out_report_name), 'w')

        ### Log exceptions
        if len(exceptions) > 0:
            new_report.write('PIPELINE ERRORS\n')
            for i, e in enumerate(exceptions):
                new_report.write('{}: {}\n'.format(i, e))
        try:
            for sum in summary:
                with open(sum) as s:
                    new_report.write(s.read())
        except:
            new_report.write('No Summary File Generated!\n\n\n')
        self.out_report.close()
        with open(self.out_report_name) as old:
            new_report.write(old.read())
        new_report.close()
        os.remove(self.out_report_name)
        shutil.move(new_report.name, self.out_report_name)
        res = self.upload(url, user, token, self.out_report_name)
        download_ids['report'] = res['data']['id']

        # Get location
        self.metadata.update_job(uid, 'result_data', download_ids)
        self.metadata.update_job(uid, 'contig_ids', contig_ids)
        self.metadata.update_job(uid, 'status', status)

        print '=========== JOB COMPLETE ============'

    def update_time_record(self):
        elapsed_time = time.time() - self.start_time
        ftime = str(datetime.timedelta(seconds=int(elapsed_time)))
        self.metadata.update_job(uid, 'computation_time', ftime)

    def run_pipeline(self, pipes, job_data, contigs_only=True):
        """
        Runs all pipelines in list PIPES
        """
        all_pipes = []
        for p in pipes:
            all_pipes += self.pmanager.parse_input(p)
        logging.info('{} pipelines:'.format(len(all_pipes)))
        for p in all_pipes:
            print '->'.join(p)
        #include_reads = self.pmanager.output_type(pipeline[-1]) == 'reads'
        include_reads = False
        pipeline_num = 1
        all_files = []
        pipe_outputs = []
        logfiles = []
        ale_reports = {}
        final_contigs = []
        final_scaffolds = []
        output_types = []
        exceptions = []
        num_pipes = len(all_pipes)
        for pipe in all_pipes:
            try:
                #job_data = copy.deepcopy(job_data_global)
                #job_data['out_report'] = job_data_global['out_report']
                pipeline, overrides = self.pmanager.parse_pipe(pipe)
                job_data.add_pipeline(pipeline_num, pipeline)
                num_stages = len(pipeline)
                pipeline_stage = 1
                pipeline_results = []
                cur_outputs = []

                # Reset job data
                job_data['reads'] = copy.deepcopy(job_data['raw_reads'])
                job_data['processed_reads'] = []
                print job_data

                self.out_report.write('\n{0} Pipeline {1}: {2} {0}\n'.format(
                    '=' * 15, pipeline_num, pipe))
                pipe_suffix = ''  # filename code for indiv pipes
                pipe_start_time = time.time()
                pipe_alive = True

                # Store data record for pipeline

                for module_name in pipeline:
                    if not pipe_alive:
                        self.out_report.write(
                            '\n{0} Module Failure, Killing Pipe {0}'.format(
                                'X' * 10))
                        break
                    module_code = ''  # unique code for data reuse
                    print '\n\n{0} Running module: {1} {2}'.format(
                        '=' * 20, module_name, '=' * (35 - len(module_name)))
                    self.garbage_collect(self.datapath, job_data['user'],
                                         2147483648)  # 2GB

                    ## PROGRESS CALCULATION
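                    # Overall progress = fraction of pipelines already finished
                    # plus the current pipeline's stage fraction scaled by its
                    # 1/num_pipes share of the whole job.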
                    pipes_complete = (pipeline_num - 1) / float(num_pipes)
                    stage_complete = (pipeline_stage - 1) / float(num_stages)
                    pct_segment = 1.0 / num_pipes
                    stage_complete *= pct_segment
                    total_complete = pipes_complete + stage_complete
                    cur_state = 'Running:[{}%|P:{}/{}|S:{}/{}|{}]'.format(
                        int(total_complete * 100), pipeline_num, num_pipes,
                        pipeline_stage, num_stages, module_name)
                    self.metadata.update_job(job_data['uid'], 'status',
                                             cur_state)

                    ## LOG REPORT For now, module code is 1st and last letter
                    short_name = self.pmanager.get_short_name(module_name)
                    if short_name:
                        #pipe_suffix += short_name.capitalize()
                        module_code += short_name.capitalize()
                    else:
                        #pipe_suffix += module_name[0].upper() + module_name[-1]
                        module_code += module_name[0].upper() + module_name[-1]
                    mod_overrides = overrides[pipeline_stage - 1]
                    for k in mod_overrides.keys():
                        #pipe_suffix += '_{}{}'.format(k[0], par[k])
                        module_code += '_{}{}'.format(k[0], mod_overrides[k])
                    pipe_suffix += module_code
                    self.out_report.write(
                        'PIPELINE {} -- STAGE {}: {}\n'.format(
                            pipeline_num, pipeline_stage, module_name))
                    logging.debug('New job_data for stage {}: {}'.format(
                        pipeline_stage, job_data))
                    job_data['params'] = overrides[pipeline_stage - 1].items()
                    module_start_time = time.time()
                    ## RUN MODULE
                    # Check if output data exists
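                    # Data reuse: earlier pipelines are keyed by their per-stage
                    # module codes; if the current prefix of module codes matches
                    # a finished pipeline at this stage, its output is reused
                    # instead of rerunning the module.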
                    reuse_data = False
                    enable_reuse = True  # KILL SWITCH
                    if enable_reuse:
                        for k, pipe in enumerate(pipe_outputs):
                            if reuse_data:
                                break
                            if not pipe:
                                continue
                            # Check that all previous pipes match
                            for i in range(pipeline_stage):
                                try:
                                    if not pipe[i][0] == cur_outputs[i][0]:
                                        break
                                except:
                                    pass
                                try:
                                    if (pipe[i][0] == module_code
                                            and i == pipeline_stage - 1):
                                        #and overrides[i].items() == job_data['params']): #copy!
                                        print(
                                            'Found previously computed data, reusing {}.'
                                            .format(module_code))
                                        output = [] + pipe[i][1]
                                        pfix = (k + 1, i + 1)
                                        alldata = [] + pipe[i][2]
                                        reuse_data = True
                                        # Reuse the elapsed time recorded for the
                                        # previously computed module.
                                        job_data.get_pipeline(pipeline_num).get_module(
                                            pipeline_stage)['elapsed_time'] = (
                                                job_data.get_pipeline(k + 1).get_module(
                                                    i + 1)['elapsed_time'])

                                        break
                                except:  # Previous pipes may be shorter
                                    pass

                    output_type = self.pmanager.output_type(module_name)

                    if not reuse_data:
                        output, alldata, mod_log = self.pmanager.run_module(
                            module_name,
                            job_data,
                            all_data=True,
                            reads=include_reads)

                        ##### Module produced no output, attach log and proceed to next #####
                        if not output:
                            pipe_alive = False
                            try:
                                print mod_log
                                logfiles.append(mod_log)
                            except:
                                print 'error attaching ', mod_log
                            break

                        ##### Prefix outfiles with pipe stage (only assembler modules) #####
                        alldata = [
                            asm.prefix_file_move(
                                file,
                                "P{}_S{}_{}".format(pipeline_num,
                                                    pipeline_stage,
                                                    module_name))
                            for file in alldata
                        ]
                        module_elapsed_time = time.time() - module_start_time
                        job_data.get_pipeline(pipeline_num).get_module(
                            pipeline_stage
                        )['elapsed_time'] = module_elapsed_time

                        if alldata:  #If log was renamed
                            mod_log = asm.prefix_file(
                                mod_log,
                                "P{}_S{}_{}".format(pipeline_num,
                                                    pipeline_stage,
                                                    module_name))

                    if output_type == 'contigs' or output_type == 'scaffolds':  #Assume assembly contigs
                        if reuse_data:
                            p_num, p_stage = pfix
                        else:
                            p_num, p_stage = pipeline_num, pipeline_stage

                        # If plugin returned scaffolds
                        if type(output) is tuple and len(output) == 2:
                            out_contigs = output[0]
                            out_scaffolds = output[1]
                            cur_scaffolds = [
                                asm.prefix_file(
                                    file, "P{}_S{}_{}".format(
                                        p_num, p_stage, module_name))
                                for file in out_scaffolds
                            ]
                        else:
                            out_contigs = output
                        cur_contigs = [
                            asm.prefix_file(
                                file,
                                "P{}_S{}_{}".format(p_num, p_stage,
                                                    module_name))
                            for file in out_contigs
                        ]

                        #job_data['reads'] = asm.arast_reads(alldata)
                        job_data['contigs'] = cur_contigs

                    elif output_type == 'reads':  #Assume preprocessing
                        if include_reads and reuse_data:  # data was prefixed and moved
                            for d in output:
                                files = [
                                    asm.prefix_file(
                                        f, "P{}_S{}_{}".format(
                                            pipeline_num, pipeline_stage,
                                            module_name)) for f in d['files']
                                ]
                                d['files'] = files
                                d['short_reads'] = [] + files
                        job_data['reads'] = output
                        job_data['processed_reads'] = list(job_data['reads'])

                    else:  # Generic return, don't use in further stages
                        pipeline_results += output
                        logging.info(
                            'Generic plugin output: {}'.format(output))

                    if pipeline_stage == num_stages:  # Last stage, add contig for assessment
                        if output and (output_type == 'contigs'
                                       or output_type == 'scaffolds'
                                       ):  #If a contig was produced
                            fcontigs = cur_contigs
                            rcontigs = [
                                asm.rename_file_symlink(
                                    f, 'P{}_{}'.format(pipeline_num,
                                                       pipe_suffix))
                                for f in fcontigs
                            ]
                            try:
                                rscaffolds = [
                                    asm.rename_file_symlink(
                                        f, 'P{}_{}_{}'.format(
                                            pipeline_num, pipe_suffix,
                                            'scaff')) for f in cur_scaffolds
                                ]
                                if rscaffolds:
                                    scaffold_data = {
                                        'files': rscaffolds,
                                        'name': pipe_suffix
                                    }
                                    final_scaffolds.append(scaffold_data)
                                    output_types.append(output_type)
                            except:
                                pass
                            if rcontigs:
                                contig_data = {
                                    'files': rcontigs,
                                    'name': pipe_suffix,
                                    'alignment_bam': []
                                }
                                final_contigs.append(contig_data)
                                output_types.append(output_type)
                    try:
                        logfiles.append(mod_log)
                    except:
                        print 'error attaching ', mod_log
                    pipeline_stage += 1
                    cur_contigs = []
                    cur_scaffolds = []

                    cur_outputs.append([module_code, output, alldata])
                pipe_elapsed_time = time.time() - pipe_start_time
                pipe_ftime = str(
                    datetime.timedelta(seconds=int(pipe_elapsed_time)))
                job_data.get_pipeline(
                    pipeline_num)['elapsed_time'] = pipe_elapsed_time

                if not output:
                    self.out_report.write(
                        'ERROR: No contigs produced. See module log\n')
                else:

                    ## Assessment
                    #self.pmanager.run_module('reapr', job_data)
                    #print job_data
                    # TODO reapr break may be diff from final reapr align!
                    # ale_out, _, _ = self.pmanager.run_module('ale', job_data)
                    # if ale_out:
                    #     job_data.get_pipeline(pipeline_num).import_ale(ale_out)
                    #     ale_reports[pipe_suffix] = ale_out
                    pipeline_datapath = '{}/{}/pipeline{}/'.format(
                        job_data['datapath'], job_data['job_id'], pipeline_num)
                    try:
                        os.makedirs(pipeline_datapath)
                    except:
                        logging.info("{} exists, skipping mkdir".format(
                            pipeline_datapath))

                    # all_files.append(asm.tar_list(pipeline_datapath, pipeline_results,
                    #                     'pipe{}_{}.tar.gz'.format(pipeline_num, pipe_suffix)))

                    all_files += pipeline_results

                self.out_report.write('Pipeline {} total time: {}\n\n'.format(
                    pipeline_num, pipe_ftime))
                job_data.get_pipeline(pipeline_num)['name'] = pipe_suffix
                pipe_outputs.append(cur_outputs)
                pipeline_num += 1

            except:
                print "ERROR: Pipeline #{} Failed".format(pipeline_num)
                print format_exc(sys.exc_info())
                e = str(sys.exc_info()[1])
                if e.find('Terminated') != -1:
                    raise Exception(e)
                exceptions.append(module_name + ':\n' + str(sys.exc_info()[1]))
                pipeline_num += 1

        ## ANALYSIS: Quast
        job_data['final_contigs'] = final_contigs
        job_data['final_scaffolds'] = final_scaffolds
        job_data['params'] = []  #clear overrides from last stage

        summary = []  # Quast reports for contigs and scaffolds
        try:  #Try to assess, otherwise report pipeline errors
            if job_data['final_contigs']:
                job_data['contig_type'] = 'contigs'
                quast_report, quast_tar, z1, q_log = self.pmanager.run_module(
                    'quast', job_data, tar=True, meta=True)
                if quast_report:
                    summary.append(quast_report[0])
                with open(q_log) as infile:
                    self.out_report.write(infile.read())
            else:
                quast_report, quast_tar = '', ''

            if job_data['final_scaffolds']:
                scaff_data = dict(job_data)
                scaff_data['final_contigs'] = job_data['final_scaffolds']
                scaff_data['contig_type'] = 'scaffolds'
                scaff_report, scaff_tar, _, scaff_log = self.pmanager.run_module(
                    'quast', scaff_data, tar=True, meta=True)
                scaffold_quast = True
                if scaff_report:
                    summary.append(scaff_report[0])
                with open(scaff_log) as infile:
                    self.out_report.write('\n Quast Report - Scaffold Mode \n')
                    self.out_report.write(infile.read())
            else:
                scaffold_quast = False
        except:
            if exceptions:
                if len(exceptions) > 1:
                    raise Exception('Multiple Errors')
                else:
                    raise Exception(exceptions[0])
            else:
                raise Exception(str(sys.exc_info()[1]))

        ## CONCAT MODULE LOG FILES
        self.out_report.write("\n\n{0} Begin Module Logs {0}\n".format("=" *
                                                                       10))
        for log in logfiles:
            self.out_report.write("\n\n{0} Begin Module {0}\n".format("=" *
                                                                      10))
            try:
                with open(log) as infile:
                    self.out_report.write(infile.read())
            except:
                self.out_report.write("Error reading log file: {}\n".format(log))

        ## Format Returns
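        # Rename the Quast tarballs to job-scoped analysis archives so they
        # can be returned alongside the assembly results.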
        ctg_analysis = quast_tar.rsplit(
            '/', 1)[0] + '/{}_ctg_qst.tar.gz'.format(job_data['job_id'])
        try:
            os.rename(quast_tar, ctg_analysis)
            return_files = [ctg_analysis]
        except OSError:
            return_files = []

        if scaffold_quast:
            scf_analysis = scaff_tar.rsplit(
                '/', 1)[0] + '/{}_scf_qst.tar.gz'.format(job_data['job_id'])
            #summary = quast_report[0]
            os.rename(scaff_tar, scf_analysis)
            return_files.append(scf_analysis)

        contig_files = []
        for data in final_contigs + final_scaffolds:
            for f in data['files']:
                contig_files.append(os.path.realpath(f))

        return_files += all_files

        ## Deduplicate
        return_files = list(set(return_files))

        #if exceptions:
        # if len(exceptions) > 1:
        #     raise Exception('Multiple Errors')
        # else:
        #     raise Exception(exceptions[0])

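        # Bundle every final contig/scaffold file into a single
        # <job_id>_assemblies.tar.gz and return it with the analysis archives.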
        if contig_files:
            return_files.append(
                asm.tar_list(
                    '{}/{}'.format(job_data['datapath'],
                                   job_data['job_id']), contig_files,
                    '{}_assemblies.tar.gz'.format(job_data['job_id'])))
        print "return files: {}".format(return_files)

        return return_files, summary, contig_files, exceptions

    def upload(self, url, user, token, file, filetype='default'):
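        """Upload a file to the Shock data store; contigs use the typed upload."""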
        # Log the upload without opening the file here; the Shock client
        # handles the file I/O itself.
        logging.debug("Uploading %s to shock (type: %s)", file, filetype)
        sclient = shock.Shock(url, user, token)
        if filetype == 'contigs':
            res = sclient.upload_contigs(file)
        else:
            # Fall back to a misc upload for 'default' and any other filetype
            res = sclient.upload_misc(file, 'default')
        return res

    def download(self, url, user, token, node_id, outdir):
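        """Download a Shock node into outdir and extract the archive."""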
        sclient = shock.Shock(url, user, token)
        downloaded = sclient.curl_download_file(node_id, outdir=outdir)
        return extract_file(downloaded)

    def fetch_job(self):
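        """Connect to RabbitMQ, declare the job queue, and block consuming jobs one at a time."""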
        connection = pika.BlockingConnection(
            pika.ConnectionParameters(host=self.arasturl))
        channel = connection.channel()
        channel.basic_qos(prefetch_count=1)
        channel.queue_declare(queue=self.queue,
                              exclusive=False,
                              auto_delete=False,
                              durable=True)

        logging.basicConfig(
            format="%(asctime)s {} %(levelname)-8s %(message)s".format(proc().name))
        print proc().name, ' [*] Fetching job...'

        channel.basic_consume(self.callback, queue=self.queue)

        channel.start_consuming()

    def callback(self, ch, method, properties, body):
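        """Handle one job message: skip terminated jobs, otherwise run compute(); always ack."""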
        print " [*] %r:%r" % (method.routing_key, body)
        params = json.loads(body)
        job_doc = self.metadata.get_job(params['ARASTUSER'], params['job_id'])
        uid = job_doc['_id']
        ## Check if job was not killed
        if job_doc['status'] == 'Terminated':
            print 'Job {} was killed, skipping'.format(params['job_id'])
        else:
            try:
                self.compute(body)
            except:
                print sys.exc_info()
                status = "[FAIL] {}".format(''.join(format_tb(sys.exc_info()[2])))
                logging.error(status)
                self.metadata.update_job(uid, 'status', status)
        ch.basic_ack(delivery_tag=method.delivery_tag)

    def start(self):
        self.fetch_job()