Example #1
def extract_seq(args):
    # Download the file from Shock to the working directory.
    if args['nodeId'] is not None:
        shockClient = ShockClient(args['shockUrl'], args['auth'])
        shockClient.download_to_path(args['nodeId'], args['sourceFile'])

    # Extract the sequences from the source file.
    numReads = 0
    with open(args['destFile'], 'w') as f:
        if args['sequenceLen'] > 0: # A length to trim to was specified
            for seqRecord in SeqIO.parse(args['sourceFile'], args['format']):
                seq = str(seqRecord.seq)
                if len(seq) < args['sequenceLen']:
                    continue
                if len(seq) > args['sequenceLen']:
                    seq = seq[:args['sequenceLen']]
                f.write(str(seq) + '\n')
                numReads += 1
                if numReads == args['maxReads']:
                    break
        elif args['maxReads'] > 0:
            for seqRecord in SeqIO.parse(args['sourceFile'], args['format']):
                f.write(str(seqRecord.seq) + '\n')
                numReads += 1
                if numReads == args['maxReads']:
                    break
        else:
            for seqRecord in SeqIO.parse(args['sourceFile'], args['format']):
                f.write(str(seqRecord.seq) + '\n')

    # Delete the file if it does not have enough reads.
    if args['minReads'] > 0 and numReads < args['minReads']:
        os.remove(args['destFile'])
    return 0
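The args dictionary passed to extract_seq() carries every setting the function needs, so a single dict can be handed to a separate pool process (as the runJob() examples further below do). A minimal local call, sketched with hypothetical file names and with 'nodeId' set to None so the Shock download step is skipped (Biopython's SeqIO and the os module are assumed to be imported):

# Hypothetical usage of extract_seq() on a local FASTA file; no Shock download.
args = {
    'nodeId': None,               # no Shock node, so shockUrl and auth are unused
    'shockUrl': None,
    'auth': None,
    'sourceFile': 'reads.fasta',      # hypothetical input file
    'destFile': 'reads.sequence',     # one sequence per line is written here
    'format': 'fasta',
    'sequenceLen': 0,             # 0 disables trimming and length filtering
    'maxReads': 100,              # stop after 100 sequences
    'minReads': 10,               # destFile is deleted if fewer than 10 were written
}
extract_seq(args)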
Example #3
    def loadDatabaseFiles(self, mylog):
        ''' Load the static database files from Shock.

            The static database files are stored in the directory specified by the
            data_folder_path configuration variable.  A file is only downloaded if
            the file is not available on this system or the file has been updated
            in Shock.

            @param mylog Log object for messages
            @return Nothing
            @raise MissingFileError when database file is not found in Shock
        '''

        # Get the current info about the static database files from the cache file.
        cacheFilename = self.StatusFiles['cache_file']
        if os.path.exists(cacheFilename):
            fileCache = json.load(open(cacheFilename, "r"))
        else:
            fileCache = dict()

        # Create a shock client.
        shockClient = ShockClient(self.shockURL)

        # See if the static database files on this system are up-to-date with files stored in Shock.
        shockFiles = dict(self.DataFiles.items() + self.SearchFiles.items())
        for key in shockFiles:
            # Get info about the file stored in Shock.
            localPath = shockFiles[key]
            name = os.path.basename(localPath)
            nodelist = shockClient.query_node(
                {'lookupname': 'ProbAnnoData/' + name})
            if len(nodelist) == 0:
                message = "Database file %s is not available from %s\n" % (
                    name, self.shockURL)
                mylog.log_message(log.ERR, message)  # MBM
                raise MissingFileError(message)
            node = nodelist[0]

            # Download the file if the checksum does not match or the file is not available on this system.
            download = False
            if key in fileCache:
                if node['file']['checksum']['md5'] != fileCache[key]['file'][
                        'checksum']['md5']:
                    download = True
            else:
                download = True
            if os.path.exists(localPath) == False:
                download = True
            if download:
                sys.stderr.write("Downloading %s to %s\n" % (key, localPath))
                shockClient.download_to_path(node["id"], localPath)
                fileCache[key] = node
                mylog.log_message(log.INFO,
                                  'Downloaded %s to %s' % (key, localPath))

        # Save the updated cache file.
        json.dump(fileCache, open(cacheFilename, "w"), indent=4)
        return
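One detail worth noting: the shockFiles merge above, dict(self.DataFiles.items() + self.SearchFiles.items()), only works on Python 2, where items() returns a list that supports concatenation. A version-neutral sketch that builds the same merged dictionary, with SearchFiles winning on any duplicate key exactly as in the original expression:

# Copy one dictionary, then fold in the other; runs on Python 2 and Python 3.
shockFiles = dict(self.DataFiles)
shockFiles.update(self.SearchFiles)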
Example #4
    def loadDatabaseFiles(self, mylog):
        ''' Load the static database files from Shock.

            The static database files are stored in the directory specified by the
            data_folder_path configuration variable.  A file is only downloaded if
            the file is not available on this system or the file has been updated
            in Shock.

            @param mylog Log object for messages
            @return Nothing
            @raise MissingFileError when database file is not found in Shock
        '''
        
        # Get the current info about the static database files from the cache file.
        cacheFilename = self.StatusFiles['cache_file']
        if os.path.exists(cacheFilename):
            fileCache = json.load(open(cacheFilename, "r"))
        else:
            fileCache = dict()
        
        # Create a shock client.
        shockClient = ShockClient(self.shockURL)

        # See if the static database files on this system are up-to-date with files stored in Shock.
        shockFiles = dict(self.DataFiles.items() + self.SearchFiles.items())
        for key in shockFiles:
            # Get info about the file stored in Shock.
            localPath = shockFiles[key]
            name = os.path.basename(localPath)
            nodelist = shockClient.query_node( { 'lookupname': 'ProbAnnoData/'+name } )
            if len(nodelist) == 0:
                message = "Database file %s is not available from %s\n" %(name, self.shockURL)
                mylog.log_message(log.ERR, message) # MBM
                raise MissingFileError(message)
            node = nodelist[0]
            
            # Download the file if the checksum does not match or the file is not available on this system.
            download = False
            if key in fileCache:
                if node['file']['checksum']['md5'] != fileCache[key]['file']['checksum']['md5']:
                    download = True
            else:
                download = True
            if os.path.exists(localPath) == False:
                download = True
            if download:
                sys.stderr.write("Downloading %s to %s\n" %(key, localPath))
                shockClient.download_to_path(node["id"], localPath)
                fileCache[key] = node
                mylog.log_message(log.INFO, 'Downloaded %s to %s' %(key, localPath))
                
        # Save the updated cache file.
        json.dump(fileCache, open(cacheFilename, "w"), indent=4)
        return
Example #5
    def test_buildmatrix(self):
        ''' Run build_matrix() with four simple sequence files and verify the returned distance matrix.'''

        # Create a client.
        cbdClient = CompressionBasedDistance(self._config['cbd_url'], user_id=self._config['test_user'], password=self._config['test_pwd'])
        token = cbdClient._headers['AUTHORIZATION']
        
        # Create the input parameters.
        input = dict()
        input['format'] = 'fasta'
        input['scale'] = 'std'
        input['sequence_length'] = 0
        input['min_reads'] = 0
        input['max_reads'] = 0
        input['extreme'] = 1
        input['node_ids'] = list()

        # Upload the files to Shock.
        shockClient = ShockClient(self._config['shock_url'], token)
        for filename in InputFiles:
            node = shockClient.create_node(filename, '')
            input['node_ids'].append(node['id'])
        
        # Run the buildmatrix() function to generate a distance matrix.
        jobid = cbdClient.build_matrix(input)

        # Wait for the distance matrix to be built.
        time.sleep(30)

        # Get the distance matrix and save to a file.
        outputPath = 'client-tests/unittest.csv'
        args = [ os.path.join(os.environ['KB_TOP'], 'bin/cbd-getmatrix'), jobid,  outputPath ]
        proc = subprocess.Popen(args, stdout = subprocess.PIPE, stderr = subprocess.PIPE)
        (so, se) = proc.communicate()
        if proc.returncode != 0:
            print so
            print se
        self.assertEqual(proc.returncode, 0)
        
        # Confirm the returned distance matrix matches the saved valid output.
        vf = open('client-tests/output.csv', 'r')
        tf = open(outputPath, 'r')
        for vline in vf: 
            tline = tf.readline()
            self.assertEqual(vline, tline)
        self.assertEqual(tf.readline(), '')
        vf.close()
        tf.close()
        os.remove(outputPath)
Example #6
    def storeDatabaseFiles(self, token):
        ''' Store the static database files to Shock.

            @param token: Authorization token for authenticating to shock
            @return Nothing
        '''
        
        # Create a shock client.
        shockClient = ShockClient(self.shockURL, token=token)
        
        # Upload all of the static database files to shock.
        fileCache = dict()
        shockFiles = dict(self.DataFiles.items() + self.SearchFiles.items())
        for key in shockFiles:
            localPath = shockFiles[key]
            name = os.path.basename(localPath)
            if os.path.exists(localPath):
                sys.stderr.write('Saving "%s"...' %(localPath))
                
                # See if the file already exists in Shock.
                query = { 'lookupname': 'ProbAnnoData/'+name }
                nodelist = shockClient.query_node(query)
                
                # Remove all instances of the file in Shock.
                if nodelist != None:
                    for node in nodelist:
                        shockClient.delete_node(node['id'])
     
                # Build the attributes for this file and store as json in a separate file.
                moddate = time.ctime(os.path.getmtime(localPath))           
                attr = { 'lookupname': 'ProbAnnoData/'+name, 'moddate': moddate }
                attrFilename = os.path.join(self.dataFolderPath, name+'.attr')
                attrFid = open(attrFilename, 'w')
                json.dump(attr, attrFid, indent=4)
                attrFid.close()
                
                # Upload the file to Shock.
                metadata = shockClient.create_node(localPath, attrFilename)
                fileCache[key] = metadata
                os.remove(attrFilename)
                
                # Remove the list of users from the read ACL to give the file public read permission.
                # Note this needs to change for Shock version 0.9.5 but not sure how to set public ACLs.
                readacl = shockClient.get_acl(metadata['id'])
                shockClient.delete_acl(metadata['id'], 'read', readacl['read'][0])
                sys.stderr.write('done\n')
                
            else:
                sys.stderr.write('Could not find "%s" so it was not saved\n' %(localPath))
                
        # Save the metadata on all of the database files.
        cacheFilename = os.path.join(self.dataFolderPath, self.StatusFiles['cache_file'])
        json.dump(fileCache, open(cacheFilename, 'w'), indent=4)

        return
Example #7
    def storeDatabaseFiles(self, token):
        ''' Store the static database files to Shock.
            @param token: Authorization token for authenticating to shock
            @return Nothing
        '''

        # Create a shock client.
        shockClient = ShockClient(self.shockURL, token=token)

        # Upload all of the static database files to shock.
        fileCache = dict()
        shockFiles = dict(self.DataFiles.items() + self.SearchFiles.items())
        for key in shockFiles:
            localPath = shockFiles[key]
            name = os.path.basename(localPath)
            if os.path.exists(localPath):
                sys.stderr.write('Saving "%s"...' % (localPath))

                # See if the file already exists in Shock.
                query = {'lookupname': LOOKUP_NAME_PREFIX + '/' + name}
                nodelist = shockClient.query_node(query)

                # Remove all instances of the file in Shock.
                if nodelist != None:
                    for node in nodelist:
                        shockClient.delete_node(node['id'])

                # Build the attributes for this file and store as json in a separate file.
                moddate = time.ctime(os.path.getmtime(localPath))
                attr = {
                    'lookupname': LOOKUP_NAME_PREFIX + '/' + name,
                    'moddate': moddate
                }
                attrFilename = os.path.join(self.dataFolderPath,
                                            name + '.attr')
                attrFid = open(attrFilename, 'w')
                json.dump(attr, attrFid, indent=4)
                attrFid.close()

                # Upload the file to Shock.
                metadata = shockClient.create_node(localPath, attrFilename)
                fileCache[key] = metadata
                os.remove(attrFilename)

                # Remove the list of users from the read ACL to give the file public read permission.
                # Note this needs to change for Shock version 0.9.5 but not sure how to set public ACLs.
                readacl = shockClient.get_acl(metadata['id'])
                shockClient.delete_acl(metadata['id'], 'read',
                                       readacl['read'][0])
                sys.stderr.write('done\n')

            else:
                sys.stderr.write('Could not find "%s" so it was not saved\n' %
                                 (localPath))

        # Save the metadata on all of the database files.
        cacheFilename = os.path.join(self.dataFolderPath,
                                     self.StatusFiles['cache_file'])
        json.dump(fileCache, open(cacheFilename, 'w'), indent=4)

        return
Example #8
    input['sequence_length'] = args.sequenceLen
    input['min_reads'] = args.minReads
    input['max_reads'] = args.maxReads
    if args.extreme:
        input['extreme'] = 1
    else:
        input['extreme'] = 0
    input['node_ids'] = list()

    # Create a cbd client (which must be authenticated).
    if args.url is None:
        args.url = get_url()
    cbdClient = CompressionBasedDistance(url=args.url)

    # Create a shock client.
    shockClient = ShockClient(args.shockurl,
                              cbdClient._headers['AUTHORIZATION'])

    # Parse the input file with the list of sequence files.
    (fileList, extensions, numMissingFiles) = parse_input_file(args.inputPath)
    if numMissingFiles > 0:
        exit(1)

    # Set the format based on the sequence file extension if the format argument was not specified.
    if args.format is None:
        if len(extensions) == 1:
            input['format'] = extensions.keys()[0]
        else:
            print "The format of the sequence files could not be determined.  Set the format with the --format argument."
            exit(1)
    else:
        input['format'] = args.format
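As with the items() concatenation noted earlier, extensions.keys()[0] assumes Python 2, where keys() returns an indexable list. When exactly one extension was found, a version-neutral way to pick it is:

# Take the single dictionary key without indexing keys().
input['format'] = next(iter(extensions))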
Example #9
class CompressionBasedDistance:
    ''' Calculate the compression based distance metric and save distance matrix to a file.

        @param fileList List of paths to compressed files
        @param scale Scale of distance values, 'std' for 0 to 1, 'inf' for 0 to infinity
        @param outputFile Path to file with output distance matrix
        @return Nothing
    '''
    def _cbdCalculator(self, fileList, scale, outputFile):
        # Parse the files.
        single_sizes = dict()
        pair_sizes = dict()

        for sourceFile in fileList:
            # Should strip prefix too
            fbase = os.path.basename(sourceFile)
            # This works as long as '.sorted.xz' only occurs at the end of the path.
            fname = fbase.replace('.sorted.xz', '')
            if PairSeparator in fname:
                pair_sizes[fname] = os.path.getsize(sourceFile)
            else:
                single_sizes[fname] = os.path.getsize(sourceFile)

        # Map file names to indices.
        fnames = single_sizes.keys()
        fnames.sort()
        indices = dict()

        for name, i in zip(fnames, range(len(fnames))):
            indices[name] = i

        # Compute the distance scores.
        pair_names = pair_sizes.keys()
        cbd_array = numpy.zeros((len(fnames), len(fnames)), dtype=float)
        for pair in pair_names:
            name1, name2 = pair.split(PairSeparator)
            c1 = float(single_sizes[name1])
            c2 = float(single_sizes[name2])
            c12 = float(pair_sizes[pair])
            distance = 1.0 - (2.0 * ((c1 + c2 - c12) / (c1 + c2)))
            if distance > 1.0:
                part1 = "Distance %f is greater than 1.0.  " % (distance)
                part2 = "Check sequence read lengths and relative number of sequence reads.  "
                part3 = "(c1=%f %s, c2=%f %s c12=%f %s)" % (c1, name1, c2,
                                                            name2, c12, pair)
                raise ValueError(part1 + part2 + part3)
            if scale == 'inf':
                distance = distance / (1.0 - distance)
            cbd_array[indices[name1], indices[name2]] = distance
            cbd_array[indices[name2], indices[name1]] = distance

        # Build the output file in CSV format.
        outf = open(outputFile, 'w')
        outf.write('ID,' + ','.join(fnames) + '\n')
        for i in range(len(fnames)):
            outf.write(fnames[i] + ',' +
                       ','.join(['{0:g}'.format(x)
                                 for x in cbd_array[i, :]]) + '\n')
        outf.close()
        return

    ''' Cleanup after running a job.

        @note All temporary files are removed even when there is an error.
        @return Nothing
    '''

    def _cleanup(self):
        # Delete input fasta files from Shock.
        for nodeId in self.input['node_ids']:
            try:
                self.shockClient.delete_node(nodeId)
            except Exception as e:
                self._log(
                    log.ERR, 'Error deleting node %s from Shock: %s' %
                    (nodeId, e.message))

        # Remove the work directory.
        shutil.rmtree(self.jobDirectory)

        # Stop the process pool.
        self.pool.close()
        self.pool.join()

        return

    ''' Log a message to the system log.

        @param level Message level (INFO, WARNING, etc.)
        @param message Message text
        @return Nothing
    '''

    def _log(self, level, message):
        # Create a logger if this is the first time the method has been called.
        if self.logger is None:
            submod = os.environ.get('KB_SERVICE_NAME',
                                    'CompressionBasedDistance')
            self.logger = log.log(submod,
                                  ip_address=True,
                                  authuser=True,
                                  module=True,
                                  method=True,
                                  call_id=True,
                                  config=os.getenv('KB_DEPLOYMENT_CONFIG'))

        # Log the message.
        self.logger.log_message(level, message, self.context['client_ip'],
                                self.context['user_id'],
                                self.context['module'], self.context['method'],
                                self.context['call_id'])
        return

    def __init__(self):
        self.logger = None

    ''' Run a job to build a distance matrix.

        When successful the distance matrix csv file is stored in Shock.

        @param job Dictionary with configuration variables, context variables, and input variables for job
        @raise ExtractError: Error extracting sequences from input sequence file
        @raise SeqLenError: Error with lengths of sequences in input sequence file
        @raise SortError: Error sorting a raw sequence file
        @raise MergeError: Error merging a raw sequence file
        @raise CompressError: Error compressing a raw sequence file
        @raise ShockError: Error saving file to Shock
        @return Nothing
    '''

    def runJob(self, job):

        self.config = job['config']
        self.context = job['context']
        self.input = job['input']

        # Create a shock client and authenticate as the user.
        self.shockClient = ShockClient(self.config['shock_url'],
                                       self.context['token'])

        # Create a user and job state client and authenticate as the user.
        ujsClient = UserAndJobState(self.config['userandjobstate_url'],
                                    token=self.context['token'])

        # Create a process pool.
        self.pool = Pool(processes=int(self.config['num_pool_processes']))

        # Create a work directory for storing intermediate files.
        self.jobDirectory = make_job_dir(self.config['work_folder_path'],
                                         job['id'])
        self._log(
            log.INFO, 'Job ' + job['id'] + ' running with work folder ' +
            self.jobDirectory)

        # Download input fasta files from Shock and extract sequences to work directory.
        try:
            ujsClient.update_job_progress(job['id'], self.context['token'],
                                          'extracting sequence files', 1,
                                          timestamp(3600))
        except:
            pass
        resultList = []
        sequenceList = []
        for nodeId in self.input['node_ids']:
            node = self.shockClient.get_node(nodeId)
            sourceFile = os.path.join(self.jobDirectory, node['file']['name'])
            destFile = '%s.sequence' % (os.path.splitext(sourceFile)[0])
            if PairSeparator in destFile:  # Check for pair separator string in file name and replace as needed.
                destFile = destFile.replace(PairSeparator, '-')
            sequenceList.append(destFile)
            args = dict(
            )  # Needs to be scoped here so each process gets its own copy
            args['format'] = self.input['format']
            args['shockUrl'] = self.config['shock_url']
            args['auth'] = self.context['token']
            args['sequenceLen'] = self.input['sequence_length']
            args['minReads'] = self.input['min_reads']
            args['maxReads'] = self.input['max_reads']
            args['nodeId'] = nodeId
            args['sourceFile'] = sourceFile
            args['destFile'] = destFile
            result = self.pool.apply_async(extract_seq, (args, ))
            resultList.append(result)
        for result in resultList:
            if result.get() != 0:
                self._cleanup()
                raise ExtractError(
                    "Error extracting sequences from input sequence file, result: %d"
                    % (result.get()))
        for path in self.input['file_paths']:
            sourceFile = os.path.basename(path)
            destFile = '%s/%s.sequence' % (self.jobDirectory,
                                           os.path.splitext(sourceFile)[0])
            if PairSeparator in destFile:  # Check for pair separator string in file name and replace as needed.
                destFile = destFile.replace(PairSeparator, '-')
            sequenceList.append(destFile)
            args = dict(
            )  # Needs to be scoped here so each process gets its own copy
            args['format'] = self.input['format']
            args['shockUrl'] = self.config['shock_url']
            args['auth'] = self.context['token']
            args['sequenceLen'] = self.input['sequence_length']
            args['minReads'] = self.input['min_reads']
            args['maxReads'] = self.input['max_reads']
            args['nodeId'] = None
            args['sourceFile'] = path
            args['destFile'] = destFile
            result = self.pool.apply_async(extract_seq, (args, ))
            resultList.append(result)
        for result in resultList:
            try:
                result.get()
            except Exception as e:
                self._cleanup()
                raise ExtractError(
                    "Error extracting sequences from input sequence file: %s" %
                    (e.message))

        # Confirm that each file met the criteria for sequence length and number of sequences.
        filesToRemove = list()
        for index in range(len(sequenceList)):
            # See if the file did not have the minimum number of sequences.
            if not os.path.exists(sequenceList[index]):
                filesToRemove.append(index)
                continue

            # See if the file has no data.
            if os.path.getsize(sequenceList[index]) == 0:
                self._cleanup()
                raise SeqLenError("Sequence file '%s' has no sequences" %
                                  (sequenceList[index]))

        filteredList = list()
        for index in range(len(sequenceList)):
            if index not in filesToRemove:
                filteredList.append(sequenceList[index])
        if len(filteredList) < 2:
            self._cleanup()
            raise SeqLenError(
                "There are not enough sequence files that meet the sequence length or number of sequences criteria."
            )

        # Sort the sequences.
        try:
            ujsClient.update_job_progress(job['id'], self.context['token'],
                                          'sorting sequence files', 1,
                                          timestamp(3600))
        except:
            pass
        resultList = []
        sortedList = []
        for sourceFile in filteredList:
            destFile = '%s.sorted' % (os.path.splitext(sourceFile)[0])
            sortedList.append(destFile)
            args = ['/usr/bin/sort', '--output=%s' % (destFile), sourceFile]
            result = self.pool.apply_async(run_command, (args, ))
            resultList.append(result)
        for result in resultList:
            try:
                result.get()
            except CommandError as e:
                self._cleanup()
                raise SortError(
                    "Error sorting sequence file: %s\nCommand: '%s'\nStdout: '%s'\nStderr: '%s'"
                    % (e.message, e.cmd, e.stdout, e.stderr))

        # Create combined and sorted files.
        try:
            ujsClient.update_job_progress(
                job['id'], self.context['token'],
                'merging all pairs of sequence files', 1, timestamp(3600))
        except:
            pass
        resultList = []
        for p, q in combinations(sortedList, 2):
            pbase = os.path.basename(p)
            qbase = os.path.basename(q)
            dbase = '%s%s%s.sorted' % (os.path.splitext(pbase)[0],
                                       PairSeparator,
                                       os.path.splitext(qbase)[0])
            destFile = os.path.join(self.jobDirectory, dbase)
            sortedList.append(destFile)
            args = ['/usr/bin/sort', '-m', '--output=%s' % (destFile), p, q]
            result = self.pool.apply_async(run_command, (args, ))
            resultList.append(result)
        for result in resultList:
            try:
                result.get()
            except CommandError as e:
                self._cleanup()
                raise MergeError(
                    "Error merging sequence file: %s\nCommand: '%s'\nStdout: '%s'\nStderr: '%s'"
                    % (e.message, e.cmd, e.stdout, e.stderr))

        # Compress all sorted files.
        try:
            ujsClient.update_job_progress(job['id'], self.context['token'],
                                          'compressing sequence files', 1,
                                          timestamp(3600))
        except:
            pass
        resultList = []
        compressedList = []
        for sourceFile in sortedList:
            compressedList.append(sourceFile + '.xz')
            if self.input['extreme']:
                level = '-9e'
            else:
                level = '-9'
            args = ['/usr/bin/xz', '--keep', level, '--no-warn', sourceFile]
            result = self.pool.apply_async(run_command, (args, ))
            resultList.append(result)
        for result in resultList:
            try:
                result.get()
            except CommandError as e:
                self._cleanup()
                raise CompressError(
                    "Error compressing sequence file: %s\nCommand: '%s'\nStdout: '%s'\nStderr: '%s'"
                    % (e.message, e.cmd, e.stdout, e.stderr))

        # Calculate the distance matrix.
        try:
            ujsClient.update_job_progress(job['id'], self.context['token'],
                                          'calculating distance matrix', 1,
                                          timestamp(3600))
        except:
            pass
        csvFile = os.path.join(self.jobDirectory, '%s.csv' % (job['id']))
        self._cbdCalculator(compressedList, self.input['scale'], csvFile)

        # Store the output file in shock.
        try:
            ujsClient.update_job_progress(job['id'], self.context['token'],
                                          'storing output file in shock', 1,
                                          timestamp(3600))
        except:
            pass
        node = self.shockClient.create_node(csvFile, '')
        if not node['id']:
            # Shock let us down. Save the distance matrix in the work directory for possible recovery.
            os.rename(
                csvFile,
                '%s/%s.csv' % (self.config['work_folder_path'], job['id']))
            self._cleanup()
            raise ShockError(
                "Error saving distance matrix file to Shock. A Shock node was not created."
            )

        # Mark the job as complete.
        results = {
            'shocknodes': [node['id']],
            'shockurl': self.config['shock_url']
        }
        ujsClient.complete_job(job['id'], self.context['token'], 'done', None,
                               results)
        self._log(log.INFO, 'Job ' + job['id'] + ' completed successfully')

        # Cleanup after ourselves.
        self._cleanup()

        return

    def calculate(self, listFilePath, scale, csvFile):

        # Each line of the list file is a path to a compressed file.
        compressedList = list()
        listFile = open(listFilePath, 'r')
        for line in listFile:
            compressedList.append(line.strip())
        listFile.close()

        # Calculate the distance matrix.
        self._cbdCalculator(compressedList, scale, csvFile)
        return
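The distance computed in _cbdCalculator() is 1 - 2*(c1 + c2 - c12)/(c1 + c2), where c1 and c2 are the compressed sizes of the two individual sorted files and c12 is the compressed size of their merged file. A quick check of the arithmetic with hypothetical sizes:

# Worked example of the distance formula with made-up compressed sizes (bytes).
c1, c2, c12 = 1000.0, 1200.0, 1900.0
distance = 1.0 - (2.0 * ((c1 + c2 - c12) / (c1 + c2)))
print(distance)                      # 0.7272... on the 'std' scale (0 to 1)
print(distance / (1.0 - distance))   # 2.666... on the 'inf' scale (0 to infinity)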
Example #10
    def runJob(self, job):

        self.config = job['config']
        self.context = job['context']
        self.input = job['input']

        # Create a shock client and authenticate as the user.
        self.shockClient = ShockClient(self.config['shock_url'],
                                       self.context['token'])

        # Create a user and job state client and authenticate as the user.
        ujsClient = UserAndJobState(self.config['userandjobstate_url'],
                                    token=self.context['token'])

        # Create a process pool.
        self.pool = Pool(processes=int(self.config['num_pool_processes']))

        # Create a work directory for storing intermediate files.
        self.jobDirectory = make_job_dir(self.config['work_folder_path'],
                                         job['id'])
        self._log(
            log.INFO, 'Job ' + job['id'] + ' running with work folder ' +
            self.jobDirectory)

        # Download input fasta files from Shock and extract sequences to work directory.
        try:
            ujsClient.update_job_progress(job['id'], self.context['token'],
                                          'extracting sequence files', 1,
                                          timestamp(3600))
        except:
            pass
        resultList = []
        sequenceList = []
        for nodeId in self.input['node_ids']:
            node = self.shockClient.get_node(nodeId)
            sourceFile = os.path.join(self.jobDirectory, node['file']['name'])
            destFile = '%s.sequence' % (os.path.splitext(sourceFile)[0])
            if PairSeparator in destFile:  # Check for pair separator string in file name and replace as needed.
                destFile = destFile.replace(PairSeparator, '-')
            sequenceList.append(destFile)
            args = dict(
            )  # Needs to be scoped here so each process gets its own copy
            args['format'] = self.input['format']
            args['shockUrl'] = self.config['shock_url']
            args['auth'] = self.context['token']
            args['sequenceLen'] = self.input['sequence_length']
            args['minReads'] = self.input['min_reads']
            args['maxReads'] = self.input['max_reads']
            args['nodeId'] = nodeId
            args['sourceFile'] = sourceFile
            args['destFile'] = destFile
            result = self.pool.apply_async(extract_seq, (args, ))
            resultList.append(result)
        for result in resultList:
            if result.get() != 0:
                self._cleanup()
                raise ExtractError(
                    "Error extracting sequences from input sequence file, result: %d"
                    % (result.get()))
        for path in self.input['file_paths']:
            sourceFile = os.path.basename(path)
            destFile = '%s/%s.sequence' % (self.jobDirectory,
                                           os.path.splitext(sourceFile)[0])
            if PairSeparator in destFile:  # Check for pair separator string in file name and replace as needed.
                destFile = destFile.replace(PairSeparator, '-')
            sequenceList.append(destFile)
            args = dict(
            )  # Needs to be scoped here so each process gets its own copy
            args['format'] = self.input['format']
            args['shockUrl'] = self.config['shock_url']
            args['auth'] = self.context['token']
            args['sequenceLen'] = self.input['sequence_length']
            args['minReads'] = self.input['min_reads']
            args['maxReads'] = self.input['max_reads']
            args['nodeId'] = None
            args['sourceFile'] = path
            args['destFile'] = destFile
            result = self.pool.apply_async(extract_seq, (args, ))
            resultList.append(result)
        for result in resultList:
            try:
                result.get()
            except Exception as e:
                self._cleanup()
                raise ExtractError(
                    "Error extracting sequences from input sequence file: %s" %
                    (e.message))

        # Confirm that each file met the criteria for sequence length and number of sequences.
        filesToRemove = list()
        for index in range(len(sequenceList)):
            # See if the file did not have the minimum number of sequences.
            if not os.path.exists(sequenceList[index]):
                filesToRemove.append(index)
                continue

            # See if the file has no data.
            if os.path.getsize(sequenceList[index]) == 0:
                self._cleanup()
                raise SeqLenError("Sequence file '%s' has no sequences" %
                                  (sequenceList[index]))

        filteredList = list()
        for index in range(len(sequenceList)):
            if index not in filesToRemove:
                filteredList.append(sequenceList[index])
        if len(filteredList) < 2:
            self._cleanup()
            raise SeqLenError(
                "There are not enough sequence files that meet the sequence length or number of sequences criteria."
            )

        # Sort the sequences.
        try:
            ujsClient.update_job_progress(job['id'], self.context['token'],
                                          'sorting sequence files', 1,
                                          timestamp(3600))
        except:
            pass
        resultList = []
        sortedList = []
        for sourceFile in filteredList:
            destFile = '%s.sorted' % (os.path.splitext(sourceFile)[0])
            sortedList.append(destFile)
            args = ['/usr/bin/sort', '--output=%s' % (destFile), sourceFile]
            result = self.pool.apply_async(run_command, (args, ))
            resultList.append(result)
        for result in resultList:
            try:
                result.get()
            except CommandError as e:
                self._cleanup()
                raise SortError(
                    "Error sorting sequence file: %s\nCommand: '%s'\nStdout: '%s'\nStderr: '%s'"
                    % (e.message, e.cmd, e.stdout, e.stderr))

        # Create combined and sorted files.
        try:
            ujsClient.update_job_progress(
                job['id'], self.context['token'],
                'merging all pairs of sequence files', 1, timestamp(3600))
        except:
            pass
        resultList = []
        for p, q in combinations(sortedList, 2):
            pbase = os.path.basename(p)
            qbase = os.path.basename(q)
            dbase = '%s%s%s.sorted' % (os.path.splitext(pbase)[0],
                                       PairSeparator,
                                       os.path.splitext(qbase)[0])
            destFile = os.path.join(self.jobDirectory, dbase)
            sortedList.append(destFile)
            args = ['/usr/bin/sort', '-m', '--output=%s' % (destFile), p, q]
            result = self.pool.apply_async(run_command, (args, ))
            resultList.append(result)
        for result in resultList:
            try:
                result.get()
            except CommandError as e:
                self._cleanup()
                raise MergeError(
                    "Error merging sequence file: %s\nCommand: '%s'\nStdout: '%s'\nStderr: '%s'"
                    % (e.message, e.cmd, e.stdout, e.stderr))

        # Compress all sorted files.
        try:
            ujsClient.update_job_progress(job['id'], self.context['token'],
                                          'compressing sequence files', 1,
                                          timestamp(3600))
        except:
            pass
        resultList = []
        compressedList = []
        for sourceFile in sortedList:
            compressedList.append(sourceFile + '.xz')
            if self.input['extreme']:
                level = '-9e'
            else:
                level = '-9'
            args = ['/usr/bin/xz', '--keep', level, '--no-warn', sourceFile]
            result = self.pool.apply_async(run_command, (args, ))
            resultList.append(result)
        for result in resultList:
            try:
                result.get()
            except CommandError as e:
                self._cleanup()
                raise CompressError(
                    "Error compressing sequence file: %s\nCommand: '%s'\nStdout: '%s'\nStderr: '%s'"
                    % (e.message, e.cmd, e.stdout, e.stderr))

        # Calculate the distance matrix.
        try:
            ujsClient.update_job_progress(job['id'], self.context['token'],
                                          'calculating distance matrix', 1,
                                          timestamp(3600))
        except:
            pass
        csvFile = os.path.join(self.jobDirectory, '%s.csv' % (job['id']))
        self._cbdCalculator(compressedList, self.input['scale'], csvFile)

        # Store the output file in shock.
        try:
            ujsClient.update_job_progress(job['id'], self.context['token'],
                                          'storing output file in shock', 1,
                                          timestamp(3600))
        except:
            pass
        node = self.shockClient.create_node(csvFile, '')
        if not node['id']:
            # Shock let us down. Save the distance matrix in the work directory for possible recovery.
            os.rename(
                csvFile,
                '%s/%s.csv' % (self.config['work_folder_path'], job['id']))
            self._cleanup()
            raise ShockError(
                "Error saving distance matrix file to Shock. A Shock node was not created."
            )

        # Mark the job as complete.
        results = {
            'shocknodes': [node['id']],
            'shockurl': self.config['shock_url']
        }
        ujsClient.complete_job(job['id'], self.context['token'], 'done', None,
                               results)
        self._log(log.INFO, 'Job ' + job['id'] + ' completed successfully')

        # Cleanup after ourselves.
        self._cleanup()

        return
Example #11
class CompressionBasedDistance:
    
    ''' Calculate the compression based distance metric and save distance matrix to a file.

        @param fileList List of paths to compressed files
        @param scale Scale of distance values, 'std' for 0 to 1, 'inf' for 0 to infinity
        @param outputFile Path to file with output distance matrix
        @return Nothing
    '''

    def _cbdCalculator(self, fileList, scale, outputFile):
        # Parse the files.
        single_sizes = dict()
        pair_sizes = dict()
        
        for sourceFile in fileList:
            # Should strip prefix too
            fbase = os.path.basename(sourceFile)
            # This works as long as '.sorted.xz' only occurs at the end of the path.
            fname = fbase.replace('.sorted.xz', '')
            if PairSeparator in fname:
                pair_sizes[fname] = os.path.getsize(sourceFile)
            else:
                single_sizes[fname] = os.path.getsize(sourceFile)

        # Map file names to indices.
        fnames = single_sizes.keys()
        fnames.sort()
        indices = dict()
        
        for name,i in zip(fnames, range(len(fnames))):
            indices[name] = i
        
        # Compute the distance scores.
        pair_names = pair_sizes.keys()
        cbd_array = numpy.zeros((len(fnames), len(fnames)), dtype=float)
        for pair in pair_names:
            name1, name2 = pair.split(PairSeparator)
            c1 = float(single_sizes[name1])
            c2 = float(single_sizes[name2])
            c12 = float(pair_sizes[pair])
            distance = 1.0 - ( 2.0 * ( (c1 + c2 - c12) / (c1 + c2) ) )
            if distance > 1.0:
                part1 = "Distance %f is greater than 1.0.  " %(distance)
                part2 = "Check sequence read lengths and relative number of sequence reads.  "
                part3 = "(c1=%f %s, c2=%f %s c12=%f %s)" %(c1, name1, c2, name2, c12, pair)
                raise ValueError(part1+part2+part3)
            if scale == 'inf':
                distance = distance/(1.0 - distance)
            cbd_array[indices[name1],indices[name2]] = distance
            cbd_array[indices[name2],indices[name1]] = distance
            
        # Build the output file in CSV format.
        outf = open(outputFile, 'w')
        outf.write('ID,' + ','.join(fnames) + '\n')
        for i in range(len(fnames)):
             outf.write(fnames[i] + ',' + ','.join(['{0:g}'.format(x) for x in cbd_array[i,:]]) + '\n')
        outf.close()
        return
    
    ''' Cleanup after running a job.

        @note All temporary files are removed even when there is an error.
        @return Nothing
    '''

    def _cleanup(self):
        # Delete input fasta files from Shock.
        for nodeId in self.input['node_ids']:
            try:
                self.shockClient.delete_node(nodeId)
            except Exception as e:
                self._log(log.ERR, 'Error deleting node %s from Shock: %s' %(nodeId, e.message))
            
        # Remove the work directory.
        shutil.rmtree(self.jobDirectory)
            
        # Stop the process pool.
        self.pool.close()
        self.pool.join()
        
        return
    
    ''' Log a message to the system log.

        @param level Message level (INFO, WARNING, etc.)
        @param message Message text
        @return Nothing
    '''

    def _log(self, level, message):
        # Create a logger if this is the first time the method has been called.
        if self.logger is None:
            submod = os.environ.get('KB_SERVICE_NAME', 'CompressionBasedDistance')
            self.logger = log.log(submod, ip_address=True, authuser=True, module=True, method=True,
                call_id=True, config=os.getenv('KB_DEPLOYMENT_CONFIG'))

        # Log the message.
        self.logger.log_message(level, message, self.context['client_ip'], self.context['user_id'], self.context['module'],
                                self.context['method'], self.context['call_id'])
        return

    def __init__(self):
        self.logger = None

    ''' Run a job to build a distance matrix.

        When successful the distance matrix csv file is stored in Shock.

        @param job Dictionary with configuration variables, context variables, and input variables for job
        @raise ExtractError: Error extracting sequences from input sequence file
        @raise SeqLenError: Error with lengths of sequences in input sequence file
        @raise SortError: Error sorting a raw sequence file
        @raise MergeError: Error merging a raw sequence file
        @raise CompressError: Error compressing a raw sequence file
        @raise ShockError: Error saving file to Shock
        @return Nothing
    '''

    def runJob(self, job):
        
        self.config = job['config']
        self.context = job['context']
        self.input = job['input']
        
        # Create a shock client and authenticate as the user.
        self.shockClient = ShockClient(self.config['shock_url'], self.context['token'])
        
        # Create a user and job state client and authenticate as the user.
        ujsClient = UserAndJobState(self.config['userandjobstate_url'], token=self.context['token'])

        # Create a process pool.
        self.pool = Pool(processes=int(self.config['num_pool_processes']))
        
        # Create a work directory for storing intermediate files.
        self.jobDirectory = make_job_dir(self.config['work_folder_path'], job['id'])
        self._log(log.INFO, 'Job '+job['id']+' running with work folder '+self.jobDirectory)

        # Download input fasta files from Shock and extract sequences to work directory.
        try:
            ujsClient.update_job_progress(job['id'], self.context['token'], 'extracting sequence files', 1, timestamp(3600))
        except:
            pass
        resultList = []
        sequenceList = []
        for nodeId in self.input['node_ids']:
            node = self.shockClient.get_node(nodeId)
            sourceFile = os.path.join(self.jobDirectory, node['file']['name'])
            destFile = '%s.sequence' %(os.path.splitext(sourceFile)[0])
            if PairSeparator in destFile: # Check for pair separator string in file name and replace as needed.
                destFile = destFile.replace(PairSeparator, '-')
            sequenceList.append(destFile)
            args = dict() # Needs to be scoped here so each process gets its own copy
            args['format'] = self.input['format']
            args['shockUrl'] = self.config['shock_url']
            args['auth'] = self.context['token']
            args['sequenceLen'] = self.input['sequence_length']
            args['minReads'] = self.input['min_reads']
            args['maxReads'] = self.input['max_reads']
            args['nodeId'] = nodeId
            args['sourceFile'] = sourceFile
            args['destFile'] = destFile
            result = self.pool.apply_async(extract_seq, (args,))
            resultList.append(result)
        for result in resultList:
            if result.get() != 0:
                self._cleanup()
                raise ExtractError("Error extracting sequences from input sequence file, result: %d" %(result.get()))
        for path in self.input['file_paths']:
            sourceFile = os.path.basename(path)
            destFile = '%s/%s.sequence' %(self.jobDirectory, os.path.splitext(sourceFile)[0])
            if PairSeparator in destFile: # Check for pair separator string in file name and replace as needed.
                destFile = destFile.replace(PairSeparator, '-')
            sequenceList.append(destFile)
            args = dict() # Needs to be scoped here so each process gets its own copy
            args['format'] = self.input['format']
            args['shockUrl'] = self.config['shock_url']
            args['auth'] = self.context['token']
            args['sequenceLen'] = self.input['sequence_length']
            args['minReads'] = self.input['min_reads']
            args['maxReads'] = self.input['max_reads']
            args['nodeId'] = None
            args['sourceFile'] = path
            args['destFile'] = destFile
            result = self.pool.apply_async(extract_seq, (args,))
            resultList.append(result)
        for result in resultList:
            try:
                result.get()
            except Exception as e:
                self._cleanup()
                raise ExtractError("Error extracting sequences from input sequence file: %s" %(e.message))

        # Confirm that each file met the criteria for sequence length and number of sequences.
        filesToRemove = list()
        for index in range(len(sequenceList)):
            # See if the file did not have the minimum number of sequences.
            if not os.path.exists(sequenceList[index]):
                filesToRemove.append(index)
                continue

            # See if the file has no data.
            if os.path.getsize(sequenceList[index]) == 0:
                self._cleanup()
                raise SeqLenError("Sequence file '%s' has no sequences" %(sequenceList[index]))

        filteredList = list()
        for index in range(len(sequenceList)):
            if index not in filesToRemove:
                filteredList.append(sequenceList[index])
        if len(filteredList) < 2:
            self._cleanup()
            raise SeqLenError("There are not enough sequence files that meet the sequence length or number of sequences criteria.")

        # Sort the sequences.
        try:
            ujsClient.update_job_progress(job['id'], self.context['token'], 'sorting sequence files', 1, timestamp(3600))
        except:
            pass
        resultList = []
        sortedList = []
        for sourceFile in filteredList:
            destFile = '%s.sorted' %(os.path.splitext(sourceFile)[0])
            sortedList.append(destFile)
            args = [ '/usr/bin/sort', '--output=%s' %(destFile), sourceFile ]
            result = self.pool.apply_async(run_command, (args,))
            resultList.append(result)
        for result in resultList:
            try:
                result.get()
            except CommandError as e:
                self._cleanup()
                raise SortError("Error sorting sequence file: %s\nCommand: '%s'\nStdout: '%s'\nStderr: '%s'" %(e.message, e.cmd, e.stdout, e.stderr))
             
        # Create combined and sorted files.
        try:
            ujsClient.update_job_progress(job['id'], self.context['token'], 'merging all pairs of sequence files', 1, timestamp(3600))
        except:
            pass
        resultList = []
        for p,q in combinations(sortedList, 2):
            pbase = os.path.basename(p)
            qbase = os.path.basename(q)
            dbase = '%s%s%s.sorted' %(os.path.splitext(pbase)[0], PairSeparator, os.path.splitext(qbase)[0])
            destFile = os.path.join(self.jobDirectory, dbase)
            sortedList.append(destFile)
            args = [ '/usr/bin/sort', '-m', '--output=%s' %(destFile), p, q ]
            result = self.pool.apply_async(run_command, (args,))
            resultList.append(result)
        for result in resultList:
            try:
                result.get()
            except CommandError as e:
                self._cleanup()
                raise MergeError("Error merging sequence file: %s\nCommand: '%s'\nStdout: '%s'\nStderr: '%s'" %(e.message, e.cmd, e.stdout, e.stderr))
                   
        # Compress all sorted files.
        try:
            ujsClient.update_job_progress(job['id'], self.context['token'], 'compressing sequence files', 1, timestamp(3600))
        except:
            pass
        resultList = []
        compressedList = []
        for sourceFile in sortedList:
            compressedList.append(sourceFile+'.xz')
            if self.input['extreme']:
                level = '-9e'
            else:
                level = '-9'
            args = [ '/usr/bin/xz', '--keep', level, '--no-warn', sourceFile ]
            result = self.pool.apply_async(run_command, (args,))
            resultList.append(result)
        for result in resultList:
            try:
                result.get()
            except CommandError as e:
                self._cleanup()
                raise CompressError("Error compressing sequence file: %s\nCommand: '%s'\nStdout: '%s'\nStderr: '%s'" %(e.message, e.cmd, e.stdout, e.stderr))
        
        # Calculate the distance matrix.
        try:
            ujsClient.update_job_progress(job['id'], self.context['token'], 'calculating distance matrix', 1, timestamp(3600))
        except:
            pass
        csvFile = os.path.join(self.jobDirectory, '%s.csv' %(job['id']))
        self._cbdCalculator(compressedList, self.input['scale'], csvFile)
        
        # Store the output file in shock.
        try:
            ujsClient.update_job_progress(job['id'], self.context['token'], 'storing output file in shock', 1, timestamp(3600))
        except:
            pass
        node = self.shockClient.create_node(csvFile, '')
        if not node['id']:
            # Shock let us down. Save the distance matrix in the work directory for possible recovery.
            os.rename(csvFile, '%s/%s.csv' %(self.config['work_folder_path'], job['id']))
            self._cleanup()
            raise ShockError("Error saving distance matrix file to Shock. A Shock node was not created.")
        
        # Mark the job as complete.
        results = { 'shocknodes': [ node['id'] ], 'shockurl': self.config['shock_url'] }
        ujsClient.complete_job(job['id'], self.context['token'], 'done', None, results)
        self._log(log.INFO, 'Job '+job['id']+' completed successfully')

        # Cleanup after ourselves.
        self._cleanup()
        
        return

    def calculate(self, listFilePath, scale, csvFile):

        # Each line of the list file is a path to a compressed file.
        compressedList = list()
        listFile = open(listFilePath, 'r')
        for line in listFile:
            compressedList.append(line.strip())
        listFile.close()

        # Calculate the distance matrix.
        self._cbdCalculator(compressedList, scale, csvFile)
        return
Example #12
    def runJob(self, job):
        
        self.config = job['config']
        self.context = job['context']
        self.input = job['input']
        
        # Create a shock client and authenticate as the user.
        self.shockClient = ShockClient(self.config['shock_url'], self.context['token'])
        
        # Create a user and job state client and authenticate as the user.
        ujsClient = UserAndJobState(self.config['userandjobstate_url'], token=self.context['token'])

        # Create a process pool.
        self.pool = Pool(processes=int(self.config['num_pool_processes']))
        
        # Create a work directory for storing intermediate files.
        self.jobDirectory = make_job_dir(self.config['work_folder_path'], job['id'])
        self._log(log.INFO, 'Job '+job['id']+' running with work folder '+self.jobDirectory)

        # Download input fasta files from Shock and extract sequences to work directory.
        try:
            ujsClient.update_job_progress(job['id'], self.context['token'], 'extracting sequence files', 1, timestamp(3600))
        except:
            pass
        resultList = []
        sequenceList = []
        for nodeId in self.input['node_ids']:
            node = self.shockClient.get_node(nodeId)
            sourceFile = os.path.join(self.jobDirectory, node['file']['name'])
            destFile = '%s.sequence' %(os.path.splitext(sourceFile)[0])
            if PairSeparator in destFile: # Check for pair separator string in file name and replace as needed.
                destFile = destFile.replace(PairSeparator, '-')
            sequenceList.append(destFile)
            args = dict() # Needs to be scoped here so each process gets its own copy
            args['format'] = self.input['format']
            args['shockUrl'] = self.config['shock_url']
            args['auth'] = self.context['token']
            args['sequenceLen'] = self.input['sequence_length']
            args['minReads'] = self.input['min_reads']
            args['maxReads'] = self.input['max_reads']
            args['nodeId'] = nodeId
            args['sourceFile'] = sourceFile
            args['destFile'] = destFile
            result = self.pool.apply_async(extract_seq, (args,))
            resultList.append(result)
        for result in resultList:
            if result.get() != 0:
                self._cleanup()
                raise ExtractError("Error extracting sequences from input sequence file, result: %d" %(result.get()))
        for path in self.input['file_paths']:
            sourceFile = os.path.basename(path)
            destFile = '%s/%s.sequence' %(self.jobDirectory, os.path.splitext(sourceFile)[0])
            if PairSeparator in destFile: # Check for pair separator string in file name and replace as needed.
                destFile = destFile.replace(PairSeparator, '-')
            sequenceList.append(destFile)
            args = dict() # Needs to be scoped here so each process gets its own copy
            args['format'] = self.input['format']
            args['shockUrl'] = self.config['shock_url']
            args['auth'] = self.context['token']
            args['sequenceLen'] = self.input['sequence_length']
            args['minReads'] = self.input['min_reads']
            args['maxReads'] = self.input['max_reads']
            args['nodeId'] = None
            args['sourceFile'] = path
            args['destFile'] = destFile
            result = self.pool.apply_async(extract_seq, (args,))
            resultList.append(result)
        for result in resultList:
            try:
                result.get()
            except Exception as e:
                self._cleanup()
                raise ExtractError("Error extracting sequences from input sequence file: %s" %(e.message))

        # Confirm that each file met the criteria for sequence length and number of sequences.
        filesToRemove = list()
        for index in range(len(sequenceList)):
            # See if the file did not have the minimum number of sequences.
            if not os.path.exists(sequenceList[index]):
                filesToRemove.append(index)
                continue

            # See if the file has no data.
            if os.path.getsize(sequenceList[index]) == 0:
                self._cleanup()
                raise SeqLenError("Sequence file '%s' has no sequences" %(sequenceList[index]))

        filteredList = list()
        for index in range(len(sequenceList)):
            if index not in filesToRemove:
                filteredList.append(sequenceList[index])
        if len(filteredList) < 2:
            self._cleanup()
            raise SeqLenError("There are not enough sequence files that meet the sequence length or number of sequences criteria.")

        # Sort the sequences.
        try:
            ujsClient.update_job_progress(job['id'], self.context['token'], 'sorting sequence files', 1, timestamp(3600))
        except:
            pass
        resultList = []
        sortedList = []
        for sourceFile in filteredList:
            destFile = '%s.sorted' %(os.path.splitext(sourceFile)[0])
            sortedList.append(destFile)
            args = [ '/usr/bin/sort', '--output=%s' %(destFile), sourceFile ]
            result = self.pool.apply_async(run_command, (args,))
            resultList.append(result)
        for result in resultList:
            try:
                result.get()
            except CommandError as e:
                self._cleanup()
                raise SortError("Error sorting sequence file: %s\nCommand: '%s'\nStdout: '%s'\nStderr: '%s'" %(e.message, e.cmd, e.stdout, e.stderr))
             
        # Create combined and sorted files.
        try:
            ujsClient.update_job_progress(job['id'], self.context['token'], 'merging all pairs of sequence files', 1, timestamp(3600))
        except:
            pass
        resultList = []
        for p,q in combinations(sortedList, 2):
            pbase = os.path.basename(p)
            qbase = os.path.basename(q)
            dbase = '%s%s%s.sorted' %(os.path.splitext(pbase)[0], PairSeparator, os.path.splitext(qbase)[0])
            destFile = os.path.join(self.jobDirectory, dbase)
            sortedList.append(destFile)
            args = [ '/usr/bin/sort', '-m', '--output=%s' %(destFile), p, q ]
            result = self.pool.apply_async(run_command, (args,))
            resultList.append(result)
        for result in resultList:
            try:
                result.get()
            except CommandError as e:
                self._cleanup()
                raise MergeError("Error merging sequence file: %s\nCommand: '%s'\nStdout: '%s'\nStderr: '%s'" %(e.message, e.cmd, e.stdout, e.stderr))
                   
        # Compress all sorted files.
        try:
            ujsClient.update_job_progress(job['id'], self.context['token'], 'compressing sequence files', 1, timestamp(3600))
        except:
            pass
        resultList = []
        compressedList = []
        for sourceFile in sortedList:
            compressedList.append(sourceFile+'.xz')
            if self.input['extreme']:
                level = '-9e'
            else:
                level = '-9'
            args = [ '/usr/bin/xz', '--keep', level, '--no-warn', sourceFile ]
            result = self.pool.apply_async(run_command, (args,))
            resultList.append(result)
        for result in resultList:
            try:
                result.get()
            except CommandError as e:
                self._cleanup()
                raise CompressError("Error compressing sequence file: %s\nCommand: '%s'\nStdout: '%s'\nStderr: '%s'" %(e.message, e.cmd, e.stdout, e.stderr))
        
        # Calculate the distance matrix.
        try:
            ujsClient.update_job_progress(job['id'], self.context['token'], 'calculating distance matrix', 1, timestamp(3600))
        except:
            pass
        csvFile = os.path.join(self.jobDirectory, '%s.csv' %(job['id']))
        self._cbdCalculator(compressedList, self.input['scale'], csvFile)
        
        # Store the output file in shock.
        try:
            ujsClient.update_job_progress(job['id'], self.context['token'], 'storing output file in shock', 1, timestamp(3600))
        except:
            pass
        node = self.shockClient.create_node(csvFile, '')
        if not node['id']:
            # Shock let us down. Save the distance matrix in the work directory for possible recovery.
            os.rename(csvFile, '%s/%s.csv' %(self.config['work_folder_path'], job['id']))
            self._cleanup()
            raise ShockError("Error saving distance matrix file to Shock. A Shock node was not created.")
        
        # Mark the job as complete.
        results = { 'shocknodes': [ node['id'] ], 'shockurl': self.config['shock_url'] }
        ujsClient.complete_job(job['id'], self.context['token'], 'done', None, results)
        self._log(log.INFO, 'Job '+job['id']+' completed successfully')

        # Cleanup after ourselves.
        self._cleanup()
        
        return
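The run_command function passed to pool.apply_async above is not defined in this excerpt. Below is a minimal sketch under the assumption that it runs the argument list with subprocess and raises a CommandError carrying the command line, stdout, and stderr on a nonzero exit code, matching how the callers above format their error messages; both the helper and the exception class shown here are assumptions.
# Hypothetical sketch; the real run_command and CommandError are not shown above.
import subprocess

class CommandError(Exception):
    def __init__(self, message, cmd, stdout, stderr):
        Exception.__init__(self, message)
        self.message = message
        self.cmd = cmd
        self.stdout = stdout
        self.stderr = stderr

def run_command(args):
    ''' Run an external command and raise CommandError when it fails. '''
    proc = subprocess.Popen(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    stdout, stderr = proc.communicate()
    if proc.returncode != 0:
        raise CommandError('command returned exit code %d' % proc.returncode,
                           ' '.join(args), stdout, stderr)
    return 0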
Example #13
0
        ujsClient.delete_job(args.jobID)
        exit(1)

    # Check if the job is complete.
    if not info['complete']:
        print "Job '%s' has status '%s' and is working on task %s of %s.  Check again later." \
            %(args.jobID, info['status'], info['total_progress'], info['max_progress'])
        exit(1)

    # Show job info.
    if args.showTimes:
        print 'Job started at %s and finished at %s' % (info['started'],
                                                        info['last_update'])

    # Create a shock client.
    shockClient = ShockClient(info['results']['shockurl'],
                              ujsClient._headers['AUTHORIZATION'])

    # Download the output to the specified file and remove the file from shock.
    try:
        shockClient.download_to_path(info['results']['shocknodes'][0],
                                     args.outputPath)
    except Exception as e:
        print 'Error downloading distance matrix from %s: %s' % (
            info['results']['shockurl'], e.message)
        traceback.print_exc(file=sys.stdout)
    try:
        shockClient.delete_node(info['results']['shocknodes'][0])
    except Exception as e:
        print 'Error deleting distance matrix file from %s: %s' % (
            info['results']['shockurl'], e.message)
        traceback.print_exc(file=sys.stdout)
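The layout of the downloaded distance matrix is not shown in these excerpts. Assuming it is a square CSV with an identifier header row and an identifier in the first column of each data row, a short sketch for reading the downloaded file back with the standard csv module might look like the following; the assumed layout should be checked against an actual output file.
# Hypothetical sketch; the CSV layout written by _cbdCalculator is assumed.
import csv

def read_distance_matrix(csvPath):
    ''' Read a square distance matrix CSV into a nested dictionary. '''
    with open(csvPath, 'r') as f:
        reader = csv.reader(f)
        header = next(reader)[1:]  # identifiers from the header row
        matrix = {}
        for row in reader:
            matrix[row[0]] = dict(zip(header, [float(v) for v in row[1:]]))
    return matrix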
Example #14
0
    input['sequence_length'] = args.sequenceLen
    input['min_reads'] = args.minReads
    input['max_reads'] = args.maxReads
    if args.extreme:
        input['extreme'] = 1
    else:
        input['extreme'] = 0
    input['node_ids'] = list()

    # Create a cbd client (which must be authenticated).
    if args.url is None:
        args.url = get_url()
    cbdClient = CompressionBasedDistance(url=args.url)
    
    # Create a shock client.
    shockClient = ShockClient(args.shockurl, cbdClient._headers['AUTHORIZATION'])
    
    # Parse the input file with the list of sequence files.
    (fileList, extensions, numMissingFiles) = parse_input_file(args.inputPath)
    if numMissingFiles > 0:
        exit(1)

    # Set the format based on the sequence file extension if the format argument was not specified.
    if args.format is None:
        if len(extensions) == 1:
            input['format'] = extensions.keys()[0]
        else:
            print "The format of the sequence files could not be determined.  Set the format with the --format argument."
            exit(1)
    else:
        input['format'] = args.format
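parse_input_file is referenced above but not defined in this excerpt. A plausible sketch follows, assuming the input file lists one sequence file path per line and the function returns the list of existing paths, a dictionary counting file extensions (used for the format detection above), and the number of missing files; the function body is an assumption inferred only from how the caller uses the results.
# Hypothetical sketch of parse_input_file(); return shape inferred from the caller.
import os

def parse_input_file(inputPath):
    fileList = []
    extensions = {}
    numMissingFiles = 0
    with open(inputPath, 'r') as f:
        for line in f:
            path = line.strip()
            if not path:
                continue
            if not os.path.exists(path):
                print "Sequence file '%s' does not exist" % path
                numMissingFiles += 1
                continue
            fileList.append(path)
            extension = os.path.splitext(path)[1].lstrip('.')
            extensions[extension] = extensions.get(extension, 0) + 1
    return (fileList, extensions, numMissingFiles)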
Example #15
0
        print ujsClient.get_detailed_error(args.jobID)
        ujsClient.delete_job(args.jobID)
        exit(1)

    # Check if the job is complete.
    if not info['complete']:
        print "Job '%s' has status '%s' and is working on task %s of %s.  Check again later." \
            %(args.jobID, info['status'], info['total_progress'], info['max_progress'])
        exit(1)

    # Show job info.
    if args.showTimes:
        print 'Job started at %s and finished at %s' %(info['started'], info['last_update'])

    # Create a shock client.
    shockClient = ShockClient(info['results']['shockurl'], ujsClient._headers['AUTHORIZATION'])
       
    # Download the output to the specified file and remove the file from shock.
    try:
        shockClient.download_to_path(info['results']['shocknodes'][0], args.outputPath)
    except Exception as e:
        print 'Error downloading distance matrix from %s: %s' %(info['results']['shockurl'], e.message)
        traceback.print_exc(file=sys.stdout)
    try:
        shockClient.delete_node(info['results']['shocknodes'][0])
    except Exception as e:
        print 'Error deleting distance matrix file from %s: %s' %(info['results']['shockurl'], e.message)
        traceback.print_exc(file=sys.stdout)
    
    # Delete the job.
    ujsClient.delete_job(args.jobID)