def setUp(self):
    '''
    Build the fixture shared by the formatter tests.

    Fills a HitList with ``self.number_of_hits`` bogus hits (all of them
    share the same location and alignment string) and creates an empty
    temporary file whose path is stored in ``self.outputfile``.
    '''
    self.logger = logging.root
    self.hitlist = HitList(self.logger)
    self.number_of_hits = 10000

    # The location tuples and the alignment string are identical for every hit.
    alignment = '-CTGACTGACTG'
    sequence_location = 0, 1
    target_location = 0, 1

    for number in range(self.number_of_hits):
        sequence = _SeqId('sequence-{}'.format(number), "GCTGACTGACTG")
        target = _SeqId('target-{}'.format(number), "ACTGACTGACTG")

        # Create hit
        hit = Hit(self.logger, sequence, target, sequence_location,
                  target_location)
        hit.set_alignment(alignment)
        hit.set_sequence_match(sequence.seq)
        hit.set_target_match(target.seq)
        self.hitlist.append(hit)

    # Create the temporary file for the output. mkstemp() returns an OPEN
    # OS-level file descriptor next to the path; taking only the path leaked
    # one descriptor per test. The context manager closes the handle and
    # delete=False leaves the (empty) file in place for the formatter.
    with tempfile.NamedTemporaryFile(suffix='.out',
                                     prefix='test_formatters_',
                                     delete=False) as handle:
        self.outputfile = handle.name
def __init__(self, logger, score, settings):
    '''
    Constructor. Stores the collaborators and selects the Smith-Waterman
    backend.

    @param logger: an instance of logging.logger
    @param score: a score object, handed on to the selected backend
    @param settings: a settings object. This constructor reads
        ``settings.framework`` and, for the OpenCL framework,
        ``settings.device_type`` (both compared case-insensitively). The
        complete object is passed on to the backend.

    Backend selection:
        framework CUDA                  -> SmithWatermanCuda
        framework OPENCL, device CPU    -> SmithWatermanCPU
        framework OPENCL, other device  -> SmithWatermanGPU
        any other framework             -> SmithWatermanGPU
    '''
    logger.debug('Initializing aligner...')
    self.logger = logger
    self.score = score
    self.hitlist = HitList(self.logger)
    logger.debug('Setting SW...')
    self.settings = settings

    # The backend modules are imported lazily so that only the selected
    # framework has to be importable.
    framework = self.settings.framework.upper()
    if framework == 'CUDA':
        self.logger.debug('Using CUDA implementation')
        from pyPaSWAS.Core.SmithWatermanCuda import SmithWatermanCuda as implementation
    elif framework != 'OPENCL':
        self.logger.info(
            'Unknown settings for framework. Using OpenCL GPU implementation as default')
        from pyPaSWAS.Core.SmithWatermanOcl import SmithWatermanGPU as implementation
    else:
        device_type = self.settings.device_type.upper()
        if device_type == 'CPU':
            self.logger.debug('Using OpenCL CPU implementation')
            from pyPaSWAS.Core.SmithWatermanOcl import SmithWatermanCPU as implementation
        else:
            # GPU, ACCELERATOR and unrecognised devices all use the GPU class;
            # only the log message differs.
            if device_type == 'GPU':
                message = 'Using OpenCL GPU implementation'
            elif device_type == 'ACCELERATOR':
                message = 'Using OpenCL Accelerator implementation'
            else:
                message = 'Unknown settings for device. Using default OpenCL GPU implementation'
            self.logger.debug(message)
            from pyPaSWAS.Core.SmithWatermanOcl import SmithWatermanGPU as implementation

    self.smith_waterman = implementation(self.logger, self.score, settings)
    self.logger.debug('Aligner initialized.')
class FormatterTester(unittest.TestCase):
    '''
    Runs the output formatters against a large list of bogus hits and
    checks the number of lines each of them writes.
    '''

    def setUp(self):
        '''
        Create a HitList object with some hits (all bogus) and an empty
        temporary output file (path stored in ``self.outputfile``).
        '''
        self.logger = logging.root
        self.hitlist = HitList(self.logger)
        self.number_of_hits = 10000

        for number in range(self.number_of_hits):
            sequence = _SeqId('sequence-{}'.format(number), "GCTGACTGACTG")
            target = _SeqId('target-{}'.format(number), "ACTGACTGACTG")
            alignment = '-CTGACTGACTG'

            # Create hit
            sequence_location = 0, 1
            target_location = 0, 1
            hit = Hit(self.logger, sequence, target, sequence_location,
                      target_location)
            hit.set_alignment(alignment)
            hit.set_sequence_match(sequence.seq)
            hit.set_target_match(target.seq)
            self.hitlist.append(hit)

        # Create the temporary file for the output. mkstemp() returns an OPEN
        # OS-level file descriptor next to the path; keeping only the path
        # leaked one descriptor per test. The context manager closes the
        # handle and delete=False keeps the file for the formatter under test.
        with tempfile.NamedTemporaryFile(suffix='.out',
                                         prefix='test_formatters_',
                                         delete=False) as handle:
            self.outputfile = handle.name

    def testDefaultFormatter(self):
        ''' Run the default formatter; it is expected to write 4 lines per hit '''
        formatter = Formatters.DefaultFormatter(self.logger, self.hitlist,
                                                self.outputfile)
        formatter.print_results()
        with open(self.outputfile) as reader:
            lines = reader.readlines()
        self.assertEqual(self.number_of_hits * 4, len(lines))

    def testSamFormatter(self):
        ''' Run the SAM output formatter '''
        formatter = Formatters.SamFormatter(self.logger, self.hitlist,
                                            self.outputfile)
        formatter.print_results()
        with open(self.outputfile) as reader:
            lines = reader.readlines()
        # Not sure if the number of lines is always equal to the assertion below
        self.assertEqual(self.number_of_hits * 2 + 2, len(lines))

    def tearDown(self):
        ''' Remove the temporary output file created in setUp '''
        os.remove(self.outputfile)
def process(self, records_seqs, targets, pypaswas):
    '''
    Sends the query and target sequences to the SmithWaterman instance,
    one leading pair at a time, and returns the accumulated hitlist.

    The two inputs are consumed like queues: in every round the first
    query is aligned against the remaining targets. When the aligner
    reports hits, the highest scoring one is passed to ``self.palindrome``;
    otherwise a placeholder hit with locations (0, 1) is added to
    ``self.hitlist``. Afterwards the leading query and target are dropped.

    ``pypaswas`` is accepted for interface compatibility and not used here.
    '''
    self.logger.debug('Fixing palindrome sequences...')
    remaining_seqs = records_seqs
    remaining_targets = targets
    target_index = 0

    while len(remaining_targets) > 0:
        self.smith_waterman.set_targets(remaining_targets[:1], target_index)
        results = self.smith_waterman.align_sequences(
            remaining_seqs[:1], remaining_targets, target_index)

        if len(results.real_hits) > 0:
            # max() yields the first hit carrying the highest score, which
            # is the element a stable descending sort puts in front.
            best_hit = max(results.real_hits.values(),
                           key=attrgetter('score'))
            self.palindrome(best_hit, remaining_seqs, remaining_targets,
                            self.settings)
        else:
            # Nothing more to do for this pair: record a placeholder hit.
            placeholder = HitList(self.logger)
            placeholder.append(
                Hit(self.logger, remaining_seqs[0], remaining_targets[0],
                    (0, 1), (0, 1)))
            self.hitlist.extend(placeholder)

        # Drop the pair that has just been processed.
        remaining_targets = remaining_targets[1:]
        remaining_seqs = remaining_seqs[1:]

    self.logger.debug('Fixing done.')
    return self.hitlist
def run(self):
    '''The main program of pyPaSWAS.

    Parses the command line, sets up logging, output, scoring and the
    program to run, and then walks over the query sequences in windows of
    ``settings.query_step`` and, for every query window, over the target
    sequences in windows of ``settings.sequence_step``. Each query/target
    window pair is handed to ``self.program.process`` and the returned hits
    are gathered in one HitList. When that list holds at least one hit it
    is formatted and written; otherwise a warning is logged.
    '''
    # Read command-line arguments
    self.settings, self.arguments = parse_cli(self.config_file)
    self.logger = set_logger(self.settings)
    self.logger.info("Initializing application...")
    self._set_outfile()
    self._set_scoring_matrix()
    self.logger.info('Application initialized.')
    self.logger.info('Setting program...')
    self._set_output_format()
    self._set_program()
    self.logger.info('Program set.')

    queriesToProcess = True

    # First query window; clamped to end_query unless start and end are equal.
    query_start = int(self.settings.start_query)
    query_end = int(self.settings.start_query) + int(
        self.settings.query_step)
    if query_end > int(self.settings.end_query) and int(
            self.settings.start_query) != int(self.settings.end_query):
        query_end = int(self.settings.end_query)

    # First target window, clamped the same way.
    start_index = int(self.settings.start_target)
    end_index = int(self.settings.start_target) + int(
        self.settings.sequence_step)
    if end_index > int(self.settings.end_target) and int(
            self.settings.start_target) != int(self.settings.end_target):
        end_index = int(self.settings.end_target)

    results = HitList(self.logger)

    while queriesToProcess:
        self.logger.info('Reading query sequences {} {}...'.format(
            query_start, query_end))
        try:
            query_sequences = self._get_query_sequences(self.arguments[0],
                                                        start=query_start,
                                                        end=query_end)
            self.logger.info('Query sequences OK.')
        except ReaderException:
            # A failed read ends the outer loop; the inner loop below is
            # skipped because it also tests queriesToProcess.
            queriesToProcess = False

        sequencesToProcess = True
        # Every program except "palindrome" restarts the target window for
        # each new query window.
        if not self.settings.program == "palindrome":
            start_index = int(self.settings.start_target)
            end_index = int(self.settings.start_target) + int(
                self.settings.sequence_step)
            if end_index > int(self.settings.end_target) and int(
                    self.settings.start_target) != int(
                        self.settings.end_target):
                end_index = int(self.settings.end_target)

        while queriesToProcess and sequencesToProcess:
            self.logger.info('Reading target sequences {}, {}...'.format(
                start_index, end_index))
            try:
                # NOTE(review): the targets are read from the same argument
                # (self.arguments[0]) as the queries - confirm this is
                # intended.
                target_sequences = self._get_target_sequences(
                    self.arguments[0], start=start_index, end=end_index)
                self.logger.info('Target sequences OK.')
            except ReaderException:
                sequencesToProcess = False

            # The flag tests come first so that the len() calls are never
            # evaluated on a name that was not bound by a failed read.
            if not sequencesToProcess or not queriesToProcess or len(
                    query_sequences) == 0 or len(target_sequences) == 0:
                sequencesToProcess = False
                self.logger.info('Processing done')
            else:
                self.logger.info('Processing {0}- vs {1}-sequences'.format(
                    len(query_sequences), len(target_sequences)))
                results.extend(
                    self.program.process(query_sequences, target_sequences,
                                         self))

            if sequencesToProcess and len(target_sequences) <= end_index:
                # for palindrome program, skip directly to next
                sequencesToProcess = False
                self.logger.info('Processing done')

            # Advance the target window by one step.
            start_index = start_index + int(self.settings.sequence_step)
            end_index = end_index + int(self.settings.sequence_step)
            # Stop after one pass for "palindrome", or once the advanced
            # window has moved past a positive end_target.
            if self.settings.program == "palindrome" or (
                    int(self.settings.end_target) > 0
                    and int(self.settings.end_target) < end_index):
                sequencesToProcess = False

        # Stop once the current query window has passed a positive end_query.
        if int(self.settings.end_query) > 0 and int(
                self.settings.end_query) < query_end:
            queriesToProcess = False
        # Advance the query window by one step.
        query_start = query_start + int(self.settings.query_step)
        query_end = query_end + int(self.settings.query_step)

    nhits = len(results.hits)
    # retrieve and print results!
    self.logger.info('Processing OK ({} hits found).'.format(nhits))
    if nhits > 0:
        self.logger.info('Formatting output...')
        formatter = self._get_formatter(results)
        self.logger.info('Formatting OK.')
        self.logger.info('Writing output...')
        formatter.print_results()
        self.logger.info('Writing OK.')
        self.logger.info('Finished')
    else:
        self.logger.warning('No suitable hits produced, exiting...')
def _print_alignments(self, sequences, targets, start_seq, start_target, hit_list=None):
    '''
    Turns the starting points reported for the last run into Hit objects.

    For every starting point the direction array is followed backwards
    (trace-back) to rebuild the aligned sequence, target and alignment
    strings. Hits that pass ``self._filter_hit`` are appended to the hit
    list.

    :param sequences: the query records; indexed with
        ``start_seq + <sequence index of the starting point>``
    :param targets: the target records; indexed with
        ``start_target + <target index of the starting point>``
    :param start_seq: offset of the current batch within ``sequences``
    :param start_target: offset of the current batch within ``targets``
    :param hit_list: HitList to append to; a new one is created when None
    :return: the hit list that was passed in or created
    '''
    if hit_list is None:
        hit_list = HitList(self.logger)
    self.logger.debug('Printing alignments.')
    starting_points = self._get_starting_point_byte_array()
    #starting_point = StartingPoint(self.logger)
    number_of_starting_points = self._get_number_of_starting_points()
    self.logger.debug('Number of starting points is: {0}.'.format(
        number_of_starting_points))
    # Never parse more starting points than the configured maximum per
    # sequence/target pair allows.
    if number_of_starting_points >= (self.maximum_number_starting_points *
                                     self.number_of_sequences *
                                     self.number_targets):
        self.logger.warning(
            "Too many hits returned. Skipping the rest. Please set lower_limit_score higher in config."
        )
        number_of_starting_points = self.maximum_number_starting_points * self.number_of_sequences * self.number_targets
    # NOTE(review): max_score is updated in the loop below but not read again
    # in this method.
    max_score = 0
    direction_array = self._get_direction_byte_array()
    # self.logger.debug(direction_array)

    # Parse all starting points first, then handle them in ascending score
    # order.
    starting_points_list = []
    for i in range(0, number_of_starting_points):
        starting_point = StartingPoint(self.logger)
        starting_point.parse_byte_string(starting_points, i)
        starting_points_list.append(starting_point)
    starting_points_list.sort(key=lambda s: s.score)

    i = 0
    #while (i < number_of_starting_points):
    for starting_point in starting_points_list:
        # TODO: assign starting_point instance to a variable => create function in
        # StartingPoints to retrieve an item by index
        #starting_point.parse_byte_string(starting_points, i)
        i += 1
        alignment_length = 0
        gaps_seq = 0
        gaps_target = 0
        matches = 0
        mismatches = 0
        # Collected in trace-back order, i.e. from the end of the alignment
        # towards its start; reversed before they are stored in the hit.
        target = []
        sequence = []
        alignment = []
        max_score = starting_point.score if starting_point.score > max_score else max_score
        target_starting_point = starting_point.target
        sequence_starting_point = starting_point.sequence
        block_x = int(starting_point.block_x)
        block_y = int(starting_point.block_y)
        value_x = int(starting_point.value_x)
        value_y = int(starting_point.value_y)
        # Index of the column that is appended in the current trace-back step.
        local_index = 0
        # End positions of the alignment in the sequence and in the target.
        s_end = block_x * self.shared_x + value_x
        t_end = block_y * self.shared_y + value_y
        # @TO-DO: this is bugfix for the read mapping algorithm. Should not happen, so fix this where it should be fixed
        if start_seq + sequence_starting_point >= len(
                sequences) or start_target + target_starting_point >= len(
                    targets):
            self.logger.debug("Starting points in hit incorrect. Skipping")
            continue
        # Records that carry a start_position have it added to the end
        # position.
        if hasattr(sequences[start_seq + sequence_starting_point],
                   'start_position'):
            s_end += sequences[start_seq +
                               sequence_starting_point].start_position
        if hasattr(targets[start_target + target_starting_point],
                   'start_position'):
            t_end += targets[start_target +
                             target_starting_point].start_position
        if not hasattr(sequences[sequence_starting_point + start_seq],
                       'distance'):
            sequences[sequence_starting_point + start_seq].distance = 0.0
        # The start positions are decremented once per consumed character
        # during the trace-back below.
        s_start = s_end + 1
        t_start = t_end + 1
        #direction = direction_array[sequence_starting_point][target_starting_point][block_x][block_y][value_x][value_y]
        direction = self._get_direction(direction_array,
                                        sequence_starting_point,
                                        target_starting_point, block_x,
                                        block_y, value_x, value_y)
        show = True
        # check in 'all to all' when 1 data set is used to filter out hit X vs X (filtered on identical id):
        if sequences[sequence_starting_point +
                     start_seq].id == targets[target_starting_point +
                                              start_target].id:
            direction = STOP_DIRECTION
            self.logger.debug("Found same ID sequence -> target. Skipping")
            show = False
        #self.logger.debug('Score is: {0} vs {1}.'.format(starting_point.score, self.settings.minimum_score))
        if starting_point.score < float(self.settings.minimum_score):
            show = False

        while (self._is_in_alignment(show, block_x, block_y, value_x,
                                     value_y, direction)):
            direction = self._get_direction(direction_array,
                                            sequence_starting_point,
                                            target_starting_point, block_x,
                                            block_y, value_x, value_y)
            # Mark the cell as used; reaching a marked cell again (the
            # IN_ALIGNMENT branch below) drops the hit.
            self._set_direction(IN_ALIGNMENT, direction_array,
                                sequence_starting_point,
                                target_starting_point, block_x, block_y,
                                value_x, value_y)
            alignment_length += 1
            if (direction == IN_ALIGNMENT):
                show = False
            elif (direction == UPPER_LEFT_DIRECTION):
                # Diagonal step: one character of both target and sequence.
                target.append(
                    targets[start_target +
                            target_starting_point][block_y * self.shared_y +
                                                   value_y])
                sequence.append(
                    sequences[start_seq +
                              sequence_starting_point][block_x *
                                                       self.shared_x +
                                                       value_x])
                alignment.append(
                    SmithWaterman.MATCH_CHAR if target[local_index].lower(
                    ) == sequence[local_index].lower(
                    ) else SmithWaterman.MISMATCH_CHAR)
                s_start -= 1
                t_start -= 1
                matches += 1 if target[local_index].lower(
                ) == sequence[local_index].lower() else 0
                mismatches += 1 if target[local_index].lower(
                ) != sequence[local_index].lower() else 0
                # Step back one position in x, moving to the previous block
                # when the current one is exhausted.
                if (value_x == 0):
                    block_x -= 1
                    value_x = self.shared_x - 1
                else:
                    value_x -= 1
                # Same for y.
                if (value_y == 0):
                    block_y -= 1
                    value_y = self.shared_y - 1
                else:
                    value_y -= 1
            elif (direction == LEFT_DIRECTION):
                # Gap in the target: only the sequence position moves.
                gaps_target += 1
                target.append(SmithWaterman.GAP_CHAR_SEQ)
                sequence.append(
                    sequences[start_seq +
                              sequence_starting_point][block_x *
                                                       self.shared_x +
                                                       value_x])
                alignment.append(SmithWaterman.GAP_CHAR_ALIGN)
                s_start -= 1
                if (value_x == 0):
                    block_x -= 1
                    value_x = self.shared_x - 1
                else:
                    value_x -= 1
            elif (direction == UPPER_DIRECTION):
                # Gap in the sequence: only the target position moves.
                gaps_seq += 1
                target.append(
                    targets[start_target +
                            target_starting_point][block_y * self.shared_y +
                                                   value_y])
                sequence.append(SmithWaterman.GAP_CHAR_SEQ)
                alignment.append(SmithWaterman.GAP_CHAR_ALIGN)
                t_start -= 1
                if (value_y == 0):
                    block_y -= 1
                    value_y = self.shared_y - 1
                else:
                    value_y -= 1
            elif (direction == STOP_DIRECTION):
                # end of alignment
                target.append(
                    targets[start_target +
                            target_starting_point][block_y * self.shared_y +
                                                   value_y])
                sequence.append(
                    sequences[start_seq +
                              sequence_starting_point][block_x *
                                                       self.shared_x +
                                                       value_x])
                alignment.append(
                    SmithWaterman.MATCH_CHAR if target[local_index].lower(
                    ) == sequence[local_index].lower(
                    ) else SmithWaterman.MISMATCH_CHAR)
                s_start -= 1
                t_start -= 1
                matches += 1 if target[local_index].lower(
                ) == sequence[local_index].lower() else 0
                mismatches += 1 if target[local_index].lower(
                ) != sequence[local_index].lower() else 0
            else:
                # Unknown direction value: block_x is set to -1, presumably
                # so that _is_in_alignment ends the loop - TODO confirm.
                block_x = -1
            local_index += 1

        if show:
            hit = Hit(self.logger,
                      sequences[sequence_starting_point + start_seq],
                      targets[target_starting_point + start_target],
                      (s_start, s_end), (t_start, t_end))
            # set the strings that contain the alignment information. Since they have been generated by a trace-BACK, they should be reversed first
            sequence.reverse()
            alignment.reverse()
            target.reverse()
            hit.set_sequence_match(''.join(sequence))
            hit.set_alignment(''.join(alignment))
            hit.set_target_match(''.join(target))
            hit.set_scores(starting_point.score, matches, mismatches)
            if self._filter_hit(hit):
                hit_list.append(hit)
            else:
                self.logger.debug(
                    "Hit {0} -vs- {1} does not meet filter requirements".
                    format(
                        sequences[sequence_starting_point + start_seq].id,
                        targets[target_starting_point + start_target].id))
        #if not show:
        #    self.logger.debug("Hit {0} -vs- {1} not shown".format(sequences[sequence_starting_point + start_seq].id, targets[target_starting_point + start_target].id ))
    return hit_list
def align_sequences(self, records_seqs, targets, target_index):
    '''
    Aligns sequences against the targets. Returns the resulting alignments
    in a hitlist.

    The sequences are handled in batches of ``self.max_sequences``. For
    every batch the sequences are padded to a common length, copied to the
    device together with ``self.target_array``, scored and traced back; the
    resulting hits are appended to one HitList by ``_print_alignments``.

    @param records_seqs: the query records; their ``seq`` attribute is read
    @param targets: the target records, passed on to the helpers
    @param target_index: offset into ``targets``, passed on to the helpers
    @return: a HitList holding the hits of all batches
    '''
    # reset values for next set of sequences
    index = 0
    prev_seq_length = 0
    prev_target_length = 0
    cont = True
    # step through all the sequences
    max_length = 0
    if len(records_seqs) > 0:
        max_length = len(records_seqs[0])
    hitlist = HitList(self.logger)

    while index < len(records_seqs) and cont:
        # make sure length of sequences can be divided by shared_x
        # don't reset when no need to recompile:
        if self.settings.recompile == "F":
            # Without recompiling, every batch uses the length of the first
            # record, rounded up to a multiple of shared_x.
            length = int(
                math.ceil(max_length / float(self.shared_x)) *
                self.shared_x)
        else:
            length = int(
                math.ceil(
                    len(records_seqs[index].seq) / float(self.shared_x)) *
                self.shared_x)

        # if lengths of sequences or targets differ, reset CUDA code and memory, otherwise use current settings
        # this way there is no need to recompile the code for every run
        if ((length != prev_seq_length
             or self.target_block_length != prev_target_length) or
            (self.max_sequences + index >= len(records_seqs))
            ) and self.settings.recompile == "T":
            # clear memory
            self._clear_memory()
            if (length != prev_seq_length
                    or self.target_block_length != prev_target_length):
                # see how many sequences fit in memory
                self.max_sequences = int(
                    math.floor((self._get_max_number_sequences(
                        length, self.target_block_length,
                        self.number_of_targets))))
                # NOTE(review): the guard compares
                # max_sequences * length / shared_x with internal_limit, but
                # the replacement value is internal_limit / shared_x * length
                # and not internal_limit * shared_x / length - confirm which
                # one is intended.
                if self.max_sequences * length / self.shared_x > self.internal_limit:
                    self.max_sequences = int(self.internal_limit /
                                             self.shared_x * length)
                #self.logger.info("Maximum number of seqs in memory: {} {}".format(self.max_sequences, self.number_of_targets))
                # set parameters for this run
                if self.max_sequences + index >= len(records_seqs):
                    self.max_sequences = len(records_seqs) - index
            elif self.max_sequences + index >= len(records_seqs):
                # Same lengths as the previous batch: this is the last batch.
                self.max_sequences = len(records_seqs) - index
                cont = False
            if self.max_sequences > self.internal_limit:
                # The batch was cut down to internal_limit, so the loop has
                # to continue with the remaining sequences.
                self.max_sequences = self.internal_limit
                cont = True
            self._init_sw(length, self.target_block_length,
                          self.max_sequences, self.number_of_targets)
            self.has_been_compiled = False
        elif self.settings.recompile == "F":
            # NOTE(review): has_been_compiled is not set to True in this
            # method; presumably one of the helpers called here does that -
            # verify.
            if not self.has_been_compiled:
                self._clear_memory()
                self.max_sequences = int(
                    math.floor((self._get_max_number_sequences(
                        length, self.target_block_length,
                        self.number_of_targets))))
                # NOTE(review): same clamp formula as in the branch above -
                # confirm.
                if self.max_sequences * length / self.shared_x > self.internal_limit:
                    self.max_sequences = int(self.internal_limit /
                                             self.shared_x * length)
                if self.max_sequences + index >= len(records_seqs):
                    self.max_sequences = len(records_seqs) - index
                self._init_sw(length, self.target_block_length,
                              self.max_sequences, self.number_of_targets)

        # add sequences to the list
        self.added_dummy_seqs = 0
        sequenceStr = []
        # NOTE(review): second, redundant reset of added_dummy_seqs.
        self.added_dummy_seqs = 0
        self.min_score_np = numpy.zeros(self.max_sequences *
                                        self.number_of_targets,
                                        dtype=numpy.float32)
        for i in range(self.max_sequences):
            if i + index < len(records_seqs):
                sequenceStr.append(
                    SWSeq.extentToFillGPU(str(records_seqs[i + index].seq),
                                          length))
            else:
                # Fill the rest of the batch with dummy sequences made of
                # SPECIAL_CHAR and count them.
                sequenceStr.append(
                    SWSeq.extentToFillGPU(SWSeq.SPECIAL_CHAR * length,
                                          length))
                self.added_dummy_seqs += 1
            self._set_max_possible_score(target_index, targets, i, index,
                                         records_seqs)
        self._copy_min_score()

        # copy sequences and targets to the device
        sequence_array = numpy.array(''.join(sequenceStr),
                                     dtype=numpy.character)
        self.logger.debug("At sequence: {0} of {1}, length = {2}".format(
            index, len(records_seqs), self.max_sequences))
        # set lengths of this run
        prev_seq_length = length
        prev_target_length = self.target_block_length
        if self.settings.recompile == "F":
            # Without recompiling, the real (non-dummy) counts are passed as
            # parameters instead.
            self._set_parameters(
                length, self.target_block_length,
                self.max_sequences - self.added_dummy_seqs,
                self.number_of_targets - self.added_dummy_targets)
            #self._set_parameters(length, self.target_block_length, self.max_sequences, self.number_of_targets)

        # copy sequences and targets to the device
        t = time.time()
        self.copy_sequences(sequence_array, self.target_array)
        # initialize index for zero copy of starting points
        self._init_zero_copy()
        # calculate scores of alignments
        self._calculate_score()
        # perform the traceback
        self._traceback_host()
        # TODO: change to returning a value, change _print_alignments to getAlignments in SmithWaterman
        # TODO: move _print_alignments to here? This should be a statement to retrieve the results and
        # put them into a Hitlist (?)
        #hitlist = self._print_alignments(records_seqs, targets, index, target_index)
        self._print_alignments(records_seqs, targets, index, target_index,
                               hitlist)
        self.logger.info(
            "Time spent on Smith-Waterman > {}".format(time.time() - t))
        # Continue with the first sequence after this batch.
        index += self.max_sequences
    return hitlist
class Aligner(object):
    '''
    Common functionality shared by all aligning programs from the pyPaSWAS
    suite is encapsulated in this class.
    '''

    def __init__(self, logger, score, settings):
        '''
        Constructor. Stores the collaborators and selects the
        Smith-Waterman backend.

        @param logger: an instance of logging.logger
        @param score: a score object, handed on to the selected backend
        @param settings: a settings object. This constructor reads
            ``settings.framework`` and, for the OpenCL framework,
            ``settings.device_type`` and (for GPU devices)
            ``settings.platform_name``; all are compared
            case-insensitively. The complete object is passed on to the
            backend.
        '''
        logger.debug('Initializing aligner...')
        self.logger = logger
        self.score = score
        self.hitlist = HitList(self.logger)
        logger.debug('Setting SW...')
        self.settings = settings

        # The backend modules are imported lazily so that only the selected
        # framework has to be importable.
        if (self.settings.framework.upper() == 'OPENCL'):
            if (self.settings.device_type.upper() == 'GPU'):
                if (self.settings.platform_name.upper() == 'NVIDIA'):
                    self.logger.debug('Using OpenCL NVIDIA implementation')
                    from pyPaSWAS.Core.SmithWatermanOcl import SmithWatermanNVIDIA
                    self.smith_waterman = SmithWatermanNVIDIA(self.logger, self.score, settings)
                else:
                    self.logger.debug('Using OpenCL GPU implementation')
                    from pyPaSWAS.Core.SmithWatermanOcl import SmithWatermanGPU
                    self.smith_waterman = SmithWatermanGPU(self.logger, self.score, settings)
            elif (self.settings.device_type.upper() == 'CPU'):
                self.logger.debug('Using OpenCL CPU implementation')
                from pyPaSWAS.Core.SmithWatermanOcl import SmithWatermanCPU
                self.smith_waterman = SmithWatermanCPU(self.logger, self.score, settings)
            elif (self.settings.device_type.upper() == 'ACCELERATOR'):
                self.logger.debug('Using OpenCL Accelerator implementation')
                from pyPaSWAS.Core.SmithWatermanOcl import SmithWatermanGPU
                self.smith_waterman = SmithWatermanGPU(self.logger, self.score, settings)
            else:
                self.logger.debug('Unknown settings for device. Using default OpenCL GPU implementation')
                from pyPaSWAS.Core.SmithWatermanOcl import SmithWatermanGPU
                self.smith_waterman = SmithWatermanGPU(self.logger, self.score, settings)
        elif self.settings.framework.upper() == 'CUDA':
            self.logger.debug('Using CUDA implementation')
            from pyPaSWAS.Core.SmithWatermanCuda import SmithWatermanCuda
            self.smith_waterman = SmithWatermanCuda(self.logger, self.score, settings)
        else:
            self.logger.info('Unknown settings for framework. Using OpenCL GPU implementation as default')
            from pyPaSWAS.Core.SmithWatermanOcl import SmithWatermanGPU
            self.smith_waterman = SmithWatermanGPU(self.logger, self.score, settings)

        self.logger.debug('Aligner initialized.')

    def process(self, records_seqs, targets, pypaswas):
        '''
        Sends the target and query sequences to the SmithWaterman instance
        and collects the resulting hits.

        The targets are handled in batches: ``set_targets`` loads the
        targets starting at the current index and returns the index at
        which the next batch starts.

        @param records_seqs: the query records
        @param targets: the target records; an empty collection is allowed
            and leaves ``self.hitlist`` untouched
        @param pypaswas: accepted for interface compatibility, not used here
        @return: ``self.hitlist``, extended with the hits of every batch
        '''
        # step through the targets
        self.logger.debug('Aligner processing...')
        target_index = 0
        # Number of hits of the most recent batch. It is kept in its own
        # variable because the batch result is never bound when `targets` is
        # empty, which made the closing log statement raise UnboundLocalError.
        last_batch_hits = 0
        while target_index < len(targets):
            self.logger.debug('At target: {0} of {1}'.format(target_index, len(targets)))
            last_target_index = self.smith_waterman.set_targets(targets, target_index)
            # results should be a Hitlist()
            results = self.smith_waterman.align_sequences(records_seqs, targets, target_index)
            self.hitlist.extend(results)
            last_batch_hits = len(results.real_hits)
            target_index = last_target_index
        self.logger.debug('Aligner processing OK, returning hitlist ({} + {}).'.format(
            len(self.hitlist.real_hits), last_batch_hits))
        return self.hitlist