def align(self, input_):
    """Align the given sequences with the clustalw executable.

    The input is passed through self._fix_input, written to a temporary
    fasta file and handed to clustalw. The fasta output is parsed, passed
    through self._fix_output and wrapped in an Alignment.

    :param input_: the sequences to align, in the form self._fix_input accepts
    :return: an Alignment built from the clustalw output
    :raises InitError: if self.clustalw_exe is not set
    :raises RuntimeError: if clustalw exits with a nonzero status, the
                          message contains its stderr output
    """
    if self.clustalw_exe is None:
        raise InitError("clustalw executable is not set")

    input_ = self._fix_input(input_)

    temp_paths = []
    try:
        # mkstemp() creates the files atomically, unlike the race-prone
        # mktemp() which only returns a name that somebody else may claim.
        for _ in range(2):
            fd, path = tempfile.mkstemp()
            os.close(fd)
            temp_paths.append(path)
        input_path, output_path = temp_paths

        write_fasta(input_path, input_)

        cmd = [self.clustalw_exe, '-TYPE=PROTEIN', '-OUTPUT=FASTA',
               '-PWMATRIX=BLOSUM', '-OUTFILE=%s' % output_path,
               '-INFILE=%s' % input_path]

        p = subprocess.Popen(cmd, stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE)
        # communicate() drains both pipes while waiting. A plain wait() can
        # block forever once the child fills one of the pipe buffers.
        _, stderr_data = p.communicate()
        if p.returncode != 0:
            raise RuntimeError("%s for %s" % (
                stderr_data.decode('ascii', 'replace'), str(input_)))

        return Alignment(self._fix_output(parse_fasta(output_path)))
    finally:
        for path in temp_paths:
            if os.path.isfile(path):
                os.remove(path)
def get_domain_ranges(self, sequence):
    """Run an interpro job for the sequence and return the parsed ranges.

    The job is submitted and polled until it leaves the in-progress states
    or until self.job_timout (attribute name as spelled in this class) has
    elapsed. A job reported as 'NOT_FOUND' is submitted again.

    :param sequence: the sequence to submit
    :return: the result of self._parse_interpro_ranges on the job's output
    :raises InitError: if self.url is not set
    :raises ServiceError: if the job timed out, failed or ended in any
                          status other than 'FINISHED'
    """
    if self.url is None:
        raise InitError("interpro url is not set")

    in_progress = ('RUNNING', 'PENDING', 'STARTED')

    job_id = self._interpro_submit(sequence)

    # Stays None when the timeout leaves no room for a single poll.
    status = None
    t0 = time()
    while (time() - t0) < self.job_timout:
        status = self._interpro_status(job_id)
        if status in in_progress:
            sleep(self.poll_interval)
        elif status == 'NOT_FOUND':
            job_id = self._interpro_submit(sequence)
        else:
            break

    # Leaving the loop while the job is still pending, started or running
    # (or without ever having polled) means that the time ran out.
    if status is None or status in in_progress:
        raise ServiceError("inteproscan job timed out")
    elif status in ['FAILURE', 'ERROR']:
        raise ServiceError(self._interpro_error(job_id))
    elif status != 'FINISHED':
        raise ServiceError("inteproscan job status = " + status)

    xml_str = self._interpro_result(job_id)
    return self._parse_interpro_ranges(xml_str)
def _get_hits(self, range_, template_id):
    """Blast the range's sub-sequence and return the acceptable alignments.

    :param range_: the sequence range whose sub-sequence is blasted
    :param template_id: when not None, only hits on this template are kept
                        (and the blacklist is not consulted)
    :return: list of alignments that passed all the filters
    :raises InitError: if self.template_blast_databank is not set
    """
    databank = self.template_blast_databank
    if databank is None:
        raise InitError("blast databank is not set")

    hits_by_id = blaster.blastp(range_.get_sub_sequence(), databank)
    _log.debug("{} blast hits to filter".format(len(hits_by_id)))

    accepted = []
    for hit_key in hits_by_id:
        for candidate in hits_by_id[hit_key]:
            # The blast was done on a sub-sequence, so the query numbering
            # of the hit must be shifted back to the full sequence.
            candidate.query_shift_right(range_.start)
            candidate.full_query_sequence = range_.sequence

            accession = candidate.get_hit_accession_code()
            hit_template_id = TemplateID(accession,
                                         candidate.get_hit_chain_id())

            if template_id is None:
                # No template requested: honour the blacklist.
                if blacklister.is_blacklisted(accession):
                    continue
            elif hit_template_id != template_id:
                continue

            if not dssp.has_secondary_structure(hit_template_id):
                continue

            required_identity = get_min_identity(
                candidate.count_aligned_residues())
            if candidate.get_percentage_identity() >= required_identity:
                accepted.append(candidate)

    return accepted
def blastp(self, sequence, databank):
    """Run blastp for the sequence against the databank.

    The sequence is written to a temporary fasta file under the id
    'target', blastp is run with xml output ('-outfmt 5') and the xml is
    handed to self._parse_alignments.

    :param sequence: the query sequence
    :param databank: value passed to blastp's '-db' option
    :return: the result of self._parse_alignments
    :raises InitError: if self.blastp_exe is not set
    :raises RuntimeError: if blastp exits with a nonzero status, the
                          message contains its stderr output
    """
    if self.blastp_exe is None:
        raise InitError("blastp executable is not set")

    temp_paths = []
    try:
        # mkstemp() creates the files atomically, unlike the race-prone
        # mktemp() which only returns a name that somebody else may claim.
        for _ in range(2):
            fd, path = tempfile.mkstemp()
            os.close(fd)
            temp_paths.append(path)
        input_path, output_path = temp_paths

        write_fasta(input_path, {'target': sequence})

        cmd = [
            self.blastp_exe, '-query', input_path, '-db', databank,
            '-outfmt', '5', '-out', output_path
        ]
        _log.debug("{}".format(cmd))

        p = subprocess.Popen(cmd, stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE)
        # communicate() drains both pipes while waiting. A plain wait() can
        # block forever once the child fills one of the pipe buffers.
        _, stderr_data = p.communicate()
        if p.returncode != 0:
            # Decode, so that the message is text rather than raw bytes.
            raise RuntimeError(stderr_data.decode('ascii', 'replace'))

        with open(output_path, 'r') as f:
            xml_str = f.read()
    finally:
        for path in temp_paths:
            if os.path.isfile(path):
                os.remove(path)

    return self._parse_alignments(xml_str, sequence, databank)
def create_model(target_sequence, target_species_id, require_resnum=None, chosen_template_id=None):
    """Return a model for the target, building one if none is stored yet.

    The whole search runs under a file lock whose name is derived from the
    sequence id, species id, residue number and template id. If
    model_storage already lists matching models, the best one of those is
    returned. Otherwise the domain alignments are computed and the best
    one is passed to the modeler.

    :param target_sequence: sequence to model
    :param target_species_id: species id, converted to upper case
    :param require_resnum: optional residue number, passed on to the
                           storage, aligner and modeler
    :param chosen_template_id: optional template id, passed on to the
                               storage and aligner
    :return: the result of select_best_model or modeler.build_model, or
             None when there are no domain alignments
    :raises InitError: if model_storage.model_dir is not set
    """
    target_species_id = target_species_id.upper()

    sequence_id = model_storage.get_sequence_id(target_sequence)
    lock_name = "lock_search_%s_%s_%s_%s" % (sequence_id, target_species_id,
                                             str(require_resnum),
                                             str(chosen_template_id))

    if model_storage.model_dir is None:
        raise InitError("model directory is not set")
    lock_path = os.path.join(model_storage.model_dir, lock_name)

    with FileLock(lock_path):
        model_paths = model_storage.list_models(target_sequence,
                                                target_species_id,
                                                require_resnum,
                                                chosen_template_id)
        if len(model_paths) > 0:
            return select_best_model(model_paths, target_sequence,
                                     require_resnum)

        ModelLogger.get_current().clear()

        domain_alignments = domain_aligner.get_domain_alignments(
            target_sequence, require_resnum, chosen_template_id)
        if len(domain_alignments) <= 0:
            # Logger.warn is a deprecated alias, warning() is the
            # supported spelling.
            _log.warning("no domain alignments for target={} resnum={} template={}"
                         .format(target_sequence, require_resnum, chosen_template_id))
            return None

        domain_alignment = select_best_domain_alignment(domain_alignments)
        return modeler.build_model(target_sequence, target_species_id,
                                   domain_alignment, require_resnum)
def blacklist(self, pdbid):
    """Append the pdbid to the blacklist file, unless it is already listed.

    :param pdbid: the identifier to add, written on a line of its own
    :raises InitError: if self.file_path is not set
    """
    if self.file_path is None:
        raise InitError("blacklist file not set")

    # is_blacklisted is a method of this object, so it must be called
    # through self. The bare name does not resolve to it.
    if not self.is_blacklisted(pdbid):
        with open(self.file_path, 'a') as f:
            f.write('%s\n' % pdbid)
def list_all_models(self):
    """List the paths of all '.tgz' files in the model directory.

    Files with '_error' in their name are left out.

    :return: list of paths
    :raises InitError: if self.model_dir is not set
    """
    if self.model_dir is None:
        raise InitError("model directory is not set")

    wildcard = os.path.join(self.model_dir, "*.tgz")

    # Test the file name only. Testing the whole path would hide every
    # model as soon as the directory name itself contains '_error'.
    return [path for path in glob(wildcard)
            if '_error' not in os.path.basename(path)]
def list_models(self, target_sequence, species_id, required_resnum=None, template_id=None):
    """List the stored model archives for a target sequence and species.

    File names are matched against '<sequence id>_<SPECIES>_*.tgz', or
    '<sequence id>_<SPECIES>_*_<pdbid>-<chain>.tgz' when a template is
    given. The letters of the pdbid are matched case-insensitively. Files
    with '_error' in their name are left out.

    :param target_sequence: sequence, translated by self.get_sequence_id
    :param species_id: species id, converted to upper case
    :param required_resnum: when not None, only paths for which
                            self.model_covers returns true are kept
    :param template_id: optional object with 'pdbid' and 'chain_id'
    :return: list of matching paths
    :raises InitError: if self.model_dir is not set or is not a directory
    """
    if self.model_dir is None:
        raise InitError("model directory is not set")
    elif not os.path.isdir(self.model_dir):
        raise InitError("No such directory: {}".format(self.model_dir))

    sequence_id = self.get_sequence_id(target_sequence)
    species_id = species_id.upper()

    if template_id is None:
        wildcard = "%s_%s_*.tgz" % (sequence_id, species_id)
    else:
        # Turn every letter into a two-character glob class, e.g.
        # 'a' -> '[aA]', so that the pdbid matches in either case.
        case_insensitive_pdbid = ''.join(
            '[%s%s]' % (char.lower(), char.upper()) if char.isalpha() else char
            for char in template_id.pdbid)
        wildcard = "%s_%s_*_%s-%s.tgz" % (sequence_id, species_id,
                                          case_insensitive_pdbid,
                                          template_id.chain_id)
    wildcard = os.path.join(self.model_dir, wildcard)

    # Test the file name only. Testing the whole path would hide every
    # model as soon as the directory name itself contains '_error'.
    paths = [path for path in glob(wildcard)
             if '_error' not in os.path.basename(path)]

    if required_resnum is None:
        return paths

    return [path for path in paths
            if self.model_covers(path, target_sequence, required_resnum)]
def is_blacklisted(self, pdbid):
    """Tell whether the pdbid occurs in the blacklist file.

    The file is read as whitespace-separated entries. A missing file
    counts as an empty blacklist.

    :param pdbid: the identifier to look up
    :return: True if the pdbid is one of the entries, False otherwise
    :raises InitError: if self.file_path is not set
    """
    blacklist_path = self.file_path
    if blacklist_path is None:
        raise InitError("blacklist file not set")

    if not os.path.isfile(blacklist_path):
        return False

    with open(blacklist_path, 'r') as handle:
        entries = handle.read().split()
    return pdbid in entries
def get_model_lock(self, main_target_sequence, target_species_id, main_domain_alignment, template_id):
    """Create the file lock that belongs to one particular model.

    The lock file lives in the model directory and is named 'lock_model_'
    followed by the name that self.get_model_name gives for the arguments.

    :return: a FileLock on that path
    :raises InitError: if self.model_dir is not set
    """
    model_dir = self.model_dir
    if model_dir is None:
        raise InitError("model directory is not set")

    model_name = self.get_model_name(main_target_sequence,
                                     target_species_id,
                                     main_domain_alignment,
                                     template_id)
    return FileLock(os.path.join(model_dir, 'lock_model_' + model_name))
def get_sequence(self, ac):
    """Look up a sequence by accession code in the configured fasta files.

    The files in self.fasta_paths are scanned in order. An entry matches
    when the second '|'-separated field of its id equals the accession
    code.

    :param ac: the accession code to search for
    :return: the sequence of the first matching entry
    :raises InitError: if self.fasta_paths is not set
    :raises ValueError: if no entry matches
    """
    if self.fasta_paths is None:
        raise InitError("fasta paths not set")

    for path in self.fasta_paths:
        with FastaIterator(path) as entries:
            for entry_id, entry_sequence in entries:
                fields = entry_id.split('|')
                if fields[1] == ac:
                    return entry_sequence

    raise ValueError("sequence not found in uniprot: {}".format(ac))
def build_model(self, main_target_sequence, target_species_id, main_domain_alignment, require_resnum=None):
    """Build the model for a domain alignment and return its archive path.

    The work is done under the model's lock. When the archive already
    exists, its path is returned right away. When the template sequence
    equals the aligned template and the identity is 100%, the template is
    wrapped as is, without modeling.

    :param main_target_sequence: the sequence to model
    :param target_species_id: species id of the target
    :param main_domain_alignment: alignment of the target with the template,
                                  its 'template_id' selects the template
    :param require_resnum: optional residue number, passed on to the
                           alignment and modeling steps
    :return: path of the model archive
    :raises InitError: if a model must be built and self.yasara_dir is not set
    """
    ModelLogger.get_current().add(
        "building model with sequence {}, species {}, alignment {} and resnum {}"
        .format(main_target_sequence, target_species_id, main_domain_alignment, require_resnum))

    template_id = main_domain_alignment.template_id

    tar_path = model_storage.get_tar_path(main_target_sequence,
                                          target_species_id,
                                          main_domain_alignment,
                                          template_id)
    model_lock = model_storage.get_model_lock(main_target_sequence,
                                              target_species_id,
                                              main_domain_alignment,
                                              template_id)
    with model_lock:
        # Somebody else may have built it already.
        if os.path.isfile(tar_path):
            return tar_path

        if self.yasara_dir is None:
            raise InitError("yasara dir is not set")

        with ModelingContext(self.yasara_dir) as context:
            self._prepare_template(context, template_id.pdbid)

            # An identical template needs no modeling at all:
            template_is_target = (
                main_domain_alignment.get_template_sequence() ==
                context.get_sequence(template_id.chain_id) and
                main_domain_alignment.get_percentage_identity() >= 100.0)
            if template_is_target:
                main_domain_alignment.target_id = \
                    model_storage.get_sequence_id(main_target_sequence)
                return self._wrap_template(main_target_sequence,
                                           target_species_id,
                                           main_domain_alignment,
                                           template_id)

            context.set_main_target(main_target_sequence, target_species_id,
                                    template_id.chain_id)

            chain_alignments = self._make_alignments(main_target_sequence,
                                                     target_species_id,
                                                     main_domain_alignment,
                                                     context, require_resnum)

            # Chains without an alignment are not part of the model:
            for template_chain in context.get_chain_ids():
                if template_chain not in chain_alignments:
                    context.delete_chain(template_chain)

            _log.debug("final alignments: {}".format(
                [(remaining, chain_alignments[remaining])
                 for remaining in context.get_chain_ids()]))
            _log.debug("final template {} {}".format(
                context.template_pdbid,
                [(remaining, context.get_sequence(remaining))
                 for remaining in context.get_chain_ids()]))

            tar_path = self._model_run(main_domain_alignment,
                                       chain_alignments, context,
                                       main_target_sequence, require_resnum)

        return tar_path
def _get_hits(self, range_, template_id):
    """Blast the range's sub-sequence and re-align the usable hits with kmad.

    Every blast hit that passes the template, blacklist and secondary
    structure filters gets its alignment replaced by a kmad alignment of
    the sub-sequence against the complete template sequence. Hits for
    which kmad fails are skipped.

    :param range_: the sequence range whose sub-sequence is searched
    :param template_id: when not None, only hits on this template are kept
                        (and the blacklist is not consulted)
    :return: list of alignments with sufficient identity; an empty list
             when a template was requested but blast did not hit it
    :raises InitError: if self.template_blast_databank is not set
    """
    if self.template_blast_databank is None:
        raise InitError("blast databank is not set")

    sub_sequence = range_.get_sub_sequence()

    blast_hits = blaster.blastp(sub_sequence, self.template_blast_databank)
    _log.debug("{} blast hits to filter".format(len(blast_hits)))

    count_template_hits = 0
    good_hits = []
    for hit_id in blast_hits:
        for alignment in blast_hits[hit_id]:
            hit_template_id = TemplateID(alignment.get_hit_accession_code(),
                                         alignment.get_hit_chain_id())

            if template_id is not None and hit_template_id != template_id:
                continue
            count_template_hits += 1

            if template_id is None and \
                    blacklister.is_blacklisted(alignment.get_hit_accession_code()):
                continue

            if not dssp.has_secondary_structure(hit_template_id):
                continue

            # Replace the blast hit's alignment with the kmad alignment.
            template_secstr = dssp.get_secondary_structure(hit_template_id)
            template_sequence = dssp.get_sequence(hit_template_id)
            try:
                kmad_alignment = kmad_aligner.align(template_sequence,
                                                    template_secstr,
                                                    sub_sequence)
            except Exception:
                # If kmad fails, then skip this one. Catching Exception
                # rather than everything lets KeyboardInterrupt and
                # SystemExit pass through.
                _log.warning(traceback.format_exc())
                continue

            # The kmad alignment spans the whole sub-sequence and the
            # whole template sequence, so the positions are set to match.
            alignment.full_query_sequence = range_.sequence
            alignment.query_start = range_.start + 1
            alignment.query_end = range_.end
            alignment.subject_start = 1
            alignment.subject_end = len(template_sequence)
            alignment.query_alignment = kmad_alignment.target_alignment
            alignment.subject_alignment = kmad_alignment.template_alignment

            if alignment.get_percentage_identity() >= \
                    get_min_identity(alignment.count_aligned_residues()):
                good_hits.append(alignment)

    if count_template_hits == 0 and template_id is not None:
        _log.warning("domain sequence {} has no suitable hits with {}".format(sub_sequence, template_id))
        return []

    return good_hits
def get_sequence(self, ac):
    """Download the fasta entry for an accession code and return its sequence.

    The entry is fetched from '<self.url>/<ac>.fasta'.

    :param ac: the accession code
    :return: the first value of the parsed fasta
    :raises InitError: if self.url is not set
    :raises: whatever the response's raise_for_status() raises for a
             non-successful status other than 500
    """
    if self.url is None:
        raise InitError("uniprot url is not set")

    fasta_url = self.url + '/' + ac + '.fasta'
    _log.debug(fasta_url)

    r = requests.get(fasta_url)
    # NOTE(review): status 500 is retried immediately and without limit,
    # so a persistently failing server keeps this loop spinning. Consider
    # a bounded number of attempts with a pause in between.
    while r.status_code == 500:
        r = requests.get(fasta_url)
    r.raise_for_status()

    fa = parse_fasta_from_string(r.text)

    # On python 3 values() returns a view that cannot be indexed, so
    # materialize it first. The result on python 2 is unchanged.
    return list(fa.values())[0]
def _filter_forbidden_ranges(self, ranges):
    """Drop the forbidden ranges and everything that overlaps with them.

    A range is forbidden when its 'ac' is in
    self.forbidden_interpro_domains. Since overlaps_with is also evaluated
    between a forbidden range and itself, the forbidden ranges are
    presumably dropped through that same test.

    :param ranges: the ranges to filter, each must have an 'ac' attribute
                   and an 'overlaps_with' method
    :return: list of the ranges that overlap with no forbidden range, in
             their original order
    :raises InitError: if self.forbidden_interpro_domains is not set
    """
    if self.forbidden_interpro_domains is None:
        raise InitError("forbidden ranges not set")

    forbidden = [range_ for range_ in ranges
                 if range_.ac in self.forbidden_interpro_domains]

    # any() works on python 2 and 3 alike. Taking len() of a filter()
    # result only works on python 2, where filter returns a list.
    return [range_ for range_ in ranges
            if not any(r.overlaps_with(range_) for r in forbidden)]
def _merge_similar_ranges(self, ranges):
    """Merge overlapping ranges that are similar enough to each other.

    Two overlapping ranges are merged when their percentage overlap is
    above self.similar_ranges_min_overlap_percentage and their length
    difference, as a percentage of the longer one, is below
    self.similar_ranges_max_length_difference_percentage.

    :param ranges: the ranges to merge, the input list is not modified
    :return: the result of self._remove_duplicate_ranges on the merged list
    :raises InitError: if one of the two percentages is not set
    """
    # Function-scope import, so that this block does not depend on the
    # file's import section.
    from functools import cmp_to_key

    if self.similar_ranges_min_overlap_percentage is None or \
            self.similar_ranges_max_length_difference_percentage is None:
        raise InitError("similar range percentages not set")

    # sorted() has no 'cmp' argument on python 3. cmp_to_key() gives the
    # same ordering on python 2 and 3.
    # NOTE(review): is_left_from is used as a cmp function here. If it
    # returns a boolean rather than a negative/zero/positive number, this
    # sort never reorders anything - confirm the intent.
    ranges = sorted(ranges,
                    key=cmp_to_key(lambda r1, r2: r1.is_left_from(r2)))

    i = 0
    while i < len(ranges):
        overlapping_indices = [j for j in range(i + 1, len(ranges))
                               if ranges[j].overlaps_with(ranges[i])]

        # important, rightmost must go first!
        # Because we're going to remove ranges from the list.
        for j in reversed(overlapping_indices):
            percentage_overlap = ranges[i].get_percentage_overlap(ranges[j])

            length_i = ranges[i].get_length()
            length_j = ranges[j].get_length()
            longest = max(length_i, length_j)
            # Two empty ranges do not differ in length. Without this
            # guard they would cause a division by zero.
            if longest > 0:
                percentage_length_difference = \
                    (100.0 * abs(length_i - length_j)) / longest
            else:
                percentage_length_difference = 0.0

            if percentage_overlap > self.similar_ranges_min_overlap_percentage and \
                    percentage_length_difference < self.similar_ranges_max_length_difference_percentage:
                # Replace the two ranges by a single merged one:
                _log.debug(
                    "merging {} with {}, they have {} % length difference".
                    format(ranges[i], ranges[j],
                           percentage_length_difference))
                merged = ranges[i].merge_with(ranges[j])
                ranges = (ranges[:i] + [merged] + ranges[i + 1:j] +
                          ranges[j + 1:])
        i += 1

    # Make list shorter to save time:
    ranges = self._remove_duplicate_ranges(ranges)
    return ranges
def _find_target_sequences(self, template_chain_sequence, target_species_id):
    """Find the sequences of a species that closely match a template chain.

    The chain sequence is blasted against self.uniprot_databank. Only hits
    whose id ends with '_<SPECIES>' are considered, and of those only the
    alignments with more than 70% identity and more than 90% coverage.

    :param template_chain_sequence: the sequence to blast
    :param target_species_id: species id, compared in upper case
    :return: dictionary of accession code to sequence, the sequences are
             looked up with uniprot.get_sequence
    :raises InitError: if self.uniprot_databank is not set
    """
    if self.uniprot_databank is None:
        raise InitError("species databank dir not set")

    matches = {}

    blast_results = blaster.blastp(template_chain_sequence,
                                   self.uniprot_databank)
    for entry_id in blast_results:
        if not entry_id.endswith('_' + target_species_id.upper()):
            continue  # other species

        for aligned in blast_results[entry_id]:
            accession = aligned.get_hit_accession_code()
            identity = aligned.get_percentage_identity()
            coverage = aligned.get_percentage_coverage()

            if identity > 70.0 and coverage > 90.0:
                matches[accession] = uniprot.get_sequence(accession)

    return matches
def blastp(self, sequence, databank):
    """Run blastp for the sequence against the databank.

    The sequence is written to a temporary fasta file under the id
    'target', blastp is run from the root directory with xml output
    ('-outfmt 5') and the xml is handed to self._parse_alignments.

    :param sequence: the query sequence
    :param databank: value passed to blastp's '-db' option
    :return: the result of self._parse_alignments
    :raises InitError: if self.blastp_exe is not set
    :raises RecoverableError: if blastp reports that it found no alias or
                              index file for the databank
    :raises RuntimeError: if blastp exits with a nonzero status for any
                          other reason
    """
    if self.blastp_exe is None:
        raise InitError("blastp executable is not set")

    temp_paths = []
    try:
        # mkstemp() creates the files atomically, unlike the race-prone
        # mktemp() which only returns a name that somebody else may claim.
        for _ in range(2):
            fd, path = tempfile.mkstemp()
            os.close(fd)
            temp_paths.append(path)
        input_path, output_path = temp_paths

        write_fasta(input_path, {'target': sequence})

        cmd = [
            self.blastp_exe, '-query', input_path, '-db', databank,
            '-outfmt', '5', '-out', output_path
        ]
        _log.debug("{}".format(cmd))

        p = subprocess.Popen(cmd, stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE, cwd='/')
        # communicate() drains both pipes while waiting. A plain wait() can
        # block forever once the child fills one of the pipe buffers.
        _, stderr_data = p.communicate()
        if p.returncode != 0:
            err_msg = stderr_data.decode('ascii', 'replace')
            if err_msg.startswith(
                    "BLAST Database error: No alias or index file found for protein database"
            ):
                raise RecoverableError(err_msg)

            raise RuntimeError("%s for databank %s, sequence %s" %
                               (err_msg, databank, sequence))

        with open(output_path, 'r') as f:
            xml_str = f.read()
    finally:
        for path in temp_paths:
            if os.path.isfile(path):
                os.remove(path)

    return self._parse_alignments(xml_str, sequence, databank)
def list_models(self, target_sequence, species_id, required_resnum=None, template_id=None):
    """List the stored model archives for a target sequence and species.

    File names are matched against '<sequence id>_<SPECIES>_*.tgz', or
    '<sequence id>_<SPECIES>_*_<pdbid>-<chain>.tgz' when a template is
    given. Files with '_error' in their name are left out.

    :param target_sequence: sequence, translated by self.get_sequence_id
    :param species_id: species id, converted to upper case
    :param required_resnum: when not None, only models whose
                            '<start>-<end>' name field includes this
                            number are kept
    :param template_id: optional object with 'pdbid' and 'chain_id'
    :return: list of matching paths
    :raises InitError: if self.model_dir is not set
    """
    if self.model_dir is None:
        raise InitError("model directory is not set")

    sequence_id = self.get_sequence_id(target_sequence)
    species_id = species_id.upper()

    if template_id is None:
        wildcard = "%s_%s_*.tgz" % (sequence_id, species_id)
    else:
        wildcard = "%s_%s_*_%s-%s.tgz" % (sequence_id, species_id,
                                          template_id.pdbid,
                                          template_id.chain_id)
    wildcard = os.path.join(self.model_dir, wildcard)

    # Test the file name only. Testing the whole path would hide every
    # model as soon as the directory name itself contains '_error'.
    paths = [path for path in glob(wildcard)
             if '_error' not in os.path.basename(path)]

    if required_resnum is None:
        return paths

    matching_paths = []
    for path in paths:
        # The third '_'-separated field of the file name is read as
        # '<start>-<end>'.
        name = os.path.splitext(os.path.basename(path))[0]
        start, end = name.split('_')[2].split('-')
        if int(start) <= required_resnum <= int(end):
            matching_paths.append(path)
    return matching_paths
def _prepare_context(self, template_pdbid):
    """Create a modeling context with a prepared template in it.

    The template is initialized, oligomerized, extended with symmetry
    residues, stripped of solvent residues, fixed and cleaned.

    :param template_pdbid: identifier of the template to load
    :return: the ModelingContext that holds the prepared template
    :raises InitError: if self.yasara_dir is not set
    """
    if self.yasara_dir is None:
        raise InitError("yasara dir is not set")

    context = ModelingContext(self.yasara_dir)

    self._init_template(template_pdbid, context)
    try:
        self._oligomerize_template(context)
    except Exception:
        # Oligomerization is best effort: on failure, start over with a
        # freshly initialized template. Catching Exception rather than
        # everything lets KeyboardInterrupt and SystemExit pass through.
        self._init_template(template_pdbid, context)

    # NOTE(review): the two try blocks are taken to be sequential, the
    # source formatting did not show whether this one was nested in the
    # handler above - confirm.
    try:
        self._build_template_symmetry_residues(context)
    except Exception:
        # Deliberately ignored: symmetry residues are optional.
        pass

    self._delete_solvent_residues(context)
    self._fix_template_errors(context)
    context.yasara.CleanObj(context.template_obj)

    return context
def _run_kmad(self, input_path, output_path, gap_open, gap_extend, modifier):
    """Run the kmad executable on an input file.

    :param input_path: passed to kmad's '-i' option
    :param output_path: passed to kmad's '-o' option
    :param gap_open: passed to '-g', formatted with one decimal
    :param gap_extend: passed to '-e', formatted with one decimal
    :param modifier: passed to '-s', formatted with one decimal
    :raises InitError: if self.kmad_exe is not set
    :raises RuntimeError: if kmad exits with a nonzero status, the message
                          is its stderr output
    """
    if self.kmad_exe is None:
        raise InitError("kmad executable is not set")

    cmd = [
        self.kmad_exe, '-i', input_path, '-o', output_path,
        '-g', '%.1f' % gap_open,
        '-e', '%.1f' % gap_extend,
        '-s', '%.1f' % modifier,
        '-c', '4'
    ]
    _log.debug(cmd)

    p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    # communicate() drains both pipes while waiting. A plain wait() can
    # block forever once the child fills one of the pipe buffers.
    _, stderr_data = p.communicate()
    if p.returncode != 0:
        raise RuntimeError(stderr_data.decode('ascii', 'replace'))
def get_tar_path_from_name(self, name):
    """Give the path of the '.tgz' archive with this name in the model directory.

    :param name: the archive's file name without extension
    :return: the model directory joined with '<name>.tgz'
    :raises InitError: if self.model_dir is not set
    """
    model_dir = self.model_dir
    if model_dir is None:
        raise InitError("model directory is not set")

    file_name = name + '.tgz'
    return os.path.join(model_dir, file_name)
def get_domain_alignments(self, target_sequence, require_resnum=None, template_id=None):
    """Find the best template alignment for each domain range of the target.

    The interpro ranges of the target (minus the forbidden ones) plus the
    range of the whole sequence are sampled, largest first. For every
    range the hits from self._get_hits are examined. The best hit with
    sufficient coverage is kept, or else an OK hit with lower coverage as
    a last resort. Sampling is repeated with the ranges that
    self._clean_search_space returns, until none are left.

    :param target_sequence: the sequence to find alignments for
    :param require_resnum: when not None, only ranges and hits that cover
                           this residue number are considered
    :param template_id: when not None, passed to self._get_hits to
                        restrict the search to this template
    :return: list of DomainAlignment objects
    :raises InitError: if self.min_percentage_coverage is not set
    """
    if self.min_percentage_coverage is None:
        raise InitError("min percentage coverage is not set")

    interpro_ranges = interpro.get_domain_ranges(target_sequence)
    _log.debug("{} ranges from interpro".format(len(interpro_ranges)))

    sample_ranges = self._filter_forbidden_ranges(interpro_ranges)

    if require_resnum is not None:
        # A list rather than filter(): on python 3 a filter object
        # supports neither len() nor append().
        sample_ranges = [r for r in sample_ranges
                         if r.includes_residue(require_resnum)]
        _log.debug("{} ranges have residue {}".format(
            len(sample_ranges), require_resnum))

    # Add the whole sequence as a range too:
    sample_ranges.append(
        SequenceRange(0, len(target_sequence), target_sequence))

    ok_ranges_alignments = {}
    best_ranges_alignments = {}
    checked_ranges = []
    while len(sample_ranges) > 0:
        merged_sample_ranges = self._merge_similar_ranges(sample_ranges)
        _log.debug("sampling {} ranges".format(len(merged_sample_ranges)))

        # Check the largest ranges first. If that yields, then the smaller ones don't matter.
        for range_ in sorted(merged_sample_ranges,
                             key=lambda r: r.get_length(), reverse=True):
            if range_ in checked_ranges:
                continue  # already passed this one
            checked_ranges.append(range_)

            if any(r.encloses(range_) for r in best_ranges_alignments):
                continue  # we already have a larger enclosing range

            # These can differ per range:
            best_hit = None
            last_resort_hit = None

            hit_candidates = self._get_hits(range_, template_id)
            _log.debug('trying range: {} against {} hits'.format(
                range_, len(hit_candidates)))

            for hit_candidate in hit_candidates:
                hit_range = hit_candidate.get_query_range()

                if require_resnum is not None:
                    if not hit_candidate.is_query_residue_covered(
                            require_resnum):
                        _log.debug(
                            "hit with {} on {} does not cover residue {}".
                            format(hit_candidate.get_hit_accession_code(),
                                   hit_range, require_resnum))
                        continue

                if self._alignment_ok_for_range(range_, hit_candidate):
                    _log.debug("hit with {} {} is ok".format(
                        hit_candidate.get_hit_accession_code(), hit_range))

                    # This range made an OK alignment, so at least store it for later usage.
                    # The hit's template gets a name of its own: assigning
                    # it to the 'template_id' argument would restrict the
                    # _get_hits search for every following range to this
                    # one template.
                    hit_template_id = TemplateID(
                        hit_candidate.get_hit_accession_code(),
                        hit_candidate.get_hit_chain_id())
                    ok_ranges_alignments[hit_range] = DomainAlignment(
                        hit_candidate.query_alignment,
                        hit_candidate.subject_alignment,
                        hit_range, hit_template_id)

                    if hit_candidate.get_percentage_coverage() > \
                            self.min_percentage_coverage:
                        _log.debug(
                            "coverage is high enough for {} {}".format(
                                hit_candidate.get_hit_accession_code(),
                                hit_range))

                        if best_hit is None or self._is_better_than(
                                hit_candidate, best_hit):
                            _log.debug("{} is better than {}".format(
                                hit_candidate, best_hit))
                            best_hit = hit_candidate
                    else:
                        last_resort_hit = hit_candidate

            if best_hit is None:
                best_hit = last_resort_hit

            if best_hit is not None:
                # Remove any smaller ranges that this one encloses:
                best_ranges_alignments = self._remove_enclosing(
                    range_, best_ranges_alignments)

                best_template_id = TemplateID(
                    best_hit.get_hit_accession_code(),
                    best_hit.get_hit_chain_id())
                hit_range = best_hit.get_query_range()

                _log.debug(
                    "passing best hit with template {} with range {}".
                    format(best_template_id, hit_range))
                best_ranges_alignments[hit_range] = DomainAlignment(
                    best_hit.query_alignment,
                    best_hit.subject_alignment,
                    hit_range, best_template_id)
            else:
                _log.debug("no hit for range {}".format(range_))

        # After iterating the sample ranges, prepare for the next round:
        sample_ranges = self._clean_search_space(checked_ranges,
                                                 sample_ranges,
                                                 ok_ranges_alignments)

    # A real list, like on python 2. On python 3 values() is only a view.
    return list(best_ranges_alignments.values())