def _run_hhsearch(self, sequence, min_prob=0.5):
    """Run HHsearch on ``sequence`` and summarise the non-overlapping hits.

    The sequence is written to a temporary FASTA file and searched with the
    executable at ``self.hhsearch_loc`` against
    ``{self._path}/utils/hhdb/core`` (invoked with ``-n 1``).  The ``.hhr``
    report that hhsearch writes next to the input file is parsed and then
    removed.

    :param sequence: query sequence as a string.
    :param min_prob: hits whose probability is below this value are ignored.
    :return: ``(hits_nr, probs)``.  ``hits_nr`` maps a running integer key to
        ``(qstart, qend, probability)``; among hits that overlap on the
        query only the most probable one is kept.  ``probs`` is a list with
        one entry per residue of ``sequence`` holding the probability of the
        hit covering that residue (0 where there is none).  When hhsearch
        exits with a non-zero status the result is ``({}, ())``.
    """
    # The context manager removes the query file even if launching the
    # search raises; flush() makes the content visible to the child process.
    with tempfile.NamedTemporaryFile(mode='w+t') as temp:
        temp.write(">seq\n{}\n".format(sequence))
        temp.flush()
        fn = temp.name
        cmd = f'{self.hhsearch_loc} -i {fn} -d {self._path}/utils/hhdb/core -n 1'
        # The tool's console output is not needed.  Unread PIPEs can fill up
        # and block the child, so send both streams to DEVNULL instead.
        result = subprocess.call(cmd, shell=True,
                                 stdout=subprocess.DEVNULL,
                                 stderr=subprocess.DEVNULL)
    if result != 0:
        return {}, ()

    out_fn = f'{fn}.hhr'
    try:
        hits = [(hit.qstart, hit.qend, hit.probability)
                for hit in HHOutputParser().parse_file(out_fn)
                if hit.probability >= min_prob]
    finally:
        # Do not leave the report behind when parsing fails.
        if os.path.exists(out_fn):
            os.remove(out_fn)

    # Choose highest prob hit from overlapping hits
    hits_nr = {}
    for beg, end, prob in hits:
        for key, (kept_beg, kept_end, kept_prob) in hits_nr.items():
            # Two inclusive ranges share a residue iff the larger start does
            # not exceed the smaller end.
            if max(beg, kept_beg) <= min(end, kept_end):
                if prob > kept_prob:
                    hits_nr[key] = (beg, end, prob)
                break
        else:
            hits_nr[len(hits_nr)] = (beg, end, prob)

    probs = [0] * len(sequence)
    for beg, end, prob in hits_nr.values():
        # HHR query coordinates are 1-based and inclusive; convert them to
        # 0-based list indices and clamp to the sequence length.
        for i in range(max(beg, 1) - 1, min(end, len(probs))):
            probs[i] = prob
    return hits_nr, probs
def setUp(self):
    """Parse the ``d1ea0a1.hhr`` fixture both from its path and from its text.

    ``self.hitlist`` holds the result of ``parse_file`` and ``self.hitlist2``
    the result of ``parse_string`` on the same content, both produced by one
    ``HHOutputParser(True)`` instance.
    """
    super(TestHHOutputParser, self).setUp()

    filename = self.config.getTestFile('d1ea0a1.hhr')
    # Read through a context manager so the handle is closed immediately
    # instead of whenever the garbage collector gets to it.
    with open(filename) as stream:
        content = stream.read()

    tmp = HHOutputParser(True)
    self.hitlist = tmp.parse_file(filename)
    self.hitlist2 = tmp.parse_string(content)
def parse_hhr(dir="./"):
    """Collect query-to-hit links from every ``*.hhr`` report matching ``dir``.

    ``dir`` is concatenated directly with ``"*.hhr"`` to form the glob
    pattern, so it has to end with a path separator to address a directory.

    A hit is skipped when its id equals the report's query name or when its
    e-value is greater than ``EVAL_CUTOFF``.

    :param dir: path prefix that is prepended to ``*.hhr``.
    :return: ``(links, keys)`` where ``links`` is a list of
        ``[query_name, hit_id, evalue]`` entries and ``keys`` is a list of
        the distinct names occurring in ``links`` (built from a set, so its
        order is not defined).
    """
    report_paths = glob.glob(dir + "*.hhr")
    reader = HHOutputParser(False)
    names = set()
    links = []
    for path in report_paths:
        report = reader.parse_file(path)
        query = report._query_name
        for hit in report:
            if not (hit._id == query or hit._evalue > EVAL_CUTOFF):
                links.append([query, hit._id, hit._evalue])
                names.update((query, hit._id))
    return links, list(names)
def generate_msa(hhr_file, queryseq, hitslist, maxevalue=1e-3, ident_cut=0.5, qcov_cut=0.5, eval_cut=1e-3):
    """Build a query-anchored multiple alignment from selected hits of an HHR file.

    The result starts with the query as its first row.  Every accepted hit
    contributes one more row: its aligned subject string, shifted to the
    query position given by ``hit.qstart``.  Where a hit has residues opposite
    a gap in the aligned query, a gap column is opened in all rows collected
    so far (unless the query row already has a gap there).

    A hit is accepted when all of the following hold:

    * ``f'{hit.id}_{hit.qstart}_{hit.qend}'`` is contained in ``hitslist``;
    * ``hit.identity >= ident_cut``;
    * the number of non-gap characters of the aligned subject divided by
      ``len(queryseq)`` is at least ``qcov_cut``;
    * ``hit.evalue <= eval_cut``.

    NOTE(review): ``maxevalue`` is accepted but never read in this function.
    NOTE(review): ``ident_cut`` is compared directly with ``hit.identity``;
    confirm both use the same scale (fraction vs. percent).

    :param hhr_file: path handed to ``HHOutputParser(alignments=True).parse_file``.
    :param queryseq: query sequence; becomes row 0 of the alignment.
    :param hitslist: container of hit identifiers of the form
        ``<id>_<qstart>_<qend>``; must not be empty.
    :param maxevalue: unused.
    :param ident_cut: minimum ``hit.identity``.
    :param qcov_cut: minimum coverage ratio as described above.
    :param eval_cut: maximum ``hit.evalue``.
    :return: list of strings, the query row followed by one row per accepted
        hit, all padded with ``-`` on the right to the query row's length.
    :raises AssertionError: if ``hitslist`` is empty.
    """
    assert len(hitslist) > 0, 'provide at least one hit id in `hitlist`'

    # Row 0 is the query; gap columns get inserted into it as hits require.
    fasta = [queryseq]

    for hit in HHOutputParser(alignments=True).parse_file(hhr_file):

        # Hits are selected by id plus query range.
        hit_id = f'{hit.id}_{hit.qstart}_{hit.qend}'
        if not hit_id in hitslist:
            continue

        # Despite its name this counts the non-gap characters of the subject row.
        query_cov = 1.*len(hit.alignment.subject.replace('-', ''))

        if hit.identity < ident_cut:
            continue
        if query_cov / len(queryseq) < qcov_cut:
            continue
        if hit.evalue > eval_cut:
            continue

        temp = ''   # alignment row being built for this hit
        mpos = 0    # current column in the (gapped) query row fasta[0]

        # Left-pad the subject so index i lines up with the query
        # (qstart is treated as 1-based here).
        sbjct = "-"*(hit.qstart-1) + hit.alignment.subject

        # Walk over every position of the padded subject row.
        for i in range(len(sbjct)):
            # No insertion at this position: either we are still in the
            # left padding, or the aligned query has a residue here.
            if i - hit.qstart + 1 < 0 or hit.alignment.query[i - hit.qstart+ 1 ] != '-':
                # Skip gap columns opened by earlier hits, mirroring them in this row.
                while fasta[0][mpos] == '-':
                    mpos = mpos + 1
                    temp = temp + '-'
                temp = temp + str(sbjct[i])
                mpos = mpos + 1
            # Insertion present: the aligned query has a gap at this position.
            else:
                if fasta[0][mpos] != '-':
                    for f in range(len(fasta)):
                        # Open a new gap column at mpos in every existing row.
                        fasta[f] = fasta[f][:mpos] + "-" + fasta[f][mpos:]
                temp = temp + str(sbjct[i])
                mpos = mpos + 1
        fasta.append(temp)

    # Right-pad shorter rows with gaps so every row is as long as the query row.
    for f in range(len(fasta)):
        if len(fasta[f])<len(fasta[0]):
            fasta[f]=fasta[f]+ "-" * (len(fasta[0])-len(fasta[f]))
    return fasta
def get_alignment(self, query: str, no: str) -> HHpredHitAlignment:
    """
    Fetch alignment number 'no' from the HHS result file of 'query'.

    Only the alignment of the fragment region is retrieved, so positions in
    the returned alignment are relative to the fragment: if q_start = 20,
    that amino acid sits at position 0 of aln.query.

    :param query: str. Domain query; resolved to a file via get_FUZZLE_hhs.
    :param no: 1-based position of the hit in the file (converted with int()).
    :return: HHpredHitAlignment between query and subject for the fragment
             region, or None when anything fails (the error is logged).
    """
    hhF = get_FUZZLE_hhs(query)
    try:
        return HHOutputParser().parse_file(hhF)[int(no) - 1].alignment
    except Exception as e:
        logger.error(f"Parsing of {hhF} failed. Error follows: {e}")
        return None
def HHSearch_parseTo_DMandNX(hhrs, labels=None):
    """Turn a set of HHR reports into pairwise matrices and a hit graph.

    Each file in ``hhrs`` is parsed once.  Files that cannot be parsed are
    printed and skipped.  The matrix axes are named by ``clusternames``: the
    distinct query names of the reports when ``labels`` is ``None``, otherwise
    ``labels[k]`` for the k-th entry of ``hhrs``.

    For every hit whose id and query name both lack the substring
    ``'anchor'`` the matrices are updated at ``[hit, query]``:

    * ``evalDM`` / ``pvalDM`` keep the smallest e-value / p-value (init 1);
    * ``scoreDM`` / ``SSDM`` keep the largest score / ss_score (init 1);
    * ``probaDM`` keeps the largest probability (init 0);
    * ``lenDM`` keeps the smallest ``hit.qlength`` (init 1 means "unset");
    * ``covDM`` keeps the minimum of its current value and
      ``qlength / (qend - qstart)`` (init 1) and, unlike the others, is not
      mirrored to ``[query, hit]``.

    Hits between different names additionally become edges of the graph,
    with the current matrix values stored under the edge attribute ``dict``
    and ``len`` / ``file`` stored on the nodes.

    :param hhrs: sequence of HHR file paths.
    :param labels: optional names, indexed like ``hhrs``.
    :return: ``(probaDM, evalDM, pvalDM, lenDM, scoreDM, SSDM, covDM, NX,
        clusternames)``.
    :raises ValueError: if a hit id or query name is not in ``clusternames``.
    :raises ZeroDivisionError: if a hit has ``qend == qstart`` and integer
        coordinates.
    """
    clusternames = []
    parsed = []  # (path, report) for every file that could be read
    for position, hhr in enumerate(hhrs):
        try:
            profile = HHOutputParser(alignments=False).parse_file(hhr)
            parsed.append((hhr, profile))
            if labels is None:
                if profile.query_name not in clusternames:
                    clusternames.append(profile.query_name)
            else:
                clusternames.append(labels[position])
        except Exception:
            # Best effort: report the offending file and carry on, but let
            # KeyboardInterrupt / SystemExit through.
            print(hhr)
    print(clusternames)

    shape = (len(clusternames), len(clusternames))
    evalDM = np.ones(shape)
    pvalDM = np.ones(shape)
    scoreDM = np.ones(shape)
    SSDM = np.ones(shape)
    probaDM = np.zeros(shape)
    lenDM = np.ones(shape)
    covDM = np.ones(shape)
    NX = nx.Graph()

    # Reuse the reports from the first pass instead of parsing again; this
    # also keeps unreadable files from crashing the second pass.
    for hhr, profile in parsed:
        query_name = profile.query_name
        for hit in profile:
            if 'anchor' in hit.id or 'anchor' in query_name:
                continue

            i = clusternames.index(hit.id.strip())
            j = clusternames.index(query_name.strip())

            # NOTE(review): this ratio is >= 1 whenever the aligned span is
            # not longer than the query, so the minimum below never drops
            # under the initial 1.  Confirm the intended coverage formula.
            covq = hit.qlength / (hit.qend - hit.qstart)
            covDM[i, j] = min(covq, covDM[i, j])

            if hit.evalue < evalDM[i, j]:
                evalDM[i, j] = hit.evalue
                evalDM[j, i] = evalDM[i, j]
            if hit.pvalue < pvalDM[i, j]:
                pvalDM[i, j] = hit.pvalue
                pvalDM[j, i] = pvalDM[i, j]
            if scoreDM[i, j] < hit.score:
                scoreDM[i, j] = hit.score
                scoreDM[j, i] = scoreDM[i, j]
            if SSDM[i, j] < hit.ss_score:
                SSDM[i, j] = hit.ss_score
                SSDM[j, i] = SSDM[i, j]
            if probaDM[i, j] < hit.probability:
                probaDM[i, j] = hit.probability
                probaDM[j, i] = probaDM[i, j]
            # use smallest of the two prots
            if lenDM[i, j] == 1 or lenDM[i, j] > hit.qlength:
                lenDM[i, j] = hit.qlength
                lenDM[j, i] = lenDM[i, j]

            if hit.id != query_name:
                dico = {
                    'score': scoreDM[i, j],
                    'prob': probaDM[i, j],
                    'eval': evalDM[i, j],
                    'ss': SSDM[i, j],
                    'length': lenDM[i, j],
                    # NOTE(review): the four position attributes all carry
                    # the coverage value, as in the original code; they look
                    # like they were meant to hold hit coordinates.
                    'qend': covDM[i, j],
                    'qstart': covDM[i, j],
                    'end': covDM[i, j],
                    'start': covDM[i, j],
                }
                NX.add_edge(hit.id, query_name, dict=dico)
                NX.nodes[hit.id]['len'] = hit.slength
                NX.nodes[query_name]['len'] = hit.qlength
                NX.nodes[query_name]['file'] = hhr
    return probaDM, evalDM, pvalDM, lenDM, scoreDM, SSDM, covDM, NX, clusternames
def setUp(self):
    """Load the ``d1ea0a1.hhr`` fixture into ``self.hitlist``.

    The file is located through ``self.config.getTestFile`` and parsed with
    a default-constructed ``HHOutputParser``.
    """
    super(TestHHOutputRegressions, self).setUp()

    sample_path = self.config.getTestFile('d1ea0a1.hhr')
    parser = HHOutputParser()
    self.hitlist = parser.parse_file(sample_path)