Пример #1
0
	def _run_hhsearch(self, sequence, min_prob=0.5):
		"""Run hhsearch on ``sequence`` and keep the best non-overlapping hits.

		The sequence is written to a temporary single-record FASTA file,
		``self.hhsearch_loc`` is invoked against the database at
		``{self._path}/utils/hhdb/core``, and the resulting ``.hhr`` file is
		parsed with ``HHOutputParser`` and then deleted.

		:param sequence: query sequence written verbatim after the ``>seq`` header.
		:param min_prob: hits with ``hit.probability`` below this value are dropped.
		:return: ``(hits_nr, probs)`` where ``hits_nr`` maps a running index to
			``(qstart, qend, probability)`` for each retained hit and ``probs``
			is a list of ``len(sequence)`` per-position probabilities (0 where
			no hit applies). If hhsearch exits with a non-zero status the
			result is ``({}, ())``.
		"""
		# hhsearch reads its query from disk; the temp file disappears as soon
		# as the ``with`` block exits, i.e. right after the search finished.
		with tempfile.NamedTemporaryFile(mode='w+t') as temp:
			temp.write(">seq\n{}\n".format(sequence))
			temp.flush()  # make sure the child process sees the full content
			fn = temp.name
			cmd = f'{self.hhsearch_loc} -i {fn} -d {self._path}/utils/hhdb/core -n 1'
			# The output is never read, so discard it: with PIPE the child can
			# block forever once the OS pipe buffer fills up.
			result = subprocess.call(cmd, shell=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)

		if result != 0:
			return {}, ()

		out_fn = f'{fn}.hhr'
		try:
			hits = [(hit.qstart, hit.qend, hit.probability)
					for hit in HHOutputParser().parse_file(out_fn)
					if hit.probability >= min_prob]
		finally:
			# Always clean up the result file, even when parsing fails.
			if os.path.exists(out_fn):
				os.remove(out_fn)

		# Choose highest prob hit from overlapping hits
		hits_nr = {}
		for beg, end, prob in hits:
			for key, (nr_beg, nr_end, nr_prob) in hits_nr.items():
				# Two inclusive ranges share at least one position exactly when
				# the larger start does not exceed the smaller end.
				if max(beg, nr_beg) <= min(end, nr_end):
					if prob > nr_prob:
						hits_nr[key] = (beg, end, prob)
					break
			else:
				# No retained hit overlaps this one: keep it as a new entry.
				hits_nr[len(hits_nr)] = (beg, end, prob)

		probs = [0] * len(sequence)
		for beg, end, prob in hits_nr.values():
			# NOTE(review): qstart/qend are used directly as 0-based list
			# indices here; if the parser reports 1-based positions this is
			# shifted by one and raises IndexError for a hit that reaches the
			# last residue - confirm against the parser before changing.
			for i in range(beg, end + 1):
				probs[i] = prob
		return hits_nr, probs
Пример #2
0
 def setUp(self):
     """Parse the ``d1ea0a1.hhr`` fixture both from its path and from its text.

     ``self.hitlist`` holds the result of ``parse_file`` and
     ``self.hitlist2`` the result of ``parse_string`` on the same content,
     so tests can compare the two entry points.
     """
     super(TestHHOutputParser, self).setUp()

     filename = self.config.getTestFile('d1ea0a1.hhr')
     # Use a context manager so the handle is closed deterministically
     # instead of being left to the garbage collector.
     with open(filename) as stream:
         content = stream.read()
     # presumably the positional flag enables alignment parsing - verify
     # against the HHOutputParser signature
     parser = HHOutputParser(True)

     self.hitlist = parser.parse_file(filename)
     self.hitlist2 = parser.parse_string(content)
Пример #3
0
    def setUp(self):
        """Parse the ``d1ea0a1.hhr`` fixture both from its path and from its text.

        ``self.hitlist`` holds the result of ``parse_file`` and
        ``self.hitlist2`` the result of ``parse_string`` on the same content,
        so tests can compare the two entry points.
        """
        super(TestHHOutputParser, self).setUp()

        filename = self.config.getTestFile('d1ea0a1.hhr')
        # Use a context manager so the handle is closed deterministically
        # instead of being left to the garbage collector.
        with open(filename) as stream:
            content = stream.read()
        # presumably the positional flag enables alignment parsing - verify
        # against the HHOutputParser signature
        parser = HHOutputParser(True)

        self.hitlist = parser.parse_file(filename)
        self.hitlist2 = parser.parse_string(content)
Пример #4
0
def parse_hhr(dir="./"):
    """Collect query-to-hit links from every ``.hhr`` file in a directory.

    Each file is parsed with ``HHOutputParser(False)``. A hit is skipped when
    it is the query itself or when its e-value exceeds the module-level
    ``EVAL_CUTOFF``.

    :param dir: directory containing the ``.hhr`` files; a trailing path
        separator is optional.
    :return: ``(links, keys)`` where ``links`` is a list of
        ``[query_name, hit_id, evalue]`` entries and ``keys`` is a list of the
        distinct names appearing in ``links`` (in arbitrary order).
    """
    # Local import: the file-level import block is not visible from here.
    import os

    parser = HHOutputParser(False)

    keys = set()
    links = []

    # os.path.join keeps the pattern correct whether or not ``dir`` ends with
    # a separator ("data" used to become the pattern "data*.hhr").
    for hhr in glob.glob(os.path.join(dir, "*.hhr")):
        results = parser.parse_file(hhr)
        query = results.query_name
        for hit in results:
            if hit.id == query or hit.evalue > EVAL_CUTOFF:
                continue
            links.append([query, hit.id, hit.evalue])
            keys.add(query)
            keys.add(hit.id)
    return links, list(keys)
Пример #5
0
def generate_msa(hhr_file, queryseq, hitslist, maxevalue=1e-3, ident_cut=0.5, qcov_cut=0.5, eval_cut=1e-3):
	"""Stack selected hits of an HHsearch result into a query-anchored MSA.

	The first row is the query; every accepted hit adds one row. Whenever a
	hit has an insertion relative to the query, a gap column is opened in
	all rows collected so far.

	:param hhr_file: path of the ``.hhr`` file, parsed with alignments enabled.
	:param queryseq: query sequence; becomes row 0 of the result.
	:param hitslist: non-empty collection of accepted hit identifiers, each
		formatted as ``"{id}_{qstart}_{qend}"``.
	:param maxevalue: not used by this function.
	:param ident_cut: hits with ``hit.identity`` below this value are skipped.
	:param qcov_cut: hits whose ungapped subject length divided by
		``len(queryseq)`` is below this value are skipped.
	:param eval_cut: hits with ``hit.evalue`` above this value are skipped.
	:return: list of row strings; rows shorter than row 0 are right-padded
		with ``-`` to its length.
	"""
	assert len(hitslist) > 0, 'provide at least one hit id in `hitlist`'

	# Row 0 is the master row; gap columns inserted below are mirrored into
	# every row that already exists.
	fasta = [queryseq]

	for hit in HHOutputParser(alignments=True).parse_file(hhr_file):

		hit_id = f'{hit.id}_{hit.qstart}_{hit.qend}'
		if hit_id not in hitslist:
			continue

		ungapped_len = 1. * len(hit.alignment.subject.replace('-', ''))
		if hit.identity < ident_cut:
			continue
		if ungapped_len / len(queryseq) < qcov_cut:
			continue
		if hit.evalue > eval_cut:
			continue

		# Left-pad the subject so that index 0 lines up with query position 1.
		offset = hit.qstart - 1
		sbjct = "-" * offset + hit.alignment.subject

		row = []
		mpos = 0  # current column in the master row

		for i in range(len(sbjct)):
			# An insertion is a column inside the aligned region where the
			# aligned query itself carries a gap.
			is_insertion = i >= offset and hit.alignment.query[i - offset] == '-'

			if is_insertion:
				# Open a new gap column unless an earlier hit already did.
				if fasta[0][mpos] != '-':
					fasta = [seq[:mpos] + "-" + seq[mpos:] for seq in fasta]
			else:
				# Skip over gap columns opened by earlier hits.
				while fasta[0][mpos] == '-':
					row.append('-')
					mpos += 1

			row.append(str(sbjct[i]))
			mpos += 1

		fasta.append(''.join(row))

	# Right-pad every row that is shorter than the master row.
	width = len(fasta[0])
	return [seq + "-" * (width - len(seq)) for seq in fasta]
Пример #6
0
    def get_alignment(self, query: str, no: str) -> HHpredHitAlignment:
        """ Return the alignment of hit number 'no' from the HHS file of 'query'.

        Only the aligned fragment is returned, so positions in the alignment
        are relative to the fragment start rather than to the full query:
        when the fragment does not begin at the N-terminus, the residue at
        hit.q_start (e.g. 20) sits at position 0 of aln.query.

        :param query: str. Domain query
        :param no: 1-based position of the hit in the file; converted with int()

        :return: HHpredHitAlignment for the requested hit, or None when
                 parsing or the lookup fails (the error is logged).
        """

        hhs_path = get_FUZZLE_hhs(query)
        try:
            hit_list = HHOutputParser().parse_file(hhs_path)
            # 'no' counts from 1, the hit list is indexed from 0.
            return hit_list[int(no) - 1].alignment
        except Exception as err:
            logger.error(f"Parsing of {hhs_path} failed. Error follows: {err}")
        return None
Пример #7
0
def HHSearch_parseTo_DMandNX(hhrs, labels=None):
    """Build pairwise hit matrices and a hit graph from HHsearch result files.

    A first pass over ``hhrs`` collects the cluster names; a second pass
    re-parses every file, folds each hit into the matrices and adds one graph
    edge per hit whose id differs from the query name.

    :param hhrs: sequence of ``.hhr`` file paths.
    :param labels: optional sequence indexed like ``hhrs``. When given,
        ``labels[i]`` is recorded for file ``i`` instead of its query name
        (duplicates are not filtered in that case).
    :return: ``(probaDM, evalDM, pvalDM, lenDM, scoreDM, SSDM, covDM, NX,
        clusternames)`` - seven square numpy arrays sized by the number of
        cluster names, the graph, and the list of names.
    """
    clusternames = []
    for idx, hhr in enumerate(hhrs):
        try:
            profile = HHOutputParser(alignments=False).parse_file(hhr)
            if labels is not None:
                clusternames.append(labels[idx])
            elif profile.query_name not in clusternames:
                clusternames.append(profile.query_name)
        except Exception:
            # Best effort: report the offending file and carry on.
            # NOTE(review): the second pass below parses the same file again
            # without protection, so a file that fails here will most likely
            # raise there - decide whether such files should be skipped.
            print(hhr)

    print(clusternames)
    shape = (len(clusternames), len(clusternames))
    evalDM = np.ones(shape)
    pvalDM = np.ones(shape)
    scoreDM = np.ones(shape)
    SSDM = np.ones(shape)
    probaDM = np.zeros(shape)
    lenDM = np.ones(shape)
    covDM = np.ones(shape)
    NX = nx.Graph()
    for hhr in hhrs:
        profile = HHOutputParser(alignments=False).parse_file(hhr)
        for hit in profile:
            if 'anchor' not in hit.id and 'anchor' not in profile.query_name:
                i = clusternames.index(hit.id.strip())
                j = clusternames.index(profile.query_name.strip())

                # NOTE(review): this ratio is query length over aligned span,
                # so it is normally >= 1 and the min() below leaves covDM at
                # its initial 1; it also divides by zero when qend == qstart.
                # Confirm whether (qend - qstart) / qlength was intended.
                covq = hit.qlength / (hit.qend - hit.qstart)
                covDM[i, j] = min(covq, covDM[i, j])

                # Keep the best value seen per pair and mirror it so the
                # matrices stay symmetric.
                if hit.evalue < evalDM[i, j]:
                    evalDM[i, j] = hit.evalue
                    evalDM[j, i] = evalDM[i, j]

                if hit.pvalue < pvalDM[i, j]:
                    pvalDM[i, j] = hit.pvalue
                    pvalDM[j, i] = pvalDM[i, j]

                if scoreDM[i, j] < hit.score:
                    scoreDM[i, j] = hit.score
                    scoreDM[j, i] = scoreDM[i, j]

                if SSDM[i, j] < hit.ss_score:
                    SSDM[i, j] = hit.ss_score
                    SSDM[j, i] = SSDM[i, j]

                if probaDM[i, j] < hit.probability:
                    probaDM[i, j] = hit.probability
                    probaDM[j, i] = probaDM[i, j]

                # use smallest of the two prots
                if lenDM[i, j] == 1 or lenDM[i, j] > hit.qlength:
                    lenDM[i, j] = hit.qlength
                    lenDM[j, i] = lenDM[i, j]
            if hit.id != profile.query_name:
                # NOTE(review): for "anchor" hits the branch above is skipped,
                # so i and j still hold the indices of an earlier hit (or are
                # unbound on the very first hit). Decide whether anchor hits
                # should get an edge at all, and with which attributes.
                # NOTE(review): 'qend', 'qstart', 'end' and 'start' all carry
                # the coverage value - this looks like a placeholder.
                dico = {
                    'score': scoreDM[i, j],
                    'prob': probaDM[i, j],
                    'eval': evalDM[i, j],
                    'ss': SSDM[i, j],
                    'length': lenDM[i, j],
                    'qend': covDM[i, j],
                    'qstart': covDM[i, j],
                    'end': covDM[i, j],
                    'start': covDM[i, j],
                }
                NX.add_edge(hit.id, profile.query_name, dict=dico)
                NX.nodes[hit.id]['len'] = hit.slength
                NX.nodes[profile.query_name]['len'] = hit.qlength
                NX.nodes[profile.query_name]['file'] = hhr
    return probaDM, evalDM, pvalDM, lenDM, scoreDM, SSDM, covDM, NX, clusternames
Пример #8
0
    def setUp(self):
        """Parse the ``d1ea0a1.hhr`` fixture into ``self.hitlist``."""
        super(TestHHOutputRegressions, self).setUp()

        hhr_path = self.config.getTestFile('d1ea0a1.hhr')
        parser = HHOutputParser()
        self.hitlist = parser.parse_file(hhr_path)