def load_organism(self, fp, orgGenomeDB=None):
    """Register near-identical gene pairs from a self-alignment file.

    The genome ID is taken from the part of the file name before the first
    dot. Every line of *fp* is parsed with ``DiamondResult.from_line`` using
    that genome ID for both query and subject. A pair is passed to
    ``self.add_gene_duplication`` when

    * the reported identity is at least 0.95,
    * query and subject are different sequence IDs, and
    * the two sequence lengths differ by less than roughly 5 %.

    Args:
        fp: Path of the alignment file to read.
        orgGenomeDB: Object providing ``get_sequence(genome, seqid)``. When
            omitted, a ``GenomeDB`` is created for
            ``<dir of fp>/../genomes/<genomeID>.fa``.
    """
    with open(fp, 'r') as infile:

        genomeID = str(os.path.basename(fp).split(".")[0])

        if orgGenomeDB is None:
            # The original concatenated dirname + "../genomes/" without a
            # separator, which yields ".../alignments../genomes/...".
            orgGenomeDB = GenomeDB(
                os.path.join(os.path.dirname(fp), "..", "genomes", genomeID + ".fa"))

        for line in infile:

            ret = DiamondResult.from_line(line, genomeID, genomeID)

            # Lines that cannot be parsed come back as None.
            if ret is None:
                continue

            if ret.identity < 0.95:
                continue

            # A hit of a gene against itself is not a duplication.
            if ret.subject.seqid == ret.query.seqid:
                continue

            subjSeq = orgGenomeDB.get_sequence(genomeID, ret.subject.seqid)
            querySeq = orgGenomeDB.get_sequence(genomeID, ret.query.seqid)

            if subjSeq is None or querySeq is None:
                print("could not find one of the two sequences", genomeID, ret.subject, ret.query)
                # Without this, len(None) below raises TypeError.
                continue

            subjLen = len(subjSeq)
            queryLen = len(querySeq)

            # An empty sequence cannot be a duplicate and would divide by zero.
            if subjLen == 0 or queryLen == 0:
                continue

            partialSQ = subjLen / queryLen
            partialQS = queryLen / subjLen

            partialSQok = 0.95 < partialSQ < 1.05
            partialQSok = 0.95 < partialQS < 1.05

            if not partialQSok and not partialSQok:
                continue

            self.add_gene_duplication(genomeID, ret.subject.seqid, ret.query.seqid)
# For every homology group, collect the member sequences and their last
# (up to) ten characters.
homDB = HomologyDatabase.loadFromFile(
    "/home/proj/projekte/dataintegration/hpyloriDB/hpp12.homdb")
genDB = GenomeDB(genomeLocation)

for homGroup in homDB.homologies:

    entries = homDB.homologies[homGroup]

    allSeqs = []
    for seqID in entries:

        # seqID is indexed as (genome, sequence id); load genomes lazily.
        if seqID[0] not in genDB.genomes:
            genDB.loadGenome(genomeLocation + "/" + seqID[0] + ".gb")

        seq = genDB.get_sequence(seqID[0], seqID[1])

        # A failed lookup that yields None would crash len(seq) below,
        # so report it and leave it out.
        if seq is None:
            print("could not find sequence", seqID)
            continue

        allSeqs.append(seq)

    if not allSeqs:
        continue

    seqEnds = []
    for seq in allSeqs:
        # Clamp at 0 so sequences shorter than ten characters are kept whole.
        startSeq = max(len(seq) - 10, 0)
        endSeq = len(seq)
        seqEnds.append(seq[startSeq:endSeq])
class HomologyBuilder:
    """Build a homology database from pairwise genome alignment files.

    Alignment files are expected at ``<basePath>/alignments/<A>.<B>.aliout``
    and genomes at ``<basePath>/genomes/<genome><inputExtension>``.
    ``analyse`` turns each alignment file into a graph of hits and runs a
    fixed sequence of analysis steps whose results are written into
    ``self.homolDB``.
    """

    def __init__(self, basePath, inputFormat="embl", inputExtension='.gb'):
        """Create the genome, homology and gene-duplication stores.

        Args:
            basePath: Directory containing ``alignments/`` and ``genomes/``.
            inputFormat: Passed to ``GenomeDB`` as ``fileFormat``.
            inputExtension: Genome file extension, also passed to ``GenomeDB``.
        """
        self.basePath = basePath
        self.genomeInputExtension = inputExtension

        self.genomeDB = GenomeDB(self.basePath, fileFormat=inputFormat, fileExtension=inputExtension)
        self.homolDB = HomologyDatabase()
        self.geneDupDB = GeneDuplicationDB()

    def printResult(self, result):
        """Print *result* with its score and both full sequences."""
        qseq = self.genomeDB.get_sequence(result.query.genome, result.query.seqid)
        sseq = self.genomeDB.get_sequence(result.subject.genome, result.subject.seqid)

        print(result.query, result.subject, result.identity, self.makeScore(result))
        print(len(qseq), qseq)
        print(len(sseq), sseq)

    def makeScore(self, result):
        """Return a ranking score for *result*.

        The score is ``(4 * identity + len(result)/len(query sequence)
        + len(result)/len(subject sequence)) / 6``.
        """
        iden = float(result.identity)

        qseq = self.genomeDB.get_sequence(result.query.genome, result.query.seqid)
        sseq = self.genomeDB.get_sequence(result.subject.genome, result.subject.seqid)

        length = (len(result) / len(qseq)) + (len(result) / len(sseq))

        return (4 * iden + length) / 6.0

    @staticmethod
    def _edge_ends(edge):
        """Return the (query, subject) ID objects stored in the edge's 'info'."""
        diamondResult = edge.props['info']
        return diamondResult.query, diamondResult.subject

    def getIDObj(self, edge, vertex):
        """Return the end of *edge* that is *vertex*, or None if neither is."""
        query, subject = self._edge_ends(edge)

        if vertex.name == (query.genome, query.seqid):
            return query

        if vertex.name == (subject.genome, subject.seqid):
            return subject

        return None

    def getNonIDObj(self, edge, vertex):
        """Return the end of *edge* opposite to *vertex*, or None if neither is."""
        query, subject = self._edge_ends(edge)

        if vertex.name == (query.genome, query.seqid):
            return subject

        if vertex.name == (subject.genome, subject.seqid):
            return query

        return None

    def _is_secondary_duplicate(self, idObj):
        """Tell whether *idObj* is a duplicated gene other than its group's first ID."""
        if not self.geneDupDB.has_gene_duplication(idObj.genome, idObj.seqid):
            return False

        commonIDs = self.geneDupDB.get_gene_duplication(idObj.genome, idObj.seqid)
        return idObj.seqid != commonIDs[0]

    def _load_results(self, alignmentFile, queryGenome, subjectGenome):
        """Read one alignment file into {query seqid: results, best score first}."""
        query2result = defaultdict(list)

        with open(alignmentFile, 'r') as infile:
            for line in infile:
                ret = DiamondResult.from_line(line, queryGenome, subjectGenome)

                if ret is None:
                    continue

                # Hits involving a duplicated gene are only kept for the first
                # ID of its duplication group; the others are dropped.
                # NOTE(review): reconstructed from whitespace-collapsed source,
                # assuming the `continue` sat inside the inner check - confirm.
                # todo we expect not to lose anything, but one should check that beforehand ...
                if self._is_secondary_duplicate(ret.query) or self._is_secondary_duplicate(ret.subject):
                    continue

                query2result[ret.query.seqid].append(ret)

        for seqid in query2result:
            query2result[seqid] = sorted(query2result[seqid], key=self.makeScore, reverse=True)

        return query2result

    def _build_graph(self, query2result):
        """Create the hit graph: one vertex per sequence, one edge per kept result."""
        graph = Graph()

        for seqid in query2result:
            results = query2result[seqid]

            if not results:
                continue

            query = results[0].query
            queryVert = Vertex(
                query.idtuple(),
                {'sequence': self.genomeDB.get_sequence(query.genome, query.seqid)})
            # Use the vertex the graph actually holds, as done for subjects below.
            queryVert = graph.add_vertex_if_not_exists(queryVert)

            for result in results:

                # Results with len() below 20 are ignored.
                if len(result) < 20:
                    continue

                subjVert = Vertex(
                    result.subject.idtuple(),
                    {'sequence': self.genomeDB.get_sequence(result.subject.genome, result.subject.seqid)})
                subjVert = graph.add_vertex_if_not_exists(subjVert)

                graph.add_edge(queryVert, subjVert, {'info': result}, True)

        # Order each vertex's edges by the seqid of the opposite end.
        for vertexID in graph.vertices:
            vertex = graph.vertices[vertexID]
            vertex.neighbors = sorted(
                vertex.neighbors,
                key=lambda x: self.getNonIDObj(x, vertex).seqid)

        return graph

    @staticmethod
    def _check_edge(config, edge, source, target):
        """Edge filter: both fractions and the identity must exceed the config minima."""
        queryLength = config.get_seq_fraction(edge, source)
        subjectLength = config.get_seq_fraction(edge, target)

        return (queryLength > config.minQueryLength
                and subjectLength > config.minSubjectLength
                and edge.props['info'].identity > config.minIdentity
                and min([queryLength, subjectLength]) > 0.5)

    @staticmethod
    def _check_edge_long(config, edge, source, target):
        """Edge filter for long sequences: identity * shorter length > 500, e-value < 1e-90."""
        edgeInfo = edge.props['info']

        minSeqLength = min([
            len(source.props['sequence']),
            len(target.props['sequence'])
        ])

        queryLength = config.get_seq_fraction(edge, source)
        subjectLength = config.get_seq_fraction(edge, target)

        if edgeInfo.identity * minSeqLength > 500:
            if edgeInfo.evalue < math.pow(10, -90):
                # The subject fraction is checked against minSubjectLength;
                # the original compared it with minQueryLength.
                if queryLength > config.minQueryLength and subjectLength > config.minSubjectLength:
                    return True

        return False

    @staticmethod
    def _print_graph_edges(mygraph):
        """Print every remaining edge once, starting at the longest-sequence vertices."""
        sortedVerts = sorted(
            mygraph.vertices,
            key=lambda x: len(mygraph.get_vertex(x).props['sequence']),
            reverse=True)

        seenDiamondInfos = set()

        for x in sortedVerts:
            vertex = mygraph.get_vertex(x)

            for edge in vertex.neighbors:
                diamondInfo = edge.props.get('info', None)

                if diamondInfo is None:
                    continue

                if diamondInfo not in seenDiamondInfos:
                    print(edge.source.name, edge.target.name, edge.props['info'])
                    seenDiamondInfos.add(diamondInfo)

    def _store(self, analysis):
        """Run *analysis* and write its result into the homology database."""
        analysis.analyse().toDataBase(self.homolDB)

    def _run_pipeline(self, graph):
        """Apply the homology detection steps to *graph* in their fixed order."""

        # STEP 1: REMOVE EMPTY NODES (IF EXIST)
        graphCleaner(graph, None).analyse()

        # STEP 2: FIND EASY MATCHES
        oneHitConfig = OneHitHomologsConfig()
        self._store(oneHitHomologs(graph, self.genomeDB, oneHitConfig))

        # STEP 2.1: multiple hits, but one very high scoring
        # One of multiple. If excellent hit found, only allow that hit.
        one2mulHitsConfig = oneMultipleConfig()
        one2mulHitsConfig.minQueryLength = 0.9
        one2mulHitsConfig.minSubjectLength = 0.9
        one2mulHitsConfig.minIdentity = 0.6
        one2mulHitsConfig.allowMultiple = False
        self._store(oneMultipleHomologs(graph, self.genomeDB, one2mulHitsConfig))

        # One of multiple. If excellent hit found, allow that one gene is
        # homologous to many other
        one2mulHitsConfig = oneMultipleConfig()
        one2mulHitsConfig.minQueryLength = 0.9
        one2mulHitsConfig.minSubjectLength = 0.9
        one2mulHitsConfig.allowMultiple = True
        self._store(oneMultipleHomologs(graph, self.genomeDB, one2mulHitsConfig))

        # One of multiple, allow non-excellent hits
        one2mulHitsConfig = oneMultipleConfig()
        one2mulHitsConfig.allowMultiple = False
        self._store(oneMultipleHomologs(graph, self.genomeDB, one2mulHitsConfig))

        # STEP 3: One sequence, multiple sequences map
        self._store(ManyToOneCombination(graph, self.genomeDB))

        # STEP 3.1: try to use a subset to get good coverage!
        greedyConfig = GreedyCombinationConfig()
        greedyConfig.sortingFunctionAssembly = lambda x: x.props['info'].identity
        self._store(GreedyCombinationCreator(graph, self.genomeDB, greedyConfig))

        greedyConfig = GreedyCombinationConfig()
        greedyConfig.minExplainedThreshold = 0.5
        greedyConfig.allowTargetOverlaps = True
        self._store(GreedyCombinationCreator(graph, self.genomeDB, greedyConfig))

        greedyConfig = GreedyCombinationConfig()
        greedyConfig.minExplainedThreshold = 0.55
        self._store(GreedyCombinationCreator(graph, self.genomeDB, greedyConfig))

        # STEP 5: multiple sequences form a cluster (runs before step 4)
        mulCombAnalysisConfig = MultiCombinationCreatorConfig()
        self._store(MultiCombinationCreator(graph, self.genomeDB, mulCombAnalysisConfig))

        # STEP 4: one sequence, one or multiple sequences align, accept also
        # rather bad identity
        omConfig = oneMultipleConfig()
        omConfig.allowPartialLength = True
        omConfig.betterEdgeCheck = True
        omConfig.allowMultiple = False
        omConfig.minIdentity = 0.4
        omConfig.minQueryLength = 0.8
        omConfig.minSubjectLength = 0.8
        omConfig.considerEdgeFunc = self._check_edge
        self._store(oneMultipleHomologs(graph, self.genomeDB, omConfig))

        # extremely long sequences > 500!
        # The same config object is reused, so the step-4 settings other than
        # the three overwritten here stay in effect.
        omConfig.minQueryLength = 0.6
        omConfig.minSubjectLength = 0.6
        omConfig.considerEdgeFunc = self._check_edge_long
        self._store(oneMultipleHomologs(graph, self.genomeDB, omConfig))

        # STEP 6: remove hits which make no sense
        SpuriousEdgeRemover(graph, self.genomeDB).analyse()

        # Some relations may have been hidden by combinations
        oneHitConfig = OneHitHomologsConfig(minIDScore=0.8, minLengthScore=0.7)
        self._store(oneHitHomologs(graph, self.genomeDB, oneHitConfig))

        # STEP 7: Mention leftovers
        self._print_graph_edges(graph)

    def analyse(self, wantedGenomes=None):
        """Process all alignment files below ``basePath`` and build the database.

        For every ``<basePath>/alignments/*.aliout`` file whose two genomes
        differ, both genomes and both self-alignment files are loaded, the
        hits are turned into a graph and the analysis steps are run on it.
        Afterwards the gene-duplication relations are added, the database is
        saved before and after ``finalize()``, and the genome sequences are
        written out as CSV.

        Args:
            wantedGenomes: Optional collection of genome IDs. When given, only
                files whose query AND subject genome are both contained in it
                are processed. ``None`` (the default) processes every file,
                e.g. ``['AE000511', 'CP001217', 'AE001439', 'CP001173']``.

        Returns:
            The populated ``self.homolDB``.
        """
        for alignmentFile in glob.glob(self.basePath + "/alignments/*.aliout"):

            # File names look like <subjectGenome>.<queryGenome>.aliout
            afile = os.path.basename(alignmentFile).split('.')
            subjectGenome = afile[0]
            queryGenome = afile[1]

            if wantedGenomes is not None and (
                    queryGenome not in wantedGenomes or subjectGenome not in wantedGenomes):
                continue

            # Self-alignments are only used for duplication detection.
            if queryGenome == subjectGenome:
                continue

            self.genomeDB.loadGenome(self.basePath + "/genomes/" + queryGenome + self.genomeInputExtension)
            self.genomeDB.loadGenome(self.basePath + "/genomes/" + subjectGenome + self.genomeInputExtension)

            dupfiles = [
                self.basePath + "/alignments/" + queryGenome + "." + queryGenome + ".aliout",
                self.basePath + "/alignments/" + subjectGenome + "." + subjectGenome + ".aliout"
            ]

            # Both self-alignments are required; report every missing one.
            missingFiles = [x for x in dupfiles if not os.path.isfile(x)]
            for x in missingFiles:
                print("Not a file", x)

            if missingFiles:
                continue

            self.geneDupDB.load_organism(dupfiles[0], self.genomeDB)
            self.geneDupDB.load_organism(dupfiles[1], self.genomeDB)

            query2result = self._load_results(alignmentFile, queryGenome, subjectGenome)
            graph = self._build_graph(query2result)

            # NOTE(review): reconstructed from whitespace-collapsed source,
            # assuming the steps through printing leftovers ran once per
            # alignment file and the saving below ran once at the end - confirm.
            self._run_pipeline(graph)

        allDupRelations = self.geneDupDB.get_gene_relations()
        allDupRelations.toDataBase(self.homolDB)

        self.homolDB.save_to_file(self.basePath + "/homdb_pre_finalize")
        self.homolDB.finalize()
        self.homolDB.save_to_file(self.basePath + "/homdb_post_finalize")

        self.genomeDB.writeCSV(self.basePath + "/genome_seqs/seqs")

        return self.homolDB
# Count how many members of this homology cluster belong to the `mc` and
# `nmc` organism groups, and collect the sequences of the `mc` members.
homCluster = homDB.homologies[homID]

mcCount, nmcCount = 0, 0

# Despite its name this maps organism -> sequence (not its length); a later
# gene of the same organism replaces an earlier one.
org2length = {}

for (org, geneid) in homCluster:

    mcCount += int(org in mc)
    nmcCount += int(org in nmc)

    seq = genomDB.get_sequence(org, geneid)
    org2length[org] = seq

mcLengths, nmcLenghts = set(), set()
mcSeqs, nmcSeqs = set(), set()

for org in org2length:

    if org in mc:
        seq = org2length[org]

        mcLengths.add(len(seq))
        mcSeqs.add(seq)
(('AE000511', 'HP_1072'), ('CP001217', 'HPP12_0372')), (('AE000511', 'HP_0415'), ('CP001217', 'HPP12_1009')), (('AE000511', 'HP_1100'), ('CP001217', 'HPP12_1065')), (('AE000511', 'HP_0965'), ('CP001217', 'HPP12_0960')), (('AE000511', 'HP_0762'), ('CP001217', 'HPP12_0772'))} allGenomes = set([x[0][0] for x in diffGenePairs ]).union(set([x[1][0] for x in diffGenePairs])) for genome in allGenomes: genomeDB.loadGenome(genome) fastaLines = ([], []) for x1, x2 in diffGenePairs: seq1 = genomeDB.get_sequence(x1[0], x1[1]) seq2 = genomeDB.get_sequence(x2[0], x2[1]) if seq1 == None: print(x1) if seq2 == None: print(x2) fastaLines[0].append(">" + x1[1] + " " + x1[0] + "\n" + seq1) fastaLines[1].append(">" + x2[1] + " " + x2[0] + "\n" + seq2) print("\n".join(fastaLines[0])) print() print()