def start_app_from_args(args):
    """Fill the module-level databases and tool settings from parsed arguments.

    Reads ``args.tmp``, ``args.clustalo.name``, ``args.databases`` and
    ``args.genomes``. Every result is stored in a module global; nothing is
    returned.
    """
    global homDB, genomDB, xrefDB, opDB, sorfDB, pfamDB, tssDB
    global tmpfolder, clustalobin

    tmpfolder = args.tmp
    clustalobin = args.clustalo.name

    # The homology database and its cross references share one folder.
    homdb_prefix = args.databases + "/homdb/"
    homDB = HomologyDatabase.loadFromFile(homdb_prefix + "/hpdb_full_new")
    xrefDB = XRefDatabase(
        gobo=args.databases + "/obos/go.obo",
        fileName=homdb_prefix + "/hpdb_full_xref",
    )

    # Operon, TSS and sORF annotations are read from spreadsheets in one folder.
    sharma_prefix = args.databases + "/sharma/"
    opDB = OperonDB.from_cs_operons(sharma_prefix + "operons.xlsx")
    tssDB = TSSDB.from_cs_tss(sharma_prefix + "tss.xlsx")
    sorfDB = SORFDB.from_cs_sorfs(sharma_prefix + "sorfs.xlsx")

    pfamDB = PfamResultDB.from_folder(args.databases + "/pfam/")

    # Genomes are loaded one by one, only for organisms the homology DB knows.
    genomDB = GenomeDB(args.genomes, loadAll=False)
    for organism in homDB.get_all_organisms():
        genomDB.loadGenome(organism)
def __init__(self, basePath, inputFormat="embl", inputExtension='.gb'):
    """Remember the base path and create the three databases used by the builder.

    :param basePath: path handed to ``GenomeDB`` and stored on the instance
    :param inputFormat: forwarded to ``GenomeDB`` as ``fileFormat``
    :param inputExtension: forwarded to ``GenomeDB`` as ``fileExtension`` and
        stored as ``genomeInputExtension``
    """
    self.basePath = basePath
    self.genomeInputExtension = inputExtension

    # Genome store first, then the (initially empty) result databases.
    self.genomeDB = GenomeDB(
        basePath,
        fileFormat=inputFormat,
        fileExtension=inputExtension,
    )
    self.homolDB = HomologyDatabase()
    self.geneDupDB = GeneDuplicationDB()
def load_organism(self, fp, orgGenomeDB=None):
    """Register gene duplications found in one result file.

    Every line of ``fp`` is parsed with ``DiamondResult.from_line`` using the
    same genome id for query and subject. A hit is recorded through
    ``self.add_gene_duplication`` when

    * its identity is at least 0.95,
    * query and subject are different sequence ids, and
    * the two sequence lengths differ by less than roughly 5 % (either of the
      two length ratios lies strictly between 0.95 and 1.05).

    :param fp: path of the result file; the genome id is the part of the file
        name before the first ``.``
    :param orgGenomeDB: object providing ``get_sequence(genome, seqid)``; when
        omitted a ``GenomeDB`` is created for ``<dir of fp>/../genomes/<id>.fa``
    :return: None
    """
    with open(fp, 'r') as infile:

        genomeID = str(os.path.basename(fp).split(".")[0])

        if orgGenomeDB is None:
            # The original concatenation lacked a separator between the
            # directory and "../genomes/", yielding e.g. "dir../genomes/".
            # NOTE(review): GenomeDB is handed a .fa file path here, as before.
            orgGenomeDB = GenomeDB(
                os.path.join(os.path.dirname(fp), "..", "genomes", genomeID + ".fa"))

        for line in infile:

            ret = DiamondResult.from_line(line, genomeID, genomeID)

            # from_line may yield None for lines it cannot use.
            if ret is None:
                continue

            if ret.identity < 0.95:
                continue

            # A sequence aligned to itself is not a duplication.
            if ret.subject.seqid == ret.query.seqid:
                continue

            subjSeq = orgGenomeDB.get_sequence(genomeID, ret.subject.seqid)
            querySeq = orgGenomeDB.get_sequence(genomeID, ret.query.seqid)

            if subjSeq is None or querySeq is None:
                print("could not find one of the two sequences", genomeID, ret.subject, ret.query)
                # Without both sequences no length ratio can be computed.
                continue

            # Empty sequences would cause a division by zero below.
            if len(subjSeq) == 0 or len(querySeq) == 0:
                continue

            partialSQ = len(subjSeq) / len(querySeq)
            partialQS = len(querySeq) / len(subjSeq)

            partialSQok = 0.95 < partialSQ < 1.05
            partialQSok = 0.95 < partialQS < 1.05

            if not partialQSok and not partialSQok:
                continue

            self.add_gene_duplication(genomeID, ret.subject.seqid, ret.query.seqid)
from Bio import SeqIO
import argparse
import glob
import sys, os

sys.path.insert(0, str(os.path.dirname(os.path.realpath(__file__))) + "/../")

from database.genomedb import GenomeDB

# Script: for every *.gb file in the given folder, load it into a fresh
# GenomeDB and write BLAST fasta files back into the same folder.
if __name__ == '__main__':

    parser = argparse.ArgumentParser(
        description='Calculate kmer histograms and compare for two groups',
        add_help=False)
    parser.add_argument('-l', '--location', type=str, help='input', required=True)

    args = parser.parse_args()

    fileLocation = args.location

    # `glob` was used here without being imported, which raised a NameError.
    for genbankFile in glob.glob(fileLocation + '/*.gb'):
        print(genbankFile)

        # A new database per file, so each write only contains this genome.
        genomeDB = GenomeDB(fileLocation, loadAll=False)
        genomeDB.loadGenome(genbankFile, False)
        genomeDB.writeBLASTfastas(fileLocation)
#print(alignment[1]) for x in orgSubMatrix: orgSubMatRel[x] = orgSubMatrix[x] / aaLength print("W content") for org in orgAACounts: print(org, "W", orgAACounts[org]['W']) return orgSubMatrix, orgSubMatRel, aaLength, orgSubMatrixDir hpHomolDB = HomologyDatabase.loadFromFile(fileLocation + "/hpp12_hp") cbHomolDB = HomologyDatabase.loadFromFile(fileLocation + "../cbdb/" + "/cbj") genomeDB = GenomeDB(fileLocation + "/genomes/") genomeDB.loadGenome(fileLocation + "/genomes/CP001217.gb") genomeDB.loadGenome(fileLocation + "/genomes/AE000511.gb") genomeDB.fileExtension = '.gbff' genomeDB.fileFormat = 'gb' genomeDB.loadGenome(fileLocation + "../cbdb/genomes/NC003912.gbff") genomeDB.loadGenome(fileLocation + "../cbdb/genomes/NC002163.gbff") matrix = matlist.blosum80 subMatrix = {} print("Starting HPP")
import editdistance from database.genomedb import GenomeDB from database.homologydb import HomologyDatabase if __name__ == '__main__': genomeLocation = '/home/users/joppich/ownCloud/data/hpyloriDB/genomes/' homDB = HomologyDatabase.loadFromFile( "/home/proj/projekte/dataintegration/hpyloriDB/hpp12.homdb") genDB = GenomeDB(genomeLocation) for homGroup in homDB.homologies: entries = homDB.homologies[homGroup] allSeqs = [] for seqID in entries: if not seqID[0] in genDB.genomes: genDB.loadGenome(genomeLocation + "/" + seqID[0] + ".gb") seq = genDB.get_sequence(seqID[0], seqID[1]) allSeqs.append(seq) if len(allSeqs) == 0: continue
class HomologyBuilder:
    """Build a homology database from pairwise alignment result files.

    For every ``<basePath>/alignments/*.aliout`` file the hits are turned into
    a graph (vertices are ``(genome, seqid)`` tuples, edges carry the hit under
    the ``'info'`` property). A fixed sequence of analysis steps then moves
    accepted relations into ``self.homolDB``.
    """

    def __init__(self, basePath, inputFormat="embl", inputExtension='.gb'):
        """Store the base path and create the genome, homology and duplication DBs.

        :param basePath: folder containing ``alignments/`` and ``genomes/``
        :param inputFormat: forwarded to ``GenomeDB`` as ``fileFormat``
        :param inputExtension: genome file extension, also used to build the
            genome file names in :meth:`analyse`
        """
        self.basePath = basePath
        self.genomeInputExtension = inputExtension
        self.genomeDB = GenomeDB(self.basePath, fileFormat=inputFormat, fileExtension=inputExtension)
        self.homolDB = HomologyDatabase()
        self.geneDupDB = GeneDuplicationDB()

    def printResult(self, result):
        """Print a hit, its score and both sequences with their lengths."""
        qseq = self.genomeDB.get_sequence(result.query.genome, result.query.seqid)
        sseq = self.genomeDB.get_sequence(result.subject.genome, result.subject.seqid)

        print(result.query, result.subject, result.identity, self.makeScore(result))
        print(len(qseq), qseq)
        print(len(sseq), sseq)

    def makeScore(self, result):
        """Return ``(4 * identity + coverage) / 6`` for a hit.

        ``coverage`` is ``len(result) / len(query sequence)`` plus
        ``len(result) / len(subject sequence)``.
        """
        iden = float(result.identity)

        qseq = self.genomeDB.get_sequence(result.query.genome, result.query.seqid)
        sseq = self.genomeDB.get_sequence(result.subject.genome, result.subject.seqid)

        length = (len(result) / len(qseq)) + (len(result) / len(sseq))

        return (4 * iden + length) / 6.0

    def getIDObj(self, edge, vertex):
        """Return the side of the edge's hit (query or subject) that IS ``vertex``.

        Returns None when ``vertex.name`` matches neither side.
        """
        diamondResult = edge.props['info']

        if vertex.name == (diamondResult.query.genome, diamondResult.query.seqid):
            return diamondResult.query

        if vertex.name == (diamondResult.subject.genome, diamondResult.subject.seqid):
            return diamondResult.subject

        return None

    def getNonIDObj(self, edge, vertex):
        """Return the side of the edge's hit that is NOT ``vertex`` (the partner).

        Returns None when ``vertex.name`` matches neither side.
        """
        diamondResult = edge.props['info']

        if vertex.name == (diamondResult.query.genome, diamondResult.query.seqid):
            return diamondResult.subject

        if vertex.name == (diamondResult.subject.genome, diamondResult.subject.seqid):
            return diamondResult.query

        return None

    def analyse(self):
        """Process all alignment files and return the filled homology database.

        Side effects: loads genomes and gene duplications, prints leftover
        edges, writes ``homdb_pre_finalize``, ``homdb_post_finalize`` and
        ``genome_seqs/seqs`` below ``self.basePath``.

        :return: ``self.homolDB``
        """
        for file in glob.glob(self.basePath + "/alignments/*.aliout"):

            query2result = defaultdict(list)
            subject2result = defaultdict(list)

            # File names look like "<subjectGenome>.<queryGenome>.aliout".
            filebase = os.path.basename(file)
            afile = filebase.split('.')

            subjectGenome = afile[0]
            queryGenome = afile[1]

            # Optional restriction to a set of genomes; disabled when None.
            # wantedGenomes = ['AE000511', 'CP001217', 'AE001439', 'CP001173']
            wantedGenomes = None

            if wantedGenomes is not None and queryGenome not in wantedGenomes:
                continue

            if wantedGenomes is not None and subjectGenome not in wantedGenomes:
                continue

            # Self-alignments are only used as duplication input (see below).
            if queryGenome == subjectGenome:
                continue

            self.genomeDB.loadGenome(self.basePath + "/genomes/" + queryGenome + self.genomeInputExtension)
            self.genomeDB.loadGenome(self.basePath + "/genomes/" + subjectGenome + self.genomeInputExtension)

            dupfiles = [
                self.basePath + "/alignments/" + queryGenome + "." + queryGenome + ".aliout",
                self.basePath + "/alignments/" + subjectGenome + "." + subjectGenome + ".aliout"
            ]

            # Both self-alignment files must exist; report every missing one.
            canContinue = True
            for x in dupfiles:
                if not os.path.isfile(x):
                    print("Not a file", x)
                    canContinue = False

            if not canContinue:
                continue

            self.geneDupDB.load_organism(dupfiles[0], self.genomeDB)
            self.geneDupDB.load_organism(dupfiles[1], self.genomeDB)

            with open(file, 'r') as infile:

                for line in infile:

                    ret = DiamondResult.from_line(line, queryGenome, subjectGenome)

                    if ret is None:
                        continue

                    # Hits of a duplicated gene are dropped unless the gene is
                    # the first id of its duplication group.
                    # NOTE(review): the seqid assignment before `continue` has
                    # no effect; kept as in the original - confirm intent.
                    if self.geneDupDB.has_gene_duplication(ret.query.genome, ret.query.seqid):
                        commonIDs = self.geneDupDB.get_gene_duplication(ret.query.genome, ret.query.seqid)

                        if ret.query.seqid != commonIDs[0]:
                            ret.query.seqid = commonIDs[0]
                            continue

                    if self.geneDupDB.has_gene_duplication(ret.subject.genome, ret.subject.seqid):
                        commonIDs = self.geneDupDB.get_gene_duplication(ret.subject.genome, ret.subject.seqid)

                        if ret.subject.seqid != commonIDs[0]:
                            ret.subject.seqid = commonIDs[0]
                            continue

                    # TODO: we expect not to lose anything, but one should check that beforehand.
                    query2result[ret.query.seqid].append(ret)
                    subject2result[ret.subject.seqid].append(ret)

            # Best scoring hit first, per query and per subject.
            for seqid in query2result:
                allResults = query2result[seqid]
                allResults = sorted(allResults, key=lambda x: self.makeScore(x), reverse=True)
                query2result[seqid] = allResults

            for seqid in subject2result:
                allResults = subject2result[seqid]
                allResults = sorted(allResults, key=lambda x: self.makeScore(x), reverse=True)
                subject2result[seqid] = allResults

            graph = Graph()

            for seqid in query2result:

                results = query2result[seqid]

                if len(results) == 0:
                    continue

                query = results[0].query
                queryVert = Vertex(query.idtuple(),
                                   {'sequence': self.genomeDB.get_sequence(query.genome, query.seqid)})
                graph.add_vertex_if_not_exists(queryVert)

                for result in results:

                    # Very short hits are not added to the graph.
                    if len(result) < 20:
                        continue

                    subjVert = Vertex(result.subject.idtuple(),
                                      {'sequence': self.genomeDB.get_sequence(result.subject.genome,
                                                                              result.subject.seqid)})
                    # Use the vertex returned by the graph for the edge.
                    subjVert = graph.add_vertex_if_not_exists(subjVert)

                    graph.add_edge(queryVert, subjVert, {'info': result}, True)

            # Order every vertex' edges by the sequence id of the partner.
            for vertexID in graph.vertices:
                vertex = graph.vertices[vertexID]
                vertex.neighbors = sorted(vertex.neighbors, key=lambda x: self.getNonIDObj(x, vertex).seqid)

            # STEP 1: REMOVE EMPTY NODES (IF EXIST)
            graphClean = graphCleaner(graph, None)
            graphClean.analyse()

            # STEP 2: FIND EASY MATCHES
            oneHitConfig = OneHitHomologsConfig()
            stepOneHits = oneHitHomologs(graph, self.genomeDB, oneHitConfig)
            homolResults = stepOneHits.analyse()
            homolResults.toDataBase(self.homolDB)

            # STEP 2.1: multiple hits, but one very high scoring
            # One of multiple. If excellent hit found, only allow that hit.
            one2mulHitsConfig = oneMultipleConfig()
            one2mulHitsConfig.minQueryLength = 0.9
            one2mulHitsConfig.minSubjectLength = 0.9
            one2mulHitsConfig.minIdentity = 0.6
            one2mulHitsConfig.allowMultiple = False

            one2mulHits = oneMultipleHomologs(graph, self.genomeDB, one2mulHitsConfig)
            homolResults = one2mulHits.analyse()
            homolResults.toDataBase(self.homolDB)

            # One of multiple. If excellent hit found, allow that one gene is
            # homologous to many other
            one2mulHitsConfig = oneMultipleConfig()
            one2mulHitsConfig.minQueryLength = 0.9
            one2mulHitsConfig.minSubjectLength = 0.9
            one2mulHitsConfig.allowMultiple = True

            one2mulHits = oneMultipleHomologs(graph, self.genomeDB, one2mulHitsConfig)
            homolResults = one2mulHits.analyse()
            homolResults.toDataBase(self.homolDB)

            # One of multiple, allow non-excellent hits
            one2mulHitsConfig = oneMultipleConfig()
            one2mulHitsConfig.allowMultiple = False

            one2mulHits = oneMultipleHomologs(graph, self.genomeDB, one2mulHitsConfig)
            homolResults = one2mulHits.analyse()
            homolResults.toDataBase(self.homolDB)

            # STEP 3: One sequence, multiple sequences map
            many2one = ManyToOneCombination(graph, self.genomeDB)
            retRes = many2one.analyse()
            retRes.toDataBase(self.homolDB)

            # STEP 3.1: try to use a subset to get good coverage!
            greedyConfig = GreedyCombinationConfig()
            greedyConfig.sortingFunctionAssembly = lambda x: x.props['info'].identity

            greedyCreator = GreedyCombinationCreator(graph, self.genomeDB, greedyConfig)
            retRes = greedyCreator.analyse()
            retRes.toDataBase(self.homolDB)

            greedyConfig = GreedyCombinationConfig()
            greedyConfig.minExplainedThreshold = 0.5
            greedyConfig.allowTargetOverlaps = True

            greedyCreator = GreedyCombinationCreator(graph, self.genomeDB, greedyConfig)
            retRes = greedyCreator.analyse()
            retRes.toDataBase(self.homolDB)

            greedyConfig = GreedyCombinationConfig()
            greedyConfig.minExplainedThreshold = 0.55

            greedyCreator = GreedyCombinationCreator(graph, self.genomeDB, greedyConfig)
            retRes = greedyCreator.analyse()
            retRes.toDataBase(self.homolDB)

            # STEP 5: multiple sequences form a cluster
            # (runs before STEP 4; the step numbers are kept from the original)
            mulCombAnalysisConfig = MultiCombinationCreatorConfig()
            mulCombAnalysis = MultiCombinationCreator(graph, self.genomeDB, mulCombAnalysisConfig)
            mulCombResult = mulCombAnalysis.analyse()
            mulCombResult.toDataBase(self.homolDB)

            # STEP 4: one sequence, one or multiple sequences align, accept
            # also rather bad identity
            omConfig = oneMultipleConfig()
            omConfig.allowPartialLength = True
            omConfig.betterEdgeCheck = True
            omConfig.allowMultiple = False
            omConfig.minIdentity = 0.4
            omConfig.minQueryLength = 0.8
            omConfig.minSubjectLength = 0.8

            def checkEdge(config, edge, source, target):
                # Accept an edge only if both covered fractions and the
                # identity exceed the configured minima.
                queryLength = config.get_seq_fraction(edge, source)
                subjectLength = config.get_seq_fraction(edge, target)

                considerEdge = queryLength > config.minQueryLength
                considerEdge = considerEdge and subjectLength > config.minSubjectLength
                considerEdge = considerEdge and edge.props['info'].identity > config.minIdentity
                considerEdge = considerEdge and min([queryLength, subjectLength]) > 0.5

                return considerEdge

            omConfig.considerEdgeFunc = checkEdge

            omAnalysis = oneMultipleHomologs(graph, self.genomeDB, omConfig)
            retRes = omAnalysis.analyse()
            retRes.toDataBase(self.homolDB)

            # extremely long sequences > 500!
            def checkEdgeLong(config, edge, source, target):
                # Accept long, highly significant hits even at lower coverage.
                edgeInfo = edge.props['info']

                minSeqLength = min([len(source.props['sequence']), len(target.props['sequence'])])

                queryLength = config.get_seq_fraction(edge, source)
                subjectLength = config.get_seq_fraction(edge, target)

                if edgeInfo.identity * minSeqLength > 500:
                    if edgeInfo.evalue < math.pow(10, -90):
                        # The subject fraction is checked against the subject
                        # minimum (the original compared it to minQueryLength).
                        if queryLength > config.minQueryLength and subjectLength > config.minSubjectLength:
                            return True

                return False

            omConfig.minQueryLength = 0.6
            omConfig.minSubjectLength = 0.6
            omConfig.considerEdgeFunc = checkEdgeLong

            omAnalysis = oneMultipleHomologs(graph, self.genomeDB, omConfig)
            retRes = omAnalysis.analyse()
            retRes.toDataBase(self.homolDB)

            # STEP 6: remove hits which make no sense
            edgeRemover = SpuriousEdgeRemover(graph, self.genomeDB)
            edgeRemover.analyse()

            # Some relations may have been hidden by combinations
            oneHitConfig = OneHitHomologsConfig(minIDScore=0.8, minLengthScore=0.7)
            stepOneHits = oneHitHomologs(graph, self.genomeDB, oneHitConfig)
            homolResults = stepOneHits.analyse()
            homolResults.toDataBase(self.homolDB)

            # STEP 7: Mention leftovers
            def printEdge(edge):
                print(edge.source.name, edge.target.name, edge.props['info'])

            def printGraphEdges(mygraph):
                # Longest sequences first; every hit is printed only once.
                sortedVerts = sorted([x for x in mygraph.vertices],
                                     key=lambda x: len(mygraph.get_vertex(x).props['sequence']),
                                     reverse=True)

                seenDiamondInfos = set()

                for x in sortedVerts:
                    vertex = mygraph.get_vertex(x)

                    for edge in vertex.neighbors:
                        diamondInfo = edge.props.get('info', None)

                        if diamondInfo is None:
                            continue

                        if diamondInfo not in seenDiamondInfos:
                            printEdge(edge)
                            seenDiamondInfos.add(diamondInfo)

            printGraphEdges(graph)

        # After all files: add the duplication relations and persist everything.
        allDupRelations = self.geneDupDB.get_gene_relations()
        allDupRelations.toDataBase(self.homolDB)

        self.homolDB.save_to_file(self.basePath + "/homdb_pre_finalize")
        self.homolDB.finalize()
        self.homolDB.save_to_file(self.basePath + "/homdb_post_finalize")

        self.genomeDB.writeCSV(self.basePath + "/genome_seqs/seqs")

        return self.homolDB
from analysis.homologybuilder import HomologyBuilder
from database.genomedb import GenomeDB

# Driver script: builds the homology database for the data set below
# `fileLocation` and stores it as "cbj" in the same folder.
if __name__ == '__main__':

    fileLocation = '/mnt/c/ownCloud/data/cbdb/'

    # Flip to True for a one-off run that only exports the BLAST fasta files.
    initialise = False

    if initialise:
        genomesFolder = fileLocation + "genomes/"
        genomDB = GenomeDB(
            genomesFolder,
            fileFormat='gb',
            fileExtension='.gbff',
            loadAll=True,
        )
        genomDB.writeBLASTfastas(genomesFolder)
        exit()

    builder = HomologyBuilder(
        basePath=fileLocation,
        inputFormat='gb',
        inputExtension='.gbff',
    )

    homolDB = builder.analyse()
    homolDB.save_to_file(fileLocation + "/cbj")
def distance(r1, r2):
    """Return the gap between two ranges, or 0 if there is none.

    Each range is an indexable ``(start, end)`` pair. The gap is the start of
    the later range minus the end of the earlier one. 0 is returned when the
    ranges touch or overlap, or when either range has ``start > end``.
    """
    # The range that sorts first is the lower one, the other the upper one.
    lower, upper = sorted((r1, r2))

    if not (lower[0] <= lower[1] < upper[0]):
        return 0

    # Both ranges have to be well formed (start <= end).
    for rng in (r1, r2):
        if not rng[0] <= rng[1]:
            return 0

    return upper[0] - lower[1]


if __name__ == '__main__':

    fileLocation = "/mnt/c/dev/data/haas/homdb/"

    homDB = HomologyDatabase.loadFromFile(fileLocation + "/hpp_split")
    genomDB = GenomeDB(fileLocation + "/genomes", loadAll=False)

    allorgs = homDB.get_all_organisms()

    for org in allorgs:
        genomDB.loadGenome(org)

    extra = ['AE001439', 'CP009259']

    mc = [
        '4_N1-031C1', '2_N1-025A2', '14_1-20A_UB64', '13_N5-004A1',
        '3_N1-029C1', '11_N4-029C2', '10_N2-085C2', '1_N1-024A1'
    ]

    nmc = [
        x for x in allorgs
        if not x in mc and not x in extra and not x.startswith("6_")
    ]  # and not x.startswith("15")
import sys import os from collections import defaultdict sys.path.insert(0, str(os.path.dirname(os.path.realpath(__file__))) + "/../") import math from database.genomedb import GenomeDB from database.homologydb import HomologyDatabase from utils.utils import fileLocation if __name__ == '__main__': homolDB = HomologyDatabase.loadFromFile(fileLocation + "/hpp12_hp") genomeDB = GenomeDB(fileLocation + "/genomes/") allowedOrgs = ['CP001217', 'AE000511'] compareAA = (['W'], ['F', 'G', 'A']) compareAA = (['W'], ['H', 'F', 'Y', 'P', 'K']) #compareAA = (['W', 'M'], ['H', 'F', 'Y', 'P', 'K']) #compareAA = (['W', 'M'], ['F', 'G', 'A']) def calculateDifferences(orgI, orgJ, allAA): allDiffs = list() foundGenes = 0
import sys import os sys.path.insert(0, str(os.path.dirname(os.path.realpath(__file__))) + "/../") from database.genomedb import GenomeDB from utils.utils import fileLocation if __name__ == '__main__': genomeDB = GenomeDB(fileLocation + "/genomes/") diffGenePairs = {(('AE000511', 'HP_0868'), ('CP001217', 'HPP12_0868')), (('AE000511', 'HP_0036'), ('CP001217', 'HPP12_0032')), (('AE000511', 'HP_0963'), ('CP001217', 'HPP12_0958')), (('AE000511', 'HP_1282'), ('CP001217', 'HPP12_1248')), (('AE000511', 'HP_0568'), ('CP001217', 'HPP12_0574')), (('AE000511', 'HP_0286'), ('CP001217', 'HPP12_0285')), (('AE000511', 'HP_0519'), ('CP001217', 'HPP12_0525')), (('AE000511', 'HP_0104'), ('CP001217', 'HPP12_0106')), (('AE000511', 'HP_0091'), ('CP001217', 'HPP12_0094')), (('AE000511', 'HP_0342'), ('CP001217', 'HPP12_0337')), (('AE000511', 'HP_0656'), ('CP001217', 'HPP12_0669')), (('AE000511', 'HP_0043'), ('CP001217', 'HPP12_0038')), (('AE000511', 'HP_0108'), ('CP001217', 'HPP12_0110')), (('AE000511', 'HP_1213'), ('CP001217', 'HPP12_1179')), (('AE000511', 'HP_1105'), ('CP001217', 'HPP12_1070')), (('AE000511', 'HP_0661'), ('CP001217', 'HPP12_0674')), (('AE000511', 'HP_0430'), ('CP001217', 'HPP12_0992')), (('AE000511', 'HP_0048'), ('CP001217', 'HPP12_0042')), (('AE000511', 'HP_0860'), ('CP001217', 'HPP12_0860')),
required=True) parser.add_argument('--redo', action='store_true', help='input', default=False) restrictOrgs = ['AE001439', 'AE000511', 'CP001217'] restrictOrgs = None args = parser.parse_args() print("Loading Hom DB") homDB = HomologyDatabase.loadFromFile(args.location.name) print("Loading Genomes") genomDB = GenomeDB(os.path.dirname(args.location.name) + "/genomes", loadAll=False) allorgs = homDB.get_all_organisms() if restrictOrgs: allorgs = restrictOrgs for org in allorgs: genomDB.loadGenome(org) print("Loading HomDB analyser") analyse = HomDBAnalyser(homDB, genomDB, loadAll=False) maxNumberEntries = len(allorgs) maxAllowedDissimWithinCluster = 0.25
from database.genomedb import GenomeDB
from database.homDBAnalyser import HomDBAnalyser
from database.homologydb import HomologyDatabase

if __name__ == '__main__':

    baseDIR = '/mnt/c/dev/data/haas/homdb/'

    genomeDB = GenomeDB(baseDIR + "/genomes", loadAll=False)
    homDB = HomologyDatabase.loadFromFile(baseDIR + "/hpp_comb")

    analyse = HomDBAnalyser(homDB, genomeDB)

    def printHOM(homid):
        """Print the aligned records of one cluster and summarise them.

        :param homid: numeric part of the cluster id ('HOMID' is prepended)
        :return: tuple of (cluster id, longest gap-free sequence,
            set of (gap-free sequence, record id) pairs)
        """
        print(homid)

        clusterID = 'HOMID' + str(homid)
        alignment = analyse.cluster_align(clusterID)

        longestSeq = ""
        seenSeqs = set()

        # Records are visited in order of their id.
        for record in sorted(alignment._records, key=lambda r: r.id):

            # Strip the alignment gaps to get the plain sequence.
            ungapped = str(record.seq).replace('-', '')
            seenSeqs.add((ungapped, record.id))

            if len(ungapped) > len(longestSeq):
                longestSeq = ungapped

            # NOTE(review): assumed to run for every record, not only for a
            # new longest one - the collapsed source is ambiguous here.
            print(record.seq, record.id)

        return (clusterID, longestSeq, set(seenSeqs))
import sys, os sys.path.insert( 0, str(os.path.dirname(os.path.realpath(__file__))) + "/../../helipyloridb") from database.genomedb import GenomeDB from database.homologydb import HomologyDatabase if __name__ == '__main__': fileLocation = "/mnt/c/dev/data/haas/homdb/" homDB = HomologyDatabase.loadFromFile(fileLocation + "/hpp_split") genomDB = GenomeDB(fileLocation + "/genomes", loadAll=False) allorgs = homDB.get_all_organisms() extra = ['AE001439', 'CP009259'] mc = [ '4_N1-031C1', '2_N1-025A2', '14_1-20A_UB64', '13_N5-004A1', '3_N1-029C1', '11_N4-029C2', '10_N2-085C2', '1_N1-024A1' ] nmc = [ x for x in allorgs if not x in mc and not x in extra and not x.startswith("6_") ] # and not x.startswith("15") print("MC", len(mc), mc) print("NMC", len(nmc), nmc) homlist = []