def __init__(self, blastResults, k=10):
    out.writeDebug("Initialize Blast alignment by parsing blast results ...")
    # self stuff
    self.hits = []
    # parse the blast results
    hitPattern = re.compile("<Hit>(.*?)</Hit>", re.DOTALL)
    hitIdPattern = re.compile("<Hit_def>(.*?)</Hit_def>")
    hitEValPattern = re.compile("<Hsp_evalue>(.*?)</Hsp_evalue>")
    hitFromPattern = re.compile("<Hsp_hit-from>(.*?)</Hsp_hit-from>")
    hitToPattern = re.compile("<Hsp_hit-to>(.*?)</Hsp_hit-to>")
    # for each hit in the xml, keep at most the first k hits
    i = 0
    for hit in hitPattern.finditer(blastResults):
        i += 1
        if i > k:
            break
        text = hit.group(0)
        hit_id = hitIdPattern.search(text).group(1)
        hit_e_value = hitEValPattern.search(text).group(1)
        hit_from = hitFromPattern.search(text).group(1)
        hit_to = hitToPattern.search(text).group(1)
        self.hits.append({
            "hit_id": hit_id,
            "hit_value": float(hit_e_value),
            "hit_from": int(hit_from),
            "hit_to": int(hit_to),
            "hit_order": False,
            "method": "blast",
        })
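# A minimal sketch of the XML fragment shape the regex parser above expects
# (hand-written sample, not real blast2 output):
#
#   <Hit>
#     <Hit_def>P12345</Hit_def>
#     <Hsp_evalue>1e-30</Hsp_evalue>
#     <Hsp_hit-from>5</Hsp_hit-from>
#     <Hsp_hit-to>40</Hsp_hit-to>
#   </Hit>
#
# Blast(sample).hits would then contain a single entry:
#   {'hit_id': 'P12345', 'hit_value': 1e-30, 'hit_from': 5, 'hit_to': 40,
#    'hit_order': False, 'method': 'blast'}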
def trainprediction(self, data=None, biased=False, maxEpochs=10000):
    """ Trains the neural network with the provided training data and
        returns True if the training was successful """
    if not data:
        out.writeDebug('No training data! The net stays initialized with random weights!')
        return False
    # create a supervised data set from the training nodes
    ds = SupervisedDataSet(len(self.features), 2)
    reduced_dataset = [set([]), set([])]
    for node, target in data:
        featuresValue = []
        for feature in self.features:
            featuresValue.append(feature(self, node, None, node.querySequence))
        if target:
            reduced_dataset[0].add(tuple(featuresValue + [ACCEPTED, NOTACCEPTED]))
        else:
            reduced_dataset[1].add(tuple(featuresValue + [NOTACCEPTED, ACCEPTED]))
    # zip truncates to the smaller class, so this default dataset is balanced 1:1
    for posInstance, negInstance in zip(reduced_dataset[0], reduced_dataset[1]):
        ds.addSample(posInstance[:-2], posInstance[-2:])
        ds.addSample(negInstance[:-2], negInstance[-2:])
    if biased:
        # keep the natural class distribution instead of the balanced one
        ds = SupervisedDataSet(len(self.features), 2)
        for instance in reduced_dataset[0]:
            ds.addSample(instance[:-2], instance[-2:])
        for instance in reduced_dataset[1]:
            ds.addSample(instance[:-2], instance[-2:])
    out.writeDebug('Start training neural net with %s training examples. Dataset bias is set to %s' % (len(ds), biased))
    trainer = BackpropTrainer(self.net, ds)
    trainer.trainUntilConvergence(maxEpochs=maxEpochs)
    return True
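# Sketch of the balancing step above, with toy feature tuples and the
# (assumed) target encoding ACCEPTED = 1, NOTACCEPTED = -1:
#
#   pos = set([(0.9, 1, -1), (0.8, 1, -1)])                # 2 positives
#   neg = set([(0.1, -1, 1), (0.2, -1, 1), (0.3, -1, 1)])  # 3 negatives
#   pairs = zip(pos, neg)  # 2 pairs; the surplus negative is dropped,
#                          # so the default (biased=False) set is 1:1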
def localHHBLITS(seq="NWLGVKRQPLWTLVLILWPVIIFIILAITRTKFPP", database="../data/PP2db", minEVal=1):
    import time
    out.writeDebug("Do a local hhblits search for {} in {}".format(seq, database))
    # build a (practically) collision-free stamp for the temporary files
    char_set = string.ascii_uppercase + string.digits
    time_stamp = str(int(time.time())) + ''.join(random.sample(char_set * 6, 40)) + str(random.randint(0, 100000000))
    seq_file = ''
    seq = str(seq)
    if re.match('^[A-Z]*$', seq):
        # seq is a raw sequence: write it to a temporary fasta file
        seq_file = 'hhblits_input_' + time_stamp
        fh = open(seq_file, 'w')
        fh.write('>no_header\n' + seq)
        fh.close()
    else:
        # seq is already a path to a sequence file
        seq_file = seq
    outfile = 'hhblits_' + time_stamp + '.out'
    command = "hhblits -i {} -o {} -d {} -e {} -n 1".format(seq_file, outfile, database, minEVal)
    try:
        hhblitsResults = subprocess.check_output(command, stderr=subprocess.STDOUT, shell=True)
        hhblitsResults = open(outfile).read()
        os.remove(outfile)
    except subprocess.CalledProcessError as err:
        out.writeLog("hhblits search for {} in {} returned with exit code {}!".format(seq, database, err.returncode))
        hhblitsResults = ''
    if seq != seq_file:
        # remove the temporary input file we created ourselves
        os.remove(seq_file)
    return HHBLITS(hhblitsResults)
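# Example invocation (sketch; the database path is this repo's default, the
# sequence is an arbitrary illustration):
#
#   aln = localHHBLITS(seq="MKTAYIAKQR", database="../data/PP2db", minEVal=0.001)
#
# which shells out to roughly:
#   hhblits -i hhblits_input_<stamp> -o hhblits_<stamp>.out -d ../data/PP2db -e 0.001 -n 1
#
# Passing a file path instead of a raw upper-case sequence skips the
# temporary input file and hands the path to hhblits directly.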
def __init__(self, hhblitsResults, k=10):
    out.writeDebug("Initialize hhblits alignment ...")
    # self stuff
    self.hits = []
    if hhblitsResults == '#unknown error':
        return
    # parse the hhblits results
    beginOfResults = False
    lines = hhblitsResults.split('\n')
    i = 0
    for line in lines:
        # skip empty lines
        if not line.strip():
            continue
        # stop at the end of the output
        if line.find('Done') != -1:
            break
        # skip all lines before the actual search results begin
        if beginOfResults:
            i += 1
            if line.startswith('No 1') or i > k:
                # alignment section starts (or k hits were read): end of the hit table
                break
            items = line.split()
            hit_id, hit_e_value, query_hit = items[1], items[3], items[9]
            hit_from, hit_to = query_hit.split('-')[:2]
            hit_to = hit_to.split('(')[0]
            self.hits.append({'hit_id': hit_id,
                              'hit_value': float(hit_e_value),
                              'hit_from': int(hit_from),
                              'hit_to': int(hit_to),
                              'hit_order': False,
                              'method': 'hhblits'})
        else:
            if line.find("No Hit") != -1:
                beginOfResults = True
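# Sketch of an hhblits summary-table line as split above, assuming
# single-token hit names as produced by this pipeline's database (columns:
# No, Hit, Prob, E-value, P-value, Score, SS, Cols, Query HMM, Template HMM;
# the values are illustrative):
#
#    1 P12345  99.9  1e-30  1e-35  200.0   0.0  120    1-120    5-125(130)
#
# items[1] -> 'P12345', items[3] -> '1e-30', items[9] -> '5-125(130)',
# which yields hit_from=5 and hit_to=125 after stripping the '(130)' length.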
def localBlast(seq="NWLGVKRQPLWTLVLILWPVIIFIILAITRTKFPP", database="../data/genes_UniProt.fasta", minEVal=1):
    out.writeDebug("Do a local blast search for {} in {}".format(seq, database))
    blastResults = commands.getstatusoutput(
        'echo "{}" | blast2 -p blastp -d {} -N -e {} -m 7'.format(seq, database, minEVal)
    )
    if blastResults[0] != 0:
        out.writeLog(
            "Blast search for {} in {} returned with exit code {}!".format(seq, database, blastResults[0])
        )
    return Blast(blastResults[1])
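# Note on the call above: -m 7 requests XML output, which is exactly the
# format Blast.__init__ parses. The commands module is Python 2 only; a
# rough subprocess-based equivalent (sketch) would be:
#
#   status, output = 0, ''
#   try:
#       output = subprocess.check_output(
#           'echo "{}" | blast2 -p blastp -d {} -N -e {} -m 7'.format(seq, database, minEVal),
#           stderr=subprocess.STDOUT, shell=True)
#   except subprocess.CalledProcessError as err:
#       status, output = err.returncode, err.output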
def learn_parameters(hpoGraph, uni2hpoDict, dataset):
    out.writeDebug('Start training the predictor.')
    from predictor import Predictor
    neuralNet = Predictor(None)
    # in cross-training, the test set is the crossTrain fold and the
    # crossTrain slot holds the (here ignored) real test fold
    crossTrainSet = {'train': dataset['train'], 'crossTrain': dataset['test'], 'test': dataset['crossTrain']}
    trainingNodes = train_result_set(hpoGraph, uni2hpoDict, crossTrainSet)
    out.writeDebug('Collected all the nodes for training')
    if shortcut:
        neuralNet.trainprediction(trainingNodes, maxEpochs=10)
    else:
        neuralNet.trainprediction(trainingNodes)
    return neuralNet
def runprediction(self, querySequence, graph):
    """ Scores every node in the graph: node.accepted should be False for
        all nodes beforehand; this function sets node.accepted to the
        prediction confidence (between -2 and 2) for each node. """
    # def acceptNodeAndParentNodes(graph, node):
    #     node.accepted = True
    #     stack = graph.getParents(node)
    #     while len(stack) != 0:
    #         cNode = graph.getHpoTermById(stack.pop())
    #         cNode.accepted = True
    #         stack.extend(graph.getParents(cNode))
    for cNodeID, cNode in graph.hpoTermsDict.iteritems():
        # ok, get the node to predict
        out.writeDebug("Perform prediction for node: {}".format(cNode.id))
        # get all features for the current node
        featuresValue = []
        for feature in self.features:
            featuresValue.append(feature(self, cNode, graph, querySequence))
        # ok, now run the neural network
        predictionResult = self.net.activate(featuresValue)
        out.writeDebug("Prediction result for node {} = {}".format(cNode.id, predictionResult))
        # check the prediction result:
        # the difference lies between -2 (lowest confidence) and 2 (highest confidence);
        # ideally, predictionResult is (1, -1) for accepted and (-1, 1) for not accepted
        confidence = predictionResult[0] - predictionResult[1]
        out.writeDebug("Prediction confidence: {}".format(confidence))
        # ok, store the confidence in the accepted attribute
        cNode.accepted = confidence
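# Confidence scale sketch: with the two output neurons ideally in [-1, 1],
# confidence = out[0] - out[1] lies in [-2, 2]. The main script later
# rescales this to [0, 1] for the output via (confidence + 2) / 4, e.g.
# (1, -1) -> confidence 2 -> score 1.00, and (-1, 1) -> -2 -> score 0.00.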
def reduce_sequences():
    # use a set of sequences with reduced sequence redundancy as basis for
    # validation; the set was created with CD-HIT at 80% sequence similarity
    out.writeDebug('Prepare sequence similarity reduced data set from %s' % reducedFile)
    reduced_sequences = []
    for record in SeqIO.parse(open(reducedFile), 'fasta'):
        reduced_sequences.append((record.id, record.seq))
    shuffle(reduced_sequences)
    # also take care to reserve sequences that are in the same cluster as the test sequences
    out.writeDebug('Digest sequence clusterings from %s' % clusterFile)
    sequenceCluster = {}
    representative = ''
    sequences = set([])
    for line in open(clusterFile):
        if not line.strip():
            continue
        if line.startswith('>'):
            # new cluster: store the finished one and start over
            if representative:
                sequenceCluster[representative] = sequences
            representative = ''
            sequences = set([])
        else:
            sequence = line.split('>')[1].split('.')[0]
            sequences.add(sequence)
            if '*' in line:
                # representative sequences are marked with a star
                representative = sequence
    # store the last cluster as well
    sequenceCluster[representative] = sequences
    return (reduced_sequences, sequenceCluster)
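# Sketch of the CD-HIT .clstr format parsed above (illustrative entries):
#
#   >Cluster 0
#   0   164aa, >P12345... *
#   1   160aa, >Q67890... at 85.00%
#
# line.split('>')[1].split('.')[0] extracts the accession ('P12345'),
# and the '*' marks the cluster representative.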
def cross_validate(sequences, folds=10):
    import hpoParser, time
    starttime = time.time()
    dataset_size = len(sequences)
    allPredictions = []
    # init the hpoParser
    hpoGraph = hpoParser.HpoGraph(hpoFile)
    # init the hpo-identifier dict
    out.writeDebug('Initialize dictionary with true annotations from %s' % hpoMappingFile)
    uni2hpoDict = {}
    f = open(hpoMappingFile)
    for line in f:
        line = line.strip()
        uni2hpoDict.update({line.split("\t")[0]: line.split("\t")[1].split(",")})
    f.close()
    # create folds
    for i in range(folds):
        now = (time.time() - starttime) / 60
        minutes = int(now % 60)
        hours = int(now / 60)
        out.writeDebug('Start with fold %s of %s' % (i + 1, folds))
        out.writeDebug('Time elapsed: %s:%s' % (hours, minutes))
        # test fold
        test = sequences[i:dataset_size:folds]
        # fold to learn parameters on
        crossTrain = sequences[(i + 1) % folds:dataset_size:folds]
        # folds to train on; keeping train and crossTrain separate is not
        # strictly necessary here, since both splits are preserved anyway
        train = []
        for j in range(folds):
            if j != i and j != (i + 1) % folds:
                train = train + sequences[j:dataset_size:folds]
        dataset = {'train': train, 'crossTrain': crossTrain, 'test': test}
        # learn the parameters, however they will look;
        # the parameters should be a neural net that recognizes valid annotations
        predictor = learn_parameters(hpoGraph, uni2hpoDict, dataset)
        predictor.saveNeuronalNetwork('neuronalNetwork_Fold%s' % i)
        # test the parameters on the independent test fold
        allPredictions.append(predict_set(hpoGraph, uni2hpoDict, dataset, predictor))
        predictions = allPredictions[-1]
        print '***fold %s (FN = %s):***' % ((i + 1), predictions[1])
        for predictedSequence, predictedTerms in predictions[0]:
            for predictedNode in predictedTerms:
                print predictedSequence, predictedNode.id, predictedNode.accepted, predictedNode.TruePrediction
        if shortcut:
            break
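# Fold layout sketch (folds=3 over 7 sequences, indices shown): the slice
# pattern sequences[i::folds] interleaves the folds, e.g.
#   i=0: test=[0,3,6]  crossTrain=[1,4]    train=[2,5]
#   i=1: test=[1,4]    crossTrain=[2,5]    train=[0,3,6]
#   i=2: test=[2,5]    crossTrain=[0,3,6]  train=[1,4]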
def train_result_Sequence(hpoGraph, uni2hpoDict, dataset, name='', seq=''):
    out.writeDebug('Get training data for sequence name %s with sequence: %s' % (name, seq))
    import blast, hhblits
    # similar sequences
    blastResults = blast.Blast.localBlast(seq=seq, database=blastDbFile)
    hhblitsResults = hhblits.HHBLITS.localHHBLITS(seq=str(seq), database=hhblitsDbFile)
    # now get the hpo-identifiers for each similar sequence
    for hit in blastResults.hits:
        hit.update({"hpoTerms": uni2hpoDict[hit["hit_id"]]})
    for hit in hhblitsResults.hits:
        hit.update({"hpoTerms": uni2hpoDict[hit["hit_id"]]})
    # set of hits to ignore to avoid information leakage
    reserved = set([])
    reserved.add(name)
    # add the sequences in the associated clusters
    #for representative, sequence in dataset['crossTrain']:
    #    reserved = reserved | sequenceCluster[representative]
    for representative, sequence in dataset['test']:
        reserved = reserved | sequenceCluster[representative]
    # build and merge trees
    graph, hit_id = None, 0
    for hit in blastResults.hits:
        # take only hits from the training set, ignore hits from the test or crossTrain set
        if hit['hit_id'] in reserved:
            out.writeDebug('Skip hit %s in database that is in the test data' % (hit['hit_id']))
            continue
        subtree = hpoGraph.getHpoSubGraph(hit['hpoTerms'], {hit_id: hit})
        hit_id += 1
        if graph == None:
            graph = subtree
        else:
            graph += subtree
    for hit in hhblitsResults.hits:
        if hit['hit_id'] in reserved:
            out.writeDebug('Skip hit %s in database that is in the test data' % (hit['hit_id']))
            continue
        subtree = hpoGraph.getHpoSubGraph(hit['hpoTerms'], {hit_id: hit})
        hit_id += 1
        if graph == None:
            graph = subtree
        else:
            graph += subtree
    # get training nodes
    trainingNodes = []
    if graph != None:
        for node in graph.hpoTermsDict:
            # skip the root term
            if node == 'HP:0000001':
                continue
            ValidPrediction = False
            if node in uni2hpoDict[name]:
                ValidPrediction = True
            graph.hpoTermsDict[node].querySequence = seq
            # copy node attributes for training
            trainingNodes.append((graph.hpoTermsDict[node].copy(), ValidPrediction))
    hpoGraph.clearAttr()
    # return the set of training nodes with target variable
    return trainingNodes
def __init__(self, hpoFile="../data/hp.obo"):
    """ Initialize an hpo graph from an hpo file """
    # debug message
    if hpoFile != None:
        out.writeDebug("parsing hpo file " + str(hpoFile))
    # init main class variable
    self.hpoTermsDict = {}
    self.isSubTree = hpoFile == None
    # if the file to parse is None, an empty HpoGraph will be returned
    if hpoFile == None:
        return
    # helper function to analyse the lines
    def _analyseLines(self, lines):
        """ Analyse the parsed lines (helper function) """
        # nothing to do for an empty block (e.g. a file starting with [Term])
        if not lines:
            return
        # file descriptor or hp term?
        if lines[0].startswith("[Term]"):
            # add a hpoterm by the hpoterm's description
            for line in lines:
                # do nothing if the HpoTerm is_obsolete
                if line.startswith('is_obsolete:'):
                    return
            term = HpoTerm(lines[1:])
            self.hpoTermsDict.update({term.id.split(" ")[0]: term})
        else:
            for line in lines:
                # ok, get the position of the ":"
                attrName = line[:line.find(":")].strip()
                attrVal = line[line.find(":") + 1:].strip()
                # now add this as an attribute
                if hasattr(self, attrName):
                    if isinstance(getattr(self, attrName), list):
                        getattr(self, attrName).append(attrVal)
                    else:
                        setattr(self, attrName, [getattr(self, attrName), attrVal])
                else:
                    setattr(self, attrName, attrVal)
    # ok, parse the lines in the file
    try:
        f = file(hpoFile, "r")
        lines = []
        for line in f:
            # skip empty lines
            if line.strip() == "":
                continue
            # do something for non-empty lines
            if line.startswith("[Term]"):
                _analyseLines(self, lines)
                lines = [line]
            else:
                lines.append(line)
        _analyseLines(self, lines)
        f.close()
    except Exception as e:
        out.writeError("Error parsing hpo file " + str(e.message) + " " + str(e.args))
    # good, and now create the child relationships
    for key in self.hpoTermsDict:
        node = self.hpoTermsDict[key]
        if hasattr(node, "is_a"):
            if isinstance(node.is_a, list):
                for element in node.is_a:
                    self.hpoTermsDict[element.split(" ")[0]].childrens.append(key)
            else:
                self.hpoTermsDict[node.is_a.split(" ")[0]].childrens.append(key)
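# Minimal sketch of the OBO stanzas parsed above (abridged from hp.obo):
#
#   format-version: 1.2
#
#   [Term]
#   id: HP:0000118
#   name: Phenotypic abnormality
#   is_a: HP:0000001 ! All
#
# Header lines become attributes of the graph object itself; each [Term]
# stanza becomes an HpoTerm keyed by its id, and the is_a lines are inverted
# into the childrens lists afterwards.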
def predictSequence(args, hpoGraph, uni2hpoDict, name="Sequence", seq=""):
    # ok, do the whole thing
    try:
        # debug msg
        out.writeLog('Predict function for protein: id: "' + str(name) + '" sequence: "' + str(seq) + '"')
        # lookup results if available
        foundInLookUp, hits = False, []
        if args.lookupdb:
            out.writeLog("Checking for precalculated results!")
            # ok, load them
            f = open(args.lookupdb, "r")
            for line in f:
                if line.strip() == name.strip():
                    # oh, cool, it's precalculated
                    foundInLookUp = True
                elif foundInLookUp and line.startswith("\t"):
                    # ok, this belongs to the result, load it
                    m = re.search("\t([^\t]*)\t([^\t]*)\t([^\t]*)\t([^\t]*)\t([^\t]*)\t([^\t]*)", line)
                    hits.append({
                        "method": m.group(1),
                        "hit_id": m.group(2),
                        "hit_value": float(m.group(3)),
                        "hit_from": int(m.group(4)),
                        "hit_to": int(m.group(5)),
                        # bool("False") would be True, so compare against the string
                        "hit_order": m.group(6) == "True",
                    })
                elif foundInLookUp:
                    break
            f.close()
        # ok, first of all, get similar sequences!
        if not foundInLookUp:
            out.writeLog("Check blast and hhblits for sequence orthologs!")
            blastResults = blast.Blast.localBlast(seq=seq, database=args.blastDbFile, minEVal=args.blastMinEVal)
            for hit in blastResults.hits:
                out.writeDebug("Blast: found hit: " + str(hit))
            hhblitsResults = hhblits.HHBLITS.localHHBLITS(seq=str(seq), database=args.hhblitsDbFile)
            for hit in hhblitsResults.hits:
                out.writeDebug("hhblits: found hit: " + str(hit))
            hits.extend(blastResults.hits)
            hits.extend(hhblitsResults.hits)
        # reduce hits for a fast prediction
        if args.fast:
            out.writeLog("Reduce hits for faster prediction!")
            hitsTmp = sorted(hits, key=lambda t: t["hit_value"])
            hits = hitsTmp[:6]
        # now get the hpo-identifiers for each similar sequence
        out.writeLog("uniprot ids ({}) 2 HPO Terms".format(len(hits)))
        for hit in hits:
            try:
                # Do not output this, it might be some GB output
                # out.writeDebug("found hpoTerms for " + str( hit[ "hit_id" ] ) + ": " + str( uni2hpoDict[ hit[ "hit_id" ] ] ) )
                hit.update({"hpoTerms": uni2hpoDict[hit["hit_id"]]})
            except KeyError:
                out.writeWarning("MISSING HPO TERMS FOR HIT: " + str(hit))
        # build and merge trees
        out.writeLog("Build and merge tree for similar sequences!")
        graph, hit_id = hpoGraph.getHpoSubGraph(hpoGraph.getRoot()), 0
        for hit in hits:
            # skip hits for which no hpo terms were found (warned above)
            if "hpoTerms" not in hit:
                continue
            # out.writeDebug("@blast merging: {}".format(hit))
            subtree = hpoGraph.getHpoSubGraph(hit["hpoTerms"], {hit_id: hit})
            hit_id += 1
            graph += subtree
        # do the prediction
        out.writeLog("Run main prediction!")
        # init the predictor
        p = predictor.Predictor(args.neuronalNet)
        p.runprediction(seq, graph)
        # always accept the root
        for root in hpoGraph.getRoot():
            graph.getHpoTermById(root).accepted = 1
        # do the output
        out.writeLog("writing output")
        for node in graph.getAcceptedNodes(args.minimalConfidence):
            out.writeOutput("{}\t{}\t{}".format(name, node.id, "%.*f" % (2, (node.accepted + 2) / 4)))
        # svg image desired?
        if args.createSvgImage:
            out.writeLog("Create a svg image showing all results!")
            if graph != None:
                graph.writeSvgImage(fileName=str(name) + ".svg")
            else:
                out.writeWarning("Can't create a svg image from an empty tree!")
        # clear attrs from all tree nodes, so that these don't interfere with later predictions
        # out.writeLog("Clear memory for next prediction")
        # hpoGraph.clearAttr()
    except Exception as err:
        exc_type, exc_obj, exc_tb = sys.exc_info()
        out.writeError("Predicting Error: " + str(err) + " on line: " + str(exc_tb.tb_lineno))
        exit(1)
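# Sketch of the lookup database format parsed above (\t marks a literal tab;
# the values are illustrative): a line with the bare sequence name, followed
# by one tab-indented line per precalculated hit with six tab-separated
# fields (method, hit_id, hit_value, hit_from, hit_to, hit_order):
#
#   MySequence
#   \tblast\tP12345\t1e-30\t5\t40\tFalse
#   \thhblits\tQ67890\t1e-10\t1\t35\tFalse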