def __init__(self):
    '''
    @summary: Set up a new instance with a BlastTools helper and the
    verbose flag switched off.
    '''
    # BlastTools is defined outside this block; it is constructed with no
    # arguments and stored for later use.
    self.blastTools = BlastTools()
    # Quiet by default.
    self.verbose = False
class ProcessVCF:
    '''
    @summary: Tool set for dealing with variant call format (VCF) files.
    Specifically, this tool set does the following:
        1. Find expanding regions of variants within a designated distance
           across all samples.
        2. Combine variants in each sample into a report object to be used
           for machine learning analysis of genotype to phenotype.
        3. Create a distance matrix using given variants between given
           strains.
    '''

    def __init__(self):
        '''
        @summary: Create the BLAST helper and default to quiet output.
        '''
        self.blastTools = BlastTools()
        self.verbose = False

    def writeToLog(self, stringValue, logFileH, verbose=False):
        '''
        @summary: Best-effort write of a single line to an open log handle.
        @param stringValue: text to log; a trailing newline is appended.
        @param logFileH: writable file-like object, or None to skip logging.
        @param verbose: when True, the text is also echoed to stdout.
        @return: None (always).
        '''
        if logFileH is None:
            return None
        try:
            if verbose:
                print(stringValue)
            logFileH.write(stringValue + "\n")
        except Exception:
            # Logging is best-effort: report the failure and carry on rather
            # than abort the analysis. Catching Exception (not a bare except)
            # still lets KeyboardInterrupt/SystemExit propagate.
            print("failed to write to log file [%s]" % (logFileH))
        return None

    def replaceSeqTarget(self, seq, newSeq, loc):
        '''
        @summary: Overwrite part of a sequence in place of its old content.
        @param seq: sliceable sequence supporting concatenation with newSeq.
        @param newSeq: replacement text written starting at offset loc.
        @param loc: zero-based offset in seq where newSeq starts.
        @return: seq with len(newSeq) positions from loc replaced by newSeq.
                 If newSeq runs past the end of seq the result is longer
                 than seq.
        '''
        prefix = seq[:loc]
        post = seq[loc + len(newSeq):]
        return prefix + newSeq + post

    def findTargets(self, targetMap, feature, minQuality=10, logFile=None):
        '''
        @summary: Find variant calls in proximity to given feature.
        @param targetMap: mapping whose keys are positions (anything float()
                          accepts) and whose values are records read through
                          the "CHROM", "REF", "ALT" and "QUAL" keys.
        @param feature: object exposing id, location.start.position,
                        location.end.position and qualifiers["query"] /
                        qualifiers["subject"].
        @param minQuality: calls whose QUAL is below this value are reported
                           as failed and left out of the sequences.
        @param logFile: optional open log handle; failed calls are also
                        written to it.
        @return: dict with keys "name", "chrom", "start", "end", "quality",
                 "query", "subject", "refSeq" and "alt-read".
        '''
        start = feature.location.start.position
        end = feature.location.end.position
        # Only variant positions inside [start, end] belong to this feature.
        matches = [loc for loc in targetMap if start <= float(loc) <= end]

        querySeq = feature.qualifiers["query"]
        subjectSeq = feature.qualifiers["subject"]
        # Both tracks start as "_" placeholders, one per subject position.
        placeholder = "_" * len(subjectSeq)
        refSeq = Seq(placeholder)
        readSeq = Seq(placeholder)
        qualityValues = []
        chrom = ''

        for loc in matches:
            # Offset of the variant relative to the start of the feature.
            seqStart = int(float(loc) - start)
            target = targetMap[loc]
            chrom = target["CHROM"]
            ref = target["REF"]
            alts = target["ALT"].split(",")
            quality = target["QUAL"]
            qualityValues.append((seqStart, quality))
            if float(quality) < minQuality:
                message = "(failed) Feature [%s] (%s <-> %s) ===> %s" % (
                    feature.id, start, end, qualityValues)
                print(message)
                # Also record the rejection in the log when one was supplied.
                self.writeToLog(message, logFile)
            else:
                refSeq = self.replaceSeqTarget(refSeq, ref, seqStart)
                # Every ALT allele is written at the same offset, so a later
                # allele overwrites an earlier one wherever they overlap.
                for alt in alts:
                    readSeq = self.replaceSeqTarget(readSeq, alt, seqStart)

        return {
            "name": feature.id,
            "chrom": chrom,
            "start": start,
            "end": end,
            "quality": qualityValues,
            "query": querySeq,
            "subject": subjectSeq,
            "refSeq": refSeq,
            "alt-read": readSeq,
        }

    def annotateAlignment(self, targetMap, featuresArray, idTag):
        '''
        @summary: Run findTargets over every feature and collect the results.
        @param targetMap: variant mapping handed through to findTargets.
        @param featuresArray: iterable of iterables of feature objects.
        @param idTag: inserted into the log file name read_log_<idTag>.txt,
                      which is created (or truncated) in the working
                      directory.
        @return: dict mapping feature.id to its findTargets result; a later
                 feature with the same id replaces an earlier one.
        '''
        readLogName = "read_log_%s.txt" % (idTag)
        result = {}
        # The context manager closes the log even if a feature raises.
        with open(readLogName, "w") as logFP:
            for features in featuresArray:
                for feature in features:
                    result[feature.id] = self.findTargets(
                        targetMap, feature, minQuality=10, logFile=logFP)
        return result

    def alignVCF(self, targetFile, vcfFile, idTag):
        '''
        @summary: Check alignment to annotated genomic sequence using BLAST.
        @param targetFile: path to a FASTA file; it is parsed for a record
                           count and passed to the BLAST helper.
        @param vcfFile: passed to parseVCFFile to obtain the variant mapping.
        @param idTag: tag forwarded to annotateAlignment for the log name.
        @return: the dict produced by annotateAlignment.
        '''
        readRecord = parseVCFFile(vcfFile)
        # Close the FASTA handle as soon as the records are materialised.
        with open(targetFile) as targetHandle:
            targetRecords = list(SeqIO.parse(targetHandle, "fasta"))
        print("Processing [%s] records" % (len(targetRecords)))

        # Use the instance flag; the previous bare name was not bound
        # anywhere in this method.
        verbose = self.verbose
        if verbose:
            print("blasting sequences")
        self.blastTools.verbose = verbose
        # NOTE(review): blastDB and blastExe are neither parameters nor
        # attributes here; presumably module-level settings - confirm they
        # are defined before this method is called.
        blastedFeatures = self.blastTools.seqBlastToFeatures(
            blastDB, blastExe, targetFile,
            blastType="blastn", scoreMin=1e-5)
        if verbose:
            print("finished blasting locations")
        alignmentReport = self.annotateAlignment(
            readRecord, blastedFeatures, idTag)
        return alignmentReport