def checkingSons(peaks):
    """
    bring back both, actually works, may be used only for the display
    and not inside the data structure

    Flattens nested fragCluster hierarchies: for every peak, fragments that
    themselves own a fragCluster are merged into the parent's fragCluster and
    scheduled for removal.  Returns the surviving peaks sorted by mass.
    NOTE(review): structure reconstructed from collapsed source — the exact
    indentation of the two trailing `return`s inside recursiveMerging is
    assumed (return after the loop, then the function's final return).
    """
    toRemove = MSPeakList()  # peaks absorbed into a parent cluster

    def recursiveMerging(root):
        # Pull grandchildren up into root.fragCluster, depth-first.
        if not root in toRemove:
            for frag in root.fragCluster:
                if frag.fragCluster:
                    toRemove.addPeak(frag)
                    for son in frag.fragCluster:
                        if son != root:
                            root.fragCluster.append(son)
                            #toRemove.addPeak(son)#dont know if it is really necessary
                    #else:
                    #    print "must do other not yet implemented"
                    # deduplicate after the merge
                    root.fragCluster = MSPeakList(list(set(root.fragCluster)))
                    recursiveMerging(frag)  #before was just one step ahead
            return
        return

    for peak in peaks:
        recursiveMerging(peak)
    lastPeakList = MSPeakList()
    for p in peaks:
        if p not in toRemove:
            lastPeakList.append(p)
    return sorted(lastPeakList, key=lambda x: x.mass())
def checkingSons(peaks):
    """
    bring back both, actually works, may be used only for the display
    and not inside the data structure

    Collapse nested fragCluster hierarchies: any fragment that owns its own
    fragCluster is merged into its parent's cluster and dropped from the
    result.  Surviving peaks are returned sorted by mass.
    """
    toRemove = MSPeakList()  # fragments absorbed by a parent

    def recursiveMerging(root):
        # Depth-first: lift grandchildren into root.fragCluster.
        if root in toRemove:
            return
        for frag in root.fragCluster:
            if not frag.fragCluster:
                continue
            toRemove.addPeak(frag)
            for son in frag.fragCluster:
                if son != root:
                    root.fragCluster.append(son)
            # drop duplicates introduced by the merge
            root.fragCluster = MSPeakList(list(set(root.fragCluster)))
            recursiveMerging(frag)
        return

    for peak in peaks:
        recursiveMerging(peak)

    survivors = MSPeakList()
    for candidate in peaks:
        if candidate not in toRemove:
            survivors.append(candidate)
    return sorted(survivors, key=lambda pk: pk.mass())
def covMatrix(lspl, **k):
    """
    idea use the leastsq method to calculate the covariance matrix

    Build, for every untreated peak of every sample in *lspl*, a row of
    correlation values against matching peaks of the other samples.

    Keyword arguments:
        rtError: allowed retention-time drift
        ppm: mass precision
    Returns a list of rows; each row starts with (mass, rt, xmlfile).
    """
    #depacking the arguments
    rtError = k.get('rtError')
    ppm = k.get('ppm')
    matrix = []
    for sple in lspl:
        treatedPeaks = MSPeakList()
        for peak in sple.peaks:
            if peak not in treatedPeaks:
                peaks = MSPeakList()
                rValues = []
                rValues.append((peak.mass(), peak.rt, sple.xmlfile))
                peaks.append(peak)
                for s in lspl:
                    # FIX: was `if s != spl:` — `spl` is undefined (NameError);
                    # the current sample is `sple`.
                    if s != sple:
                        common = s.peaks.peakInMZRTRange(
                            peak.mass(), peak.rt, ppm, rtError)
                        if len(common) > 1:
                            #do something eliminate some peaks to get the closest in rt and mz
                            treatedPeaks.extend(common)
                            peaks.extend(common)
                        if not common:
                            pass
                #treat peak
                for i in range(len(peaks)):
                    corr = 0.
                    isosA, fragsA = peaks[i].isoAreaList(
                        includeM0=True), peaks[i].fragAreaList()
                    for j in range(i + 1, len(peaks)):
                        #mean of the frags and the iso
                        #idea ponderate with the length more length more weighted
                        isosB, fragsB = peaks[j].isoAreaList(
                            includeM0=True), peaks[j].fragAreaList()
                        corr += r_coef(isosA, isosB)
                        corr += r_coef(fragsA, fragsB)
                        # NOTE(review): corr is not reset per j, so later entries
                        # accumulate earlier pairs — confirm intended
                        rValues.append(corr / 2.)
                matrix.append(rValues)
    return matrix
def covMatrix(lspl, **k):
    """
    idea use the leastsq method to calculate the covariance matrix

    For each untreated peak of each sample, collect the matching peaks of the
    other samples and append a row of pairwise correlation values.

    Keyword arguments:
        rtError: allowed retention-time drift
        ppm: mass precision
    Returns a list of rows; each row starts with (mass, rt, xmlfile).
    """
    #depacking the arguments
    rtError = k.get('rtError')
    ppm = k.get('ppm')
    matrix = []
    for sple in lspl:
        treatedPeaks = MSPeakList()
        for peak in sple.peaks:
            if peak not in treatedPeaks:
                peaks = MSPeakList()
                rValues = []
                rValues.append((peak.mass(), peak.rt, sple.xmlfile))
                peaks.append(peak)
                for s in lspl:
                    # FIX: was `if s != spl:` — `spl` undefined (NameError);
                    # the loop variable is `sple`.
                    if s != sple:
                        common = s.peaks.peakInMZRTRange(peak.mass(), peak.rt,
                                                         ppm, rtError)
                        if len(common) > 1:
                            #do something eliminate some peaks to get the closest in rt and mz
                            treatedPeaks.extend(common)
                            peaks.extend(common)
                        if not common:
                            pass
                #treat peak
                for i in range(len(peaks)):
                    corr = 0.
                    isosA, fragsA = peaks[i].isoAreaList(
                        includeM0=True), peaks[i].fragAreaList()
                    for j in range(i + 1, len(peaks)):
                        #mean of the frags and the iso
                        #idea ponderate with the length more length more weighted
                        isosB, fragsB = peaks[j].isoAreaList(
                            includeM0=True), peaks[j].fragAreaList()
                        corr += r_coef(isosA, isosB)
                        corr += r_coef(fragsA, fragsB)
                        # NOTE(review): corr accumulates across j (no reset) —
                        # confirm intended before relying on these values
                        rValues.append(corr / 2.)
                matrix.append(rValues)
    return matrix
def clusterComparison(list_):  #receive a list of peak with clusters identified
    """
    return the best peak
    WARNING: p_ydata and p_.y_data are None
    TODO:

    Among the peaks of *list_*, keep those whose fragCluster has the maximal
    size; if several remain, pick by (currently all-zero) correlation score.
    """
    bySize = sorted(list_, key=lambda pk: len(pk.fragCluster))
    biggest = len(bySize[-1].fragCluster)

    candidates = MSPeakList()
    for pk in bySize:
        if len(pk.fragCluster) == biggest:
            candidates.append(pk)

    if len(candidates) == 1:
        return candidates[0]

    # correlation scoring disabled (y_data is None), so scores stay at zero
    corr = np.array([0.] * len(candidates))
    #for i, p in enumerate(candidates):
    #    for p_ in p.fragCluster:
    #        corr[i] += r_coef(p_.y_data, p.y_data)
    best = max_f(corr)
    return candidates[np.where(corr == best)[0][0]]
def clusterComparison(list_):  #receive a list of peak with clusters identified
    """
    return the best peak
    WARNING: p_ydata and p_.y_data are None
    TODO:

    Keeps the peaks whose fragCluster has maximal length; when several tie,
    the winner is chosen from an (currently all-zero) correlation array, so
    effectively the first tied peak is returned.
    """
    sortedList = sorted(list_, key=lambda x: len(x.fragCluster))
    longest = len(sortedList[-1].fragCluster)
    sameSizePeaks = MSPeakList()

    for p in sortedList:
        if len(p.fragCluster) == longest:
            sameSizePeaks.append(p)

    if len(sameSizePeaks) == 1:
        return sameSizePeaks[0]
    # scores remain zero: the correlation loop below is disabled because
    # y_data is None (see WARNING above)
    corr = np.array([0.] * len(sameSizePeaks))
    #for i, p in enumerate(sameSizePeaks):
    #    for p_ in p.fragCluster:
    #        corr[i] += r_coef(p_.y_data, p.y_data)
    m = max_f(corr)
    return sameSizePeaks[np.where(corr == m)[0][0]]
def clusteringBASIC(peaks, adds, **k): if not peaks: return t=time.clock() errorRt = k.get('rtError', 6) #ppm = float(kwargs.get('ppm'))/10**6 ppm = k.get('ppm') if ppm is None: try: ppm = peaks[0].sample.ppm/1e6 except AttributeError: print "No value found for ppm setting to 10/1E6" ppm = 10./1e6 #mode = k.get('mode', 'HighRes') resolveConflicts=k.get('resolveConflicts', False) addsToCheck=np.array(adds.keys()) adductsFound = MSPeakList() for i, p in enumerate(peaks): a = MSClusterList() for v in addsToCheck: m = p.mz+v[0] match = peaks.peaksInMZRTRange(m, p.rt, errorRt, deltam= 2 * ppm * m) if match is None or not match: continue #take the closest in mass goodP = sorted(match, key=lambda x:abs(x.mz - (p.mz + v[0])))[0] #if goodP in set(adductsFound): # if resolveConflicts: # pass #else: if goodP is p: continue a.append(goodP) goodP.parentPeak=p adductsFound.append(goodP) p.fragCluster=MSPeakList(set(a))#prevent from duplicates # def clusterComparison(list_):#receive a list of peak with clusters identified # """ # return the best peak # WARNING: p_ydata and p_.y_data are None # TODO: # # """ # sortedList = sorted(list_, key=lambda x: len(x.fragCluster)) # longest=len(sortedList[-1].fragCluster) # sameSizePeaks=MSPeakList() # # for p in sortedList: # if len(p.fragCluster) == longest: # sameSizePeaks.append(p) # # if len(sameSizePeaks) == 1: # return sameSizePeaks[0] # corr=np.array([0.] 
* len(sameSizePeaks)) # #for i, p in enumerate(sameSizePeaks): # # for p_ in p.fragCluster: # # corr[i] += r_coef(p_.y_data, p.y_data) # m=max_f(corr) # return sameSizePeaks[np.where(corr == m)[0][0]] # # if resolveConflicts: # for add in set(adductsFound): # if len(add.parentPeak) <= 1: # #print "%s belong to several fragCluster"%str(add) # continue # #print "%s belong to several fragCluster"%str(add) # goodParent=clusterComparison(add.parentPeak) # #if goodParent is not None: # # add.parentPeak = [goodParent] # # for parent in add.parentPeak: # if parent != goodParent: # try: # parent.fragCluster.remove(add) # except ValueError: # print "Error removing %s from fragCluster of %s"%(str(add), str(parent)) # add.parentPeak = [goodParent] #the same of constructing a list 'toRemove then remove # #print "after removing len add.parentPeak", len(add.parentPeak) print "TiemElapsed: %s"%str(time.clock()-t) return peaks, adductsFound
def clusteringCAMERA(peaks, adducts, **kwargs): """ arguments needed: error_rt:rt_ drift ppm:precision useCorrelation: if we calculate correlations """ t=time.clock() #unpack parameters error_rt = kwargs.get('rtError', 6) #ppm = float(kwargs.get('ppm'))/10**6 ppm=peaks[0].sample.ppm/1e6 mode=kwargs.get('mode', 'HighRes') resolveConflicts=kwargs.get('resolveConflicts', False) peaks_with_iso=peaks print "peaklist length",len(peaks) adducts_to_check=np.array(adducts.keys()) #=========================================================================== #START CAMERA ALGORITHM print ("RT Grouping ...") #RT_peak=peaks_with_iso.rtClustering(error_rt) #3,find for each peak peaks which matches with retention time rtPeak =[] for i, peak in enumerate(peaks_with_iso.ipeaks()): l=MSPeakList() l.addPeak(peak) for j, peak_ in enumerate(peaks_with_iso.ipeaks()): if i!=j: if abs(peak.rt - peak_.rt) < error_rt: l.append(peak_) isIncluded=False index=[] for k, rtClust in enumerate(rtPeak): if set(l)<=(set(rtClust)):#inclusion test of l already in rt ? 
seen as 'equivalent to' isIncluded=True break if set(rtClust) <= (set(l)): index.append(k) #break #del rtPeak[index] rtPeak= [rtPeak[i] for i in xrange(len(rtPeak)) if i not in index] if not isIncluded: rtPeak.append(MSPeakList(l)) #isIncluded=True #else: # if rtClust.__eq__(l): # rtPeak[k]=l # break #isIncluded=True #if not isIncluded: #l.sort(key=lambda x:x.mass()) # with open('test1.txt', 'w') as f: # for r in rtPeak: # s="" # for i, p in enumerate(r): # s+=str(p)+';' if i<len(r)-1 else str(p)+'\n' # f.write(s) #EXPERIMENTAL CODE # cl=[] # for cluster in rtPeak: # list_=[];datapoints={} # for i, p in enumerate(cluster): # correspondingPeaks=set() # correspondingPeaks.add(p) # for j in xrange(i+1, len(cluster)): # #put caching on that to avoid recalculation each time of the datapoints # try: # r=r_coef(list(datapoints[p]), list(datapoints[cluster[j]])) # except KeyError: # y, y_= None, None # try: # y=datapoints[p] # except KeyError: # x, y= massExtractionBisectAlgo(p.sample,p.mass(), ppm) # datapoints[p]=y # # try: # y_=datapoints[cluster[j]] # except KeyError: # x, y_= massExtractionBisectAlgo(cluster[j].sample, cluster[j].mass(), ppm) # datapoints[cluster[j]]=y_ # r=r_coef(y, y_) # if r >= threshold: # correspondingPeaks.add(cluster[j]) # list_.append(correspondingPeaks) # # for i, p in enumerate(list_): # for j in xrange(i+1, len(list_)): # if list_[j].issubset(p): # continue # else: # cl.append(MSPeakList(list(p))) #merging step again # print "cluster length, same without replicates",len(cl), len(set(map(set, [x for x in cl]))) # with open('test2.txt', 'w') as f: # for r in cl: # s="" # for i, p in enumerate(r): # s+=str(p)+';' if i<len(r)-1 else str(p)+'\n' # f.write(s) # #END EXPERIMENTAL CODE print 'len RTpeak', len(rtPeak) print ("Creating possible M0...") #Cython code finalList = massGenPerGroup(rtPeak, adducts_to_check, ppm) print("Mapping of calculated mass on peaklist...") #4,see if one matches with peak in the raw peaklist goodPeak=[]#list will 
contain good peak per rtCluster for i, dic in enumerate(finalList): matchingMass=defaultdict(list) for mass in dic.iterkeys(): p = rtPeak[i].peaksInMZRange(mass, deltam=mass * ppm if mode=='HighRes' else 1.)#rtPeak[i] not necessarily sorted warning if not p: continue peak=sorted(p, key=lambda x:abs(mass - x.mass()))[0] #if peak not in matchingMass.keys():#may avoid this to see if one peak appears several times !then do 'set' # matchingMass[peak]=[] matchingMass[peak] += dic[mass] goodPeak.append(matchingMass) #start new stuffs here print ("Merging informations...") #conflicts=False adds=MSPeakList()#object sor storing adducts found newGoodPeaks=defaultdict(list)#{} for peaksInOneRtGroup in goodPeak: for peak in peaksInOneRtGroup.iterkeys(): newGoodPeaks[peak] += peaksInOneRtGroup[peak] for p in newGoodPeaks.iterkeys(): p.fragCluster=MSClusterList(list(set(newGoodPeaks[p]))) for f in p.fragCluster: f.parentPeak.append(p) adds += p.fragCluster finalPeaks=MSPeakList(newGoodPeaks.keys()) print ("Resolving conflicts if any...") #removing peak that appears many times that is to say in different clusters def clusterComparison(list_):#receive a list of peak with clusters identified """ return the best peak WARNING: p_ydata and p_.y_data are None TODO: """ sortedList = sorted(list_, key=lambda x: len(x.fragCluster)) longest=len(sortedList[-1].fragCluster) sameSizePeaks=MSPeakList() for p in sortedList: if len(p.fragCluster) == longest: sameSizePeaks.append(p) if len(sameSizePeaks) == 1: return sameSizePeaks[0] corr=np.array([0.] 
* len(sameSizePeaks)) #for i, p in enumerate(sameSizePeaks): # for p_ in p.fragCluster: # corr[i] += r_coef(p_.y_data, p.y_data) m=max_f(corr) return sameSizePeaks[np.where(corr == m)[0][0]] if resolveConflicts: for add in set(adds): if len(add.parentPeak) <= 1: #print "%s belong to several fragCluster"%str(add) continue #print "%s belong to several fragCluster"%str(add) goodParent=clusterComparison(add.parentPeak) #if goodParent is not None: # add.parentPeak = [goodParent] for parent in add.parentPeak: if parent != goodParent: try: parent.fragCluster.remove(add) except ValueError: print "Error removing %s from fragCluster of %s"%(str(add), str(parent)) add.parentPeak = [goodParent] #the same of constructing a list 'toRemove then remove #print "after removing len add.parentPeak", len(add.parentPeak) #make the annotation for peak in finalPeaks.ipeaks(): for f in peak.fragCluster: #results = makeAnnotations(adducts_to_check, adducts, f.mass(), ppm) for annot in adducts.iterkeys(): p = f.mass() / annot[1] + annot[0] diff = peak.mass()*ppm if mode =='HighRes' else 1 if peak.mass() > p-diff and peak.mass() < p+diff: f.annotation[annot]=adducts[annot] break finalPeaks=checkingSons(finalPeaks) #5,second filter, correlation on the isotopic cluster between samples # if useCorrelation: # print "Calculating correlation between samples..." # interSamplesCorr(spl, **kwargs) # print "Calculating correlation intra sample..." # intraSampleCorr(spl) # #6 merging print "Merging interesting peaks" for peak in peaks_with_iso.ipeaks():#wring merging must take out those which allow to construct this peak if peak not in finalPeaks and peak not in adds:#matching_peaks: finalPeaks.append(peak) #matching_peaks to if not finalPeaks: print ("no cluster found, please increase the ppm, or rt drift parameters") print ("finished, time elapsed:",time.clock()-t) return MSPeakList(sorted(finalPeaks, key=lambda x:x.mass)), adds#checkingSons(finalPeaks), adds
def isotopicPeakListFinder(peaks, isomasses, **kwargs): """ assign an isotopic cluster for each peak, and try to find an idms we may use a system like the CAMERA algorithm to see... input: list of peak must an obj.MSPeakList object clusterLength = 6 never go to six in LOW_RES size expected of an isotopic cluster rtError: maximum drift of the retention time decreaseOrder: allow or not allow that the successive peak of the isotopic cluster intensity are going down, can be confusing for finding idms output: two MSPeakList, the first one corresponding to the peaks with an isotopic cluster and the other one all peaks belonging to an isotopic cluster """ #unpacking parameters print "Isotopic cluster calculation..." rtError = np.float(kwargs.get('rtError', 6)) ppm=np.float(peaks[0].sample.ppm/1e6) MAX_GAP_ALLOWED = np.int(len(isomasses)) decreaseOrder = kwargs.get('decreaseOrder', True) #we use the less restrictive... mode = kwargs.get('mode', 'Highres') #sort isomasses #isomasses = sorted(isomasses, key=lambda x:x[0]) peaks_with_iso =MSPeakList() peaks_without_iso = MSPeakList()#peaks without isotopic cluster but which does not have a isotopic cluster list_iso = set()#MSPeakList() t = time.clock() for peak in peaks.ipeaks():#iterating over peaks if peak in list_iso: continue#avoid to calculate for every peaks isoCluster= MSClusterList() gap = 0 #isos = resolutionAdjustment(isomasses, peak.mass()*ppm) if mode=='HighRes' else isomasses for i, isomass in enumerate(sorted(isomasses, key=lambda x:x[0])): #pic = _getMatchingPeaks(peaks, peak, isomass[0], ppm, rtError) mass=isomass[0] massToCheck=peak.mass()+mass p = peaks.peaksInMZRange(massToCheck, deltam=ppm*massToCheck if mode=='HighRes' else 1.) 
#deltart matchingRtPeaks = MSPeakList()#will contain all matching peak in rt for pk in p.ipeaks(): if pk != peak: if abs(peak.rt - pk.rt) <= rtError: matchingRtPeaks.append(pk) if matchingRtPeaks: pic = sorted(matchingRtPeaks, key=lambda pics: abs(pics.mass()-peak.mass()))[0] #take the closest in mass if pic is not None: if decreaseOrder:#we want peak area inferior a peak #if isoCluster: areaToCompare=isoCluster[-1].area if isoCluster else peak.area if areaToCompare < pic.area:#idms found ??? break if pic not in list_iso:#pic not in isoCluster and isoCluster.append(pic) list_iso.add(pic) else: gap+=1 if gap >=MAX_GAP_ALLOWED: break # #set parent for all peaks found if isoCluster: for pics in isoCluster: #pics.parentPeak=peak pics.parentPeak.append(peak) peak.isoCluster = isoCluster peaks_with_iso.addPeak(peak) else: peaks_without_iso.addPeak(peak) # for p in peaks.ipeaks(): # if p not in peaks_with_iso and p not in list_iso: # peaks_without_iso.addPeak(p) print time.clock()-t print "peaks with isotopes: " ,len(peaks_with_iso) print "list isotopes: " ,len(list_iso) print "peaks without isotopes: " ,len(peaks_without_iso) return peaks_with_iso+peaks_without_iso, list_iso
def clusteringBASIC(peaks, adds, **k): if not peaks: return t = time.clock() errorRt = k.get('rtError', 6) #ppm = float(kwargs.get('ppm'))/10**6 ppm = k.get('ppm') if ppm is None: try: ppm = peaks[0].sample.ppm / 1e6 except AttributeError: print "No value found for ppm setting to 10/1E6" ppm = 10. / 1e6 #mode = k.get('mode', 'HighRes') resolveConflicts = k.get('resolveConflicts', False) addsToCheck = np.array(adds.keys()) adductsFound = MSPeakList() for i, p in enumerate(peaks): a = MSClusterList() for v in addsToCheck: m = p.mz + v[0] match = peaks.peaksInMZRTRange(m, p.rt, errorRt, deltam=2 * ppm * m) if match is None or not match: continue #take the closest in mass goodP = sorted(match, key=lambda x: abs(x.mz - (p.mz + v[0])))[0] #if goodP in set(adductsFound): # if resolveConflicts: # pass #else: if goodP is p: continue a.append(goodP) goodP.parentPeak = p adductsFound.append(goodP) p.fragCluster = MSPeakList(set(a)) #prevent from duplicates # def clusterComparison(list_):#receive a list of peak with clusters identified # """ # return the best peak # WARNING: p_ydata and p_.y_data are None # TODO: # # """ # sortedList = sorted(list_, key=lambda x: len(x.fragCluster)) # longest=len(sortedList[-1].fragCluster) # sameSizePeaks=MSPeakList() # # for p in sortedList: # if len(p.fragCluster) == longest: # sameSizePeaks.append(p) # # if len(sameSizePeaks) == 1: # return sameSizePeaks[0] # corr=np.array([0.] 
* len(sameSizePeaks)) # #for i, p in enumerate(sameSizePeaks): # # for p_ in p.fragCluster: # # corr[i] += r_coef(p_.y_data, p.y_data) # m=max_f(corr) # return sameSizePeaks[np.where(corr == m)[0][0]] # # if resolveConflicts: # for add in set(adductsFound): # if len(add.parentPeak) <= 1: # #print "%s belong to several fragCluster"%str(add) # continue # #print "%s belong to several fragCluster"%str(add) # goodParent=clusterComparison(add.parentPeak) # #if goodParent is not None: # # add.parentPeak = [goodParent] # # for parent in add.parentPeak: # if parent != goodParent: # try: # parent.fragCluster.remove(add) # except ValueError: # print "Error removing %s from fragCluster of %s"%(str(add), str(parent)) # add.parentPeak = [goodParent] #the same of constructing a list 'toRemove then remove # #print "after removing len add.parentPeak", len(add.parentPeak) print "TiemElapsed: %s" % str(time.clock() - t) return peaks, adductsFound
def clusteringCAMERA(peaks, adducts, **kwargs): """ arguments needed: error_rt:rt_ drift ppm:precision useCorrelation: if we calculate correlations """ t = time.clock() #unpack parameters error_rt = kwargs.get('rtError', 6) #ppm = float(kwargs.get('ppm'))/10**6 ppm = peaks[0].sample.ppm / 1e6 mode = kwargs.get('mode', 'HighRes') resolveConflicts = kwargs.get('resolveConflicts', False) peaks_with_iso = peaks print "peaklist length", len(peaks) adducts_to_check = np.array(adducts.keys()) #=========================================================================== #START CAMERA ALGORITHM print("RT Grouping ...") #RT_peak=peaks_with_iso.rtClustering(error_rt) #3,find for each peak peaks which matches with retention time rtPeak = [] for i, peak in enumerate(peaks_with_iso.ipeaks()): l = MSPeakList() l.addPeak(peak) for j, peak_ in enumerate(peaks_with_iso.ipeaks()): if i != j: if abs(peak.rt - peak_.rt) < error_rt: l.append(peak_) isIncluded = False index = [] for k, rtClust in enumerate(rtPeak): if set(l) <= ( set(rtClust) ): #inclusion test of l already in rt ? 
seen as 'equivalent to' isIncluded = True break if set(rtClust) <= (set(l)): index.append(k) #break #del rtPeak[index] rtPeak = [rtPeak[i] for i in xrange(len(rtPeak)) if i not in index] if not isIncluded: rtPeak.append(MSPeakList(l)) #isIncluded=True #else: # if rtClust.__eq__(l): # rtPeak[k]=l # break #isIncluded=True #if not isIncluded: #l.sort(key=lambda x:x.mass()) # with open('test1.txt', 'w') as f: # for r in rtPeak: # s="" # for i, p in enumerate(r): # s+=str(p)+';' if i<len(r)-1 else str(p)+'\n' # f.write(s) #EXPERIMENTAL CODE # cl=[] # for cluster in rtPeak: # list_=[];datapoints={} # for i, p in enumerate(cluster): # correspondingPeaks=set() # correspondingPeaks.add(p) # for j in xrange(i+1, len(cluster)): # #put caching on that to avoid recalculation each time of the datapoints # try: # r=r_coef(list(datapoints[p]), list(datapoints[cluster[j]])) # except KeyError: # y, y_= None, None # try: # y=datapoints[p] # except KeyError: # x, y= massExtractionBisectAlgo(p.sample,p.mass(), ppm) # datapoints[p]=y # # try: # y_=datapoints[cluster[j]] # except KeyError: # x, y_= massExtractionBisectAlgo(cluster[j].sample, cluster[j].mass(), ppm) # datapoints[cluster[j]]=y_ # r=r_coef(y, y_) # if r >= threshold: # correspondingPeaks.add(cluster[j]) # list_.append(correspondingPeaks) # # for i, p in enumerate(list_): # for j in xrange(i+1, len(list_)): # if list_[j].issubset(p): # continue # else: # cl.append(MSPeakList(list(p))) #merging step again # print "cluster length, same without replicates",len(cl), len(set(map(set, [x for x in cl]))) # with open('test2.txt', 'w') as f: # for r in cl: # s="" # for i, p in enumerate(r): # s+=str(p)+';' if i<len(r)-1 else str(p)+'\n' # f.write(s) # #END EXPERIMENTAL CODE print 'len RTpeak', len(rtPeak) print("Creating possible M0...") #Cython code finalList = massGenPerGroup(rtPeak, adducts_to_check, ppm) print("Mapping of calculated mass on peaklist...") #4,see if one matches with peak in the raw peaklist goodPeak = [] #list will 
contain good peak per rtCluster for i, dic in enumerate(finalList): matchingMass = defaultdict(list) for mass in dic.iterkeys(): p = rtPeak[i].peaksInMZRange( mass, deltam=mass * ppm if mode == 'HighRes' else 1.) #rtPeak[i] not necessarily sorted warning if not p: continue peak = sorted(p, key=lambda x: abs(mass - x.mass()))[0] #if peak not in matchingMass.keys():#may avoid this to see if one peak appears several times !then do 'set' # matchingMass[peak]=[] matchingMass[peak] += dic[mass] goodPeak.append(matchingMass) #start new stuffs here print("Merging informations...") #conflicts=False adds = MSPeakList() #object sor storing adducts found newGoodPeaks = defaultdict(list) #{} for peaksInOneRtGroup in goodPeak: for peak in peaksInOneRtGroup.iterkeys(): newGoodPeaks[peak] += peaksInOneRtGroup[peak] for p in newGoodPeaks.iterkeys(): p.fragCluster = MSClusterList(list(set(newGoodPeaks[p]))) for f in p.fragCluster: f.parentPeak.append(p) adds += p.fragCluster finalPeaks = MSPeakList(newGoodPeaks.keys()) print("Resolving conflicts if any...") #removing peak that appears many times that is to say in different clusters def clusterComparison( list_): #receive a list of peak with clusters identified """ return the best peak WARNING: p_ydata and p_.y_data are None TODO: """ sortedList = sorted(list_, key=lambda x: len(x.fragCluster)) longest = len(sortedList[-1].fragCluster) sameSizePeaks = MSPeakList() for p in sortedList: if len(p.fragCluster) == longest: sameSizePeaks.append(p) if len(sameSizePeaks) == 1: return sameSizePeaks[0] corr = np.array([0.] 
* len(sameSizePeaks)) #for i, p in enumerate(sameSizePeaks): # for p_ in p.fragCluster: # corr[i] += r_coef(p_.y_data, p.y_data) m = max_f(corr) return sameSizePeaks[np.where(corr == m)[0][0]] if resolveConflicts: for add in set(adds): if len(add.parentPeak) <= 1: #print "%s belong to several fragCluster"%str(add) continue #print "%s belong to several fragCluster"%str(add) goodParent = clusterComparison(add.parentPeak) #if goodParent is not None: # add.parentPeak = [goodParent] for parent in add.parentPeak: if parent != goodParent: try: parent.fragCluster.remove(add) except ValueError: print "Error removing %s from fragCluster of %s" % ( str(add), str(parent)) add.parentPeak = [ goodParent ] #the same of constructing a list 'toRemove then remove #print "after removing len add.parentPeak", len(add.parentPeak) #make the annotation for peak in finalPeaks.ipeaks(): for f in peak.fragCluster: #results = makeAnnotations(adducts_to_check, adducts, f.mass(), ppm) for annot in adducts.iterkeys(): p = f.mass() / annot[1] + annot[0] diff = peak.mass() * ppm if mode == 'HighRes' else 1 if peak.mass() > p - diff and peak.mass() < p + diff: f.annotation[annot] = adducts[annot] break finalPeaks = checkingSons(finalPeaks) #5,second filter, correlation on the isotopic cluster between samples # if useCorrelation: # print "Calculating correlation between samples..." # interSamplesCorr(spl, **kwargs) # print "Calculating correlation intra sample..." # intraSampleCorr(spl) # #6 merging print "Merging interesting peaks" for peak in peaks_with_iso.ipeaks( ): #wring merging must take out those which allow to construct this peak if peak not in finalPeaks and peak not in adds: #matching_peaks: finalPeaks.append(peak) #matching_peaks to if not finalPeaks: print( "no cluster found, please increase the ppm, or rt drift parameters" ) print("finished, time elapsed:", time.clock() - t) return MSPeakList( sorted(finalPeaks, key=lambda x: x.mass)), adds #checkingSons(finalPeaks), adds
def isotopicPeakListFinder(peaks, isomasses, **kwargs): """ assign an isotopic cluster for each peak, and try to find an idms we may use a system like the CAMERA algorithm to see... input: list of peak must an obj.MSPeakList object clusterLength = 6 never go to six in LOW_RES size expected of an isotopic cluster rtError: maximum drift of the retention time decreaseOrder: allow or not allow that the successive peak of the isotopic cluster intensity are going down, can be confusing for finding idms output: two MSPeakList, the first one corresponding to the peaks with an isotopic cluster and the other one all peaks belonging to an isotopic cluster """ #unpacking parameters print "Isotopic cluster calculation..." rtError = np.float(kwargs.get('rtError', 6)) ppm = np.float(peaks[0].sample.ppm / 1e6) MAX_GAP_ALLOWED = np.int(len(isomasses)) decreaseOrder = kwargs.get('decreaseOrder', True) #we use the less restrictive... mode = kwargs.get('mode', 'Highres') #sort isomasses #isomasses = sorted(isomasses, key=lambda x:x[0]) peaks_with_iso = MSPeakList() peaks_without_iso = MSPeakList( ) #peaks without isotopic cluster but which does not have a isotopic cluster list_iso = set() #MSPeakList() t = time.clock() for peak in peaks.ipeaks(): #iterating over peaks if peak in list_iso: continue #avoid to calculate for every peaks isoCluster = MSClusterList() gap = 0 #isos = resolutionAdjustment(isomasses, peak.mass()*ppm) if mode=='HighRes' else isomasses for i, isomass in enumerate(sorted(isomasses, key=lambda x: x[0])): #pic = _getMatchingPeaks(peaks, peak, isomass[0], ppm, rtError) mass = isomass[0] massToCheck = peak.mass() + mass p = peaks.peaksInMZRange( massToCheck, deltam=ppm * massToCheck if mode == 'HighRes' else 1.) 
#deltart matchingRtPeaks = MSPeakList( ) #will contain all matching peak in rt for pk in p.ipeaks(): if pk != peak: if abs(peak.rt - pk.rt) <= rtError: matchingRtPeaks.append(pk) if matchingRtPeaks: pic = sorted(matchingRtPeaks, key=lambda pics: abs(pics.mass() - peak.mass()))[ 0] #take the closest in mass if pic is not None: if decreaseOrder: #we want peak area inferior a peak #if isoCluster: areaToCompare = isoCluster[ -1].area if isoCluster else peak.area if areaToCompare < pic.area: #idms found ??? break if pic not in list_iso: #pic not in isoCluster and isoCluster.append(pic) list_iso.add(pic) else: gap += 1 if gap >= MAX_GAP_ALLOWED: break # #set parent for all peaks found if isoCluster: for pics in isoCluster: #pics.parentPeak=peak pics.parentPeak.append(peak) peak.isoCluster = isoCluster peaks_with_iso.addPeak(peak) else: peaks_without_iso.addPeak(peak) # for p in peaks.ipeaks(): # if p not in peaks_with_iso and p not in list_iso: # peaks_without_iso.addPeak(p) print time.clock() - t print "peaks with isotopes: ", len(peaks_with_iso) print "list isotopes: ", len(list_iso) print "peaks without isotopes: ", len(peaks_without_iso) return peaks_with_iso + peaks_without_iso, list_iso