def test_load_peptide_file(self):
    """Does LoadPeptideFile() store peptide data correctly?

    Loads the FASTA fixture, then the peptide fixture, and checks that every
    expected peptide is present (per accession, in order) in the loaded data.
    """
    import os

    # Build the fixture paths with os.path.join instead of a literal
    # "TEST_files\TEST..." string: the backslash form contains an invalid
    # escape sequence and only resolves on Windows.
    fasta_path = os.path.join("TEST_files", "TEST.fasta")
    peptide_path = os.path.join("TEST_files", "TEST_peptides.txt")

    self.pdl.LoadFastaFile(fasta_path)

    pep = peptide.Peptide()
    pep.sequence = "MEDVCGRLVQYRGE"
    pep.accession = "NP_000032"
    pep.pos_start = 125
    pep.pos_end = 138
    expectedpep = {"NP_000032": [pep]}

    testpep = self.pdl.LoadPeptideFile(peptide_path, self.pdl.proteins)

    for accession, expected_list in expectedpep.items():
        # Fail with a readable message if the accession was not loaded at all.
        self.assertIn(accession, testpep,
                      msg="Sequence in expected dictionary not found")
        # Compare position by position against the loaded peptides.
        for index, expected in enumerate(expected_list):
            self.assertPeptideEqual(expected, testpep[accession][index])
def test_smart_query_peptide(self):
    """Can SmartQuery properly build a peptide query?

    Passes a single-accession dictionary to SmartQuery and checks that the
    generated query text contains the expected SELECT clause.
    """
    # The original built a Peptide fixture here that was never passed to
    # SmartQuery or compared against anything; it has been dropped.
    testquery = self.querybuilder.SmartQuery({"NP_01": 1}, False)
    expectedsubstring = "SELECT * FROM fulltable WHERE (prot_acc = 'NP_01' )"
    # assertIn reports both operands on failure, unlike assertTrue(a in b).
    self.assertIn(expectedsubstring, testquery)
def addCalPeptide(self, peps4spec):
    '''
    @brief processes the peptide data for a single spectrum, recording only the
           first peptide whose sequence is valid on the spectrum entry
    @param peps4spec <dictionary>: containing all peptides for one spectrum

    Side effects visible in this method:
      - self.spectra[query] receives 'numpeps' (incremented), the copied peptide
        attributes and 'hitType' for the first valid peptide
      - self.stats 'numpeps' / 'numfailedpeps' counters are incremented

    Raises IndexError if no key of peps4spec matches rx_pepmaindata.
    '''
    stats = self.stats

    pepkeys = sorted(x for x in peps4spec if rx_pepmaindata.search(x))

    # The query number is the text before the first '_' with its leading
    # character removed (presumably keys look like 'q<query>_p<rank>' - confirm).
    query = int(pepkeys[0].split('_')[0][1:])
    spec = self.spectra[query]

    self.logs.datlog.debug('Query %i has %i peptides' % (query, len(pepkeys)))

    # A plain string sort can leave the key ending in '10' in second position;
    # move it to the end so it is processed last.
    if len(pepkeys) > 1 and pepkeys[1][-2:] == '10':
        pepkeys.append(pepkeys.pop(1))

    copied_attrs = ('sequence', 'pepno', 'modsVariable', 'modsFixed',
                    'mass', 'da_delta', 'score')
    rev_prefixes = ('DD', '###REV###', '###RND###')

    for pep in pepkeys:
        # Collect every entry belonging to this peptide key. The key is escaped
        # so it is matched literally, and the negative look-ahead stops a key
        # from also matching a longer key that continues with more digits.
        rx_pepdata = re.compile('^' + re.escape(pep) + '(?![0-9])')
        allpeptidedata = {k: v for k, v in peps4spec.items()
                          if rx_pepdata.search(k)}

        pepobj = peptide.Peptide(allpeptidedata, self)

        if not pepobj.isValidSequence():
            stats['numfailedpeps'] += 1
            continue

        # only include peptides if the sequence is valid
        spec['numpeps'] += 1
        stats['numpeps'] += 1

        for attr in copied_attrs:
            spec[attr] = getattr(pepobj, attr)

        # 'REV' only when every linked protein accession carries one of the
        # listed prefixes (an empty protein list also yields 'REV').
        only_rev = all(x['accession'].startswith(rev_prefixes)
                       for x in pepobj.proteins)
        spec['hitType'] = 'REV' if only_rev else 'FWD'

        # only process one peptide per query
        break
def test_load_peptide(self):
    """Check that LoadPeptide() converts one input line into the expected Peptide."""
    # Reference peptide the loader is expected to reproduce.
    wanted = peptide.Peptide()
    for field, value in (("sequence", "PEPTIDE"),
                         ("accession", "NP_01"),
                         ("pos_start", 1),
                         ("pos_end", 7)):
        setattr(wanted, field, value)

    # Register the parent protein so the loader can locate the peptide in it.
    self.pdl.proteins["NP_01"] = "APEPTIDESEQUENCE"

    loaded = self.pdl.LoadPeptide("R.PEPTIDE.C\tref|NP_01.1|gi|111|")

    self.assertPeptideEqual(wanted, loaded)
def LoadPeptide(self, line):
    """Build a Peptide from one whitespace-separated input line.

    The first field is passed to ParseSequence() and the second to
    ParseAccession(). If the accession is a key of self.proteins, the
    peptide is located inside that protein string:

      - found:     pos_start is the 0-based index returned by str.find and
                   pos_end is the index of the peptide's last residue
      - not found: pos_start and pos_end are both -1

    If the accession is unknown, the position attributes are left as
    peptide.Peptide() initialised them.

    Raises IndexError if the line has fewer than two fields.
    """
    #Get the data
    fields = line.strip().split()
    p = peptide.Peptide()
    p.sequence = self.ParseSequence(fields[0])
    p.accession = self.ParseAccession(fields[1])

    #If the accession number is in the full protein sequences we loaded from the
    # fasta file, then find where the peptide lies within the protein
    if p.accession in self.proteins:
        start = self.proteins[p.accession].find(p.sequence)
        p.pos_start = start
        if start == -1:
            # str.find signals "absent" with -1; do not derive an end
            # coordinate from it (it used to become len(sequence) - 2).
            p.pos_end = -1
        else:
            p.pos_end = start + len(p.sequence) - 1
    return p
def _read_filtered_results(result_path, charge, length, qvalue):
    """Return (specfile, scannum, charge, peptide) tuples for rows passing the filters."""
    results = []
    with open(result_path) as fr:
        fr.readline()  # the first line is skipped (presumably a header row)
        for line in fr:
            cols = line.split('\t')
            # Keep rows with the requested charge, stripped peptide length
            # and a q-value not above the threshold.
            if (int(cols[8]) == charge
                    and len(get_strip_sequence(cols[9])) == length
                    and float(cols[15]) <= qvalue):
                # SpecFile, scannum, charge, peptide
                results.append((cols[0], cols[2], cols[8], cols[9]))
    return results


def _read_extracted_intensities(extracted_path, length, ion_type, error_list):
    """Parse an extracted file into {key: intensity list}; return (extracts, error count)."""
    extracts = {}
    n_errors = 0
    temp_key = None
    with open(extracted_path) as fe:
        for line in fe:
            # A blank line ends the current record.
            if line == '\n':
                temp_key = None
                continue
            cols = line.split('\t')
            try:
                if line[0] == '>' and len(get_strip_sequence(cols[2])) == length:
                    # Record header: key is first field (without '>') + second field.
                    temp_key = cols[0][1:] + cols[1]
                    extracts[temp_key] = [0] * (length - 1)
                elif line[0] == ion_type and temp_key is not None:
                    # Ion line: the number after the first character is a
                    # 1-based position into the intensity list.
                    extracts[temp_key][int(cols[0][1:]) - 1] = float(cols[2])
            except Exception:
                # Best effort: note the broken file and keep going. Unlike a
                # bare except, this lets KeyboardInterrupt/SystemExit through.
                error_list.append(fe.name)
                print(fe.name)
                print(cols)
                n_errors += 1
    return extracts, n_errors


def features_and_intensity(dir_path, charge, length, qvalue, ion_type):
    """Write feature/intensity rows for every 'MergedFDR.tsv' file in dir_path.

    For each file whose name contains 'MergedFDR.tsv':
      1. rows are filtered by charge, stripped peptide length and q-value;
      2. the matching extracted file (name up to '.tsv' + EXTRACTED) is parsed
         into per-spectrum intensity lists of size ``length - 1`` for lines
         starting with ``ion_type``;
      3. one output line per filtered row with a matching spectrum key is
         written to '../data/<ion_type>/<prefix><length>_<charge>_<qvalue>.txt',
         holding the peptide features followed by the intensities.

    After all files, the names of files that produced parse errors and the
    peptides whose intensities sum to zero are written to
    '<tag>_error.txt' and '<tag>_zeros.txt' in the same output directory,
    where <tag> is the last five characters of sys.argv[1].

    Parameters:
        dir_path: directory holding the result and extracted files.
        charge: required charge (compared with int() of column 8).
        length: required stripped peptide length.
        qvalue: upper bound (inclusive) for column 15.
        ion_type: first character of the ion lines to collect; also the
            name of the output sub-directory.

    Returns None. Progress (step index) and the total error count are printed.
    """
    error_count = 0
    step = 0
    error_list = []
    zero_sequence = []
    out_dir = '../data/' + ion_type + '/'

    for file_name in sorted(listdir(dir_path)):
        # If result file, not extracted file
        if 'MergedFDR.tsv' not in file_name:
            continue
        print(step)

        results = _read_filtered_results(
            dir_path + '/' + file_name, charge, length, qvalue)

        extracted_path = (dir_path + '/'
                          + file_name[:file_name.find('.tsv')] + EXTRACTED)
        extracts, n_errors = _read_extracted_intensities(
            extracted_path, length, ion_type, error_list)
        error_count += n_errors

        # Write each file separately
        out_path = (out_dir + file_name[:file_name.find('MSGF')]
                    + str(length) + '_' + str(charge) + '_' + str(qvalue)
                    + '.txt')
        with open(out_path, 'wt', encoding='utf-8') as ffi:
            for spec_file, scan_num, pep_charge, pep_seq in results:
                p = peptide.Peptide(pep_seq, pep_charge, ion_type)
                feat = p.get_features()

                intensity = extracts.get(spec_file + scan_num)
                if intensity is None:
                    # No extracted spectrum for this result row.
                    continue

                row = ''.join(str(feature) + ' ' for feature in feat)
                row += ''.join(str(inten) + ' ' for inten in intensity)
                ffi.write(row + '\n')

                # all intensities are zero
                if sum(float(inten) for inten in intensity) == 0:
                    zero_sequence.append(pep_seq)
        step += 1

    print(error_count)

    run_tag = sys.argv[1][-5:]

    # if file is broken
    with open(out_dir + run_tag + '_error.txt', 'wt',
              encoding='utf-8') as f_error:
        for broken_file in error_list:
            f_error.write(broken_file + '\n')

    # all intensities are zero
    with open(out_dir + run_tag + '_zeros.txt', 'wt',
              encoding='utf-8') as f_zero_sequence:
        for sequence in zero_sequence:
            f_zero_sequence.write(sequence + '\n')
def addPeptides(self, peps4spec):
    '''
    @brief processes the peptide data for a single spectrum: builds Peptide
           objects, runs the peptide-set QC and accumulates per-sequence data
    @param peps4spec <dictionary>: containing all peptides for one spectrum

    Side effects visible in this method:
      - self.peptidecounter grows by the number of peptide keys found
      - self.spectra[query]['numpeps'] and self.stats 'numpeps' /
        'numfailedpeps' are incremented per valid / invalid sequence
      - self.sequences counts occurrences of each retained sequence
      - self.seq2acc keeps, per sequence, the proteins of its first
        occurrence plus running maxima (hook, hookscore, pepscore), the
        occurrence count (numpep) and the lowest pepno (bestczrank)

    Raises IndexError if no key of peps4spec matches rx_pepmaindata.
    '''
    stats = self.stats

    pepkeys = sorted(x for x in peps4spec if rx_pepmaindata.search(x))

    # The query number is the text before the first '_' with its leading
    # character removed (presumably keys look like 'q<query>_p<rank>' - confirm).
    query = int(pepkeys[0].split('_')[0][1:])
    self.logs.datlog.debug('Query %i has %i peptides' % (query, len(pepkeys)))

    # A plain string sort can leave the key ending in '10' in second position;
    # move it to the end so it is processed last.
    if len(pepkeys) > 1 and pepkeys[1][-2:] == '10':
        pepkeys.append(pepkeys.pop(1))

    seq2acc = self.seq2acc
    sequences = self.sequences
    self.peptidecounter += len(pepkeys)

    peplist = []
    for pep in pepkeys:
        # Collect every entry belonging to this peptide key. The key is escaped
        # so it is matched literally, and the negative look-ahead stops a key
        # from also matching a longer key that continues with more digits.
        rx_pepdata = re.compile('^' + re.escape(pep) + '(?![0-9])')
        allpeptidedata = {k: v for k, v in peps4spec.items()
                          if rx_pepdata.search(k)}

        pepobj = peptide.Peptide(allpeptidedata, self)

        # only include peptides if the sequence is valid
        if pepobj.isValidSequence():
            self.spectra[query]['numpeps'] += 1
            stats['numpeps'] += 1
            peplist.append(pepobj)
        else:
            stats['numfailedpeps'] += 1

    # now do the QC of the peptide set
    if peplist:
        self.doPeptideSetQC(peplist)

    hasHook = ''
    for pep in peplist:
        # Peptides flagged as unused (useinprot == 0 or retain == 0) are not
        # accumulated; NOTE(review): presumably set by doPeptideSetQC - confirm.
        if pep.useinprot == 0 or pep.retain == 0:
            continue

        seq = pep.sequence
        score = pep.score
        if pep.is_hook:
            hookscore = score
            hasHook = ', has hook peptide'
        else:
            hookscore = 0.0

        # build dictionary of pep sequence to protein accession
        if seq not in sequences:
            sequences[seq] = 1
            seq2acc[seq] = dict(prots=pep.proteins[:], hook=pep.is_hook,
                                numpep=1, hookscore=hookscore,
                                pepscore=score, bestczrank=pep.pepno)
            continue

        # Sequence seen before: accumulate data independently.
        sequences[seq] += 1
        entry = seq2acc[seq]
        entry['numpep'] += 1
        if pep.is_hook > entry['hook']:
            entry['hook'] = pep.is_hook
        if hookscore > entry['hookscore']:
            entry['hookscore'] = hookscore
        if score > entry['pepscore']:
            entry['pepscore'] = score
        if pep.pepno < entry['bestczrank']:
            entry['bestczrank'] = pep.pepno

    self.logs.datlog.debug('%i peptides pass QC%s' % (len(peplist), hasHook))