# Per-peptide accumulators: the loop below appends one entry to each of them
# for every peptide that has at least one UID.
uniq_pept_count = []
pept_probab = []
# amino acids flanking the peptide inside its protein sequence
aa_before = []
aa_after = []
for pept in uniq_pept:
    # pep_dat_pept is indexed by column name below, so it is presumably the
    # slice of pep_info belonging to this peptide - confirm in extract_uids.
    pep_dat_pept,uids = extract_uids(pept,pep_info)
    # peptides without any UID are skipped entirely
    if uids:
        interesting_peptide.append(pept)
        uids_list.append(uids)
        #################################
        # get_single_fasta yields (uid, protein length, sequence record);
        # judging by the list name the uid is the one with the longest
        # sequence - NOTE(review): confirm in get_single_fasta.
        _1,prot_len_fasta,prot_seq_fasta = get_single_fasta(uids,fasta)
        uid_of_maxlen_list.append(_1)
        prot_len.append(prot_len_fasta)
        prot_fasta.append(str(prot_seq_fasta.seq))
        peptide_start_in_protein = ms.stupid_aligner(pept,prot_seq_fasta) # 1-based ...
        # start + length, i.e. the 1-based position right after the peptide
        peptide_stop_in_protein = peptide_start_in_protein + len(pept) # 1-based ...
        pept_positions.append(peptide_start_in_protein)
        # commas are replaced with spaces in the description
        # (presumably to keep a comma-separated output intact - TODO confirm)
        prot_name.append(prot_seq_fasta.description.replace(',',' ')) # long protein name here ...
        # uniq peptide count taken from pep_dat_pept, for definition look up extract_uids...
        # only the first of the unique values is kept
        uniq_pept_count_val = pep_dat_pept['Exclusive unique peptide count'].unique()[0]
        # stricter alternative, kept for reference (exactly one positive value required):
        # uniq_pept_count_val, = pep_dat_pept['Exclusive unique peptide count'][pep_dat_pept['Exclusive unique peptide count']>0].unique()
        uniq_pept_count.append(uniq_pept_count_val)
        # some kind of peptide probability (like a quality score from experimental data)?!
        # the single-target unpacking raises ValueError unless there is exactly one unique value
        pept_probab_val, = pep_dat_pept['Best Peptide identification probability'].unique()
        pept_probab.append(pept_probab_val)
        #################################
        # BEWARE: 1-BASED INDEXING ALL THE WAY ACROSS SO FAR...
        # peptide can start right at N-terminus, so there will be no AminoAcid preceding it, call it a START
        # (1-based start -> 0-based index of the preceding residue is start-2)
        aa_before.append(str(prot_seq_fasta.seq)[peptide_start_in_protein-2] if peptide_start_in_protein>1 else 'START')
        # peptide can end right at C-terminus, so there will be no AminoAcid after it, call it an END
# to be continued ... # to be continued ... # extracting UID from protein accession numbers ... # this way we return None for the Unknown entries ... extract_uid = lambda line: line.split('|')[1] if len(line.split('|'))>1 else None # get a single unique Uniprot ID ... pep_info['uid'] = pep_info['Protein accession numbers'].apply(extract_uid) # fetch protein sequence for each of the Uid-s ... fetching = False if fetching: print "fetching from uniprot.org ..." pep_info['fasta'] = pep_info['uid'].apply(lambda _: ms.get_uniprot(session,_)) print "fetching complete" # Align peptide sequence to the extracted protein sequence and find the peptide starting position ... pep_info['my_start'] = pep_info[ ['Peptide sequence','fasta'] ].apply(lambda _:ms.stupid_aligner(*_),axis='columns') # c = ['Protein name', # 'Protein accession numbers', # 'Database sources', # 'Exclusive unique peptide count', # 'Peptide sequence', # 'Previous amino acid', # 'Next amino acid', # 'Peptide start index', # 'Peptide stop index', # 'Star Category', # 'Assigned', # 'Other Proteins',
uniq_pept_count = [] pept_probab = [] # aa_before = [] aa_after = [] for pept in uniq_pept: pep_dat_pept,uids = extract_uids(pept,pep_info) if uids: interesting_peptide.append(pept) uids_list.append(uids) ################################# _1,prot_len_fasta,prot_seq_fasta = get_single_fasta(uids,fasta) uid_of_maxlen_list.append(_1) prot_len.append(prot_len_fasta) prot_fasta.append(str(prot_seq_fasta.seq)) peptide_start_in_protein = ms.stupid_aligner(pept,prot_seq_fasta) peptide_stop_in_protein = peptide_start_in_protein + len(pept) pept_positions.append(ms.stupid_aligner(pept,prot_seq_fasta)) prot_name.append(prot_seq_fasta.description.replace(',',' ')) # long protein name here ... # uniq peptide count taken from pep_dat_pept, for definition look up extract_uids... uniq_pept_count_val = pep_dat_pept['Exclusive unique peptide count'].unique()[0] # uniq_pept_count_val, = pep_dat_pept['Exclusive unique peptide count'][pep_dat_pept['Exclusive unique peptide count']>0].unique() uniq_pept_count.append(uniq_pept_count_val) # some kind of peptide probability (like a quality score from experimental data)?! pept_probab_val, = pep_dat_pept['Best Peptide identification probability'].unique() pept_probab.append(pept_probab_val) ################################# aa_before.append(str(prot_seq_fasta.seq)[peptide_start_in_protein-1]) aa_after.append(str(prot_seq_fasta.seq)[peptide_stop_in_protein+1] if peptide_stop_in_protein+1<prot_len_fasta else 'END') ######################################### dict_df = {