def check_unimod_pep_input(test_dict):
    '''
    Check one peptide#unimod test case against ursgal.ChemicalComposition.

    Arguments:
        test_dict (dict): test case with the keys 'input' (string handed to
            ChemicalComposition) and 'output' (expected result of
            hill_notation_unimod()). Optional keys are 'aa_compositions',
            'mod_pos_info' and 'aa_pos_info'; the latter two are lists of
            dicts with the keys 'pos' and 'cc_mods'.

    Note:
        If 'aa_compositions' is missing it is added to test_dict (in place)
        using ursgal.chemical_composition_kb.aa_compositions.

    Raises:
        AssertionError: if the hill notation or any per-position composition
            differs from the expectation.
    '''
    if 'aa_compositions' not in test_dict:
        default_compositions = ursgal.chemical_composition_kb.aa_compositions
        test_dict['aa_compositions'] = default_compositions

    composition = ursgal.ChemicalComposition(
        test_dict['input'],
        aa_compositions=test_dict['aa_compositions'],
    )

    # Debug output, shown by the test runner when a case fails.
    print('hill:', composition.hill_notation_unimod())
    print(composition.composition_of_aa_at_pos)
    print(composition.composition_of_mod_at_pos)

    assert composition.hill_notation_unimod() == test_dict['output']

    # Modifications are checked first, amino acids second. Both kinds of
    # expectation store the expected composition under 'cc_mods'.
    per_position_checks = (
        ('mod_pos_info', 'composition_of_mod_at_pos'),
        ('aa_pos_info', 'composition_of_aa_at_pos'),
    )
    for info_key, lookup_name in per_position_checks:
        for expected in test_dict.get(info_key, ()):
            observed = getattr(composition, lookup_name).get(
                expected['pos'], None
            )
            assert observed == expected['cc_mods']
def main(input_file=None, output_file=None, scan_rt_lookup=None,
         peptide_regex_lookup=None, params=None, search_engine=None,
         score_colname=None):
    '''
    Convert the result csv of a search engine into the unified csv format.

    Arguments:
        input_file (str): input filename of csv which should be unified
        output_file (str): output filename of csv after unifying
        scan_rt_lookup (dict): dictionary with entries of scanID to
            retention time under key 'scan_2_rt' (and the time unit under
            key 'unit'), grouped by input file basename
        peptide_regex_lookup (dict): not used by this function, it is
            returned exactly as it was passed in
        params (dict): params as passed by ursgal; the keys 'label',
            'aa_exception_dict', 'modifications' and 'decoy_tag' are read
        search_engine(str): the search engine the csv file stems from
        score_colname (str): the column names of the search engine's
            score (i.e. 'OMSSA:pvalue')

    Returns:
        The object passed as peptide_regex_lookup.

    Raises:
        KeyError: if a scan ID is not part of scan_rt_lookup
        ValueError: if a Spectrum Title does not consist of the four dot
            separated fields <file>.<scan>.<scan>.<charge>
        AssertionError: if a reported modification cannot be parsed or was
            reported on an amino acid it was not defined for
        Exception: if a modification was reported that was not given in
            params

    List of fixes

    All engines
        * Retention Time (s) is correctly set using _ursgal_lookup.pkl
          During mzML conversion to mgf the retention time for every spec
          is stored in a internal lookup and used later for setting the RT.
        * All modifications are checked if they were given in
          params['modifications'], converted to the name that was given
          there and sorted according to their position.
        * Fixed modifications are added in 'Modifications', if not reported
          by the engine.
        * Rows describing the same PSM (i.e. when two proteins share the
          same peptide) are merged to one row.

    X!Tandem
        * 'RTINSECONDS=' is stripped from Spectrum Title if present in .mgf
          or in search result.

    Myrimatch
        * Spectrum Title is corrected
        * 15N label is not formatted correctly these modifications are
          removed for further analysis.
        * When using 15N modifications on amino acids and Carbamidomethyl
          myrimatch reports sometimes Carboxymethylation on Cystein.

    MS-GF+
        * 15N label is not formatted correctly these modifications are
          removed for further analysis.
        * 'Is decoy' column is properly set to true/false
        * Carbamidomethyl is updated and set if label is 15N

    OMSSA
        * Carbamidomethyl is updated and set
        * Selenocystein is not reported with the correct unimod
          modification

    MS-Amanda
        * Selenocystein is not reported with the correct unimod
          modification
        * multiple protein ID per peptide are splitted in two entries.
          (is done in MS-Amanda postflight)
        * short protein IDs are mapped to the full protein ID, it is
          checked which peptides map on which protein ID
          (is done in MS-Amanda postflight)
    '''
    print(
        '''
[ unifycsv ] Converting {0} of engine {1} to unified CSV format...
        '''.format(
            os.path.basename(input_file),
            search_engine,
        )
    )

    # get the rows which define a unique PSM (i.e. sequence+spec+score...)
    psm_defining_colnames = get_psm_defining_colnames(score_colname)
    cc = ursgal.ChemicalComposition()

    use15N = params['label'] == '15N'
    aa_exception_dict = params['aa_exception_dict']
    n_term_replacement = {
        'Ammonia-loss': None,
        'Trimethyl': None,
        'Gly->Val': None,
    }
    fixed_mods = {}
    opt_mods = {}
    modname2aa = {}
    cam = False

    # The position is everything after the LAST ':' so that modification
    # names containing ':' (e.g. 'Delta:H(6)C(3)O(1)') are split correctly.
    mod_pattern = re.compile(r''':(?P<pos>[0-9]*$)''')

    for modification in params['modifications']:
        mod_fields = modification.split(',')
        aa = mod_fields[0]
        mod_type = mod_fields[1]
        pos = mod_fields[2]
        name = mod_fields[3]
        modname2aa.setdefault(name, []).append(aa)
        if 'N-term' in pos:
            n_term_replacement[name] = aa
        if mod_type == 'fix':
            fixed_mods[aa] = name
        if mod_type == 'opt':
            opt_mods[aa] = name
        if 'C,fix,any,Carbamidomethyl' in modification:
            cam = True

    ursgal.GlobalUnimodMapper._reparseXML()

    engine_name = search_engine.lower()
    database_search_engines = [
        'msamanda', 'msgf', 'myrimatch', 'omssa', 'xtandem'
    ]
    database_search = any(
        db_se in engine_name for db_se in database_search_engines
    )

    # if a PSM with multiple rows is found (i.e. in omssa results), the psm
    # rows are merged afterwards
    psm_counter = Counter()
    mz_buffer = {}

    csv_kwargs = {
        'lineterminator': '\n' if sys.platform == 'win32' else '\r\n'
    }

    # Both files are managed by the with statement so that the output file
    # is closed even if a row raises.
    with open(input_file, 'r') as in_file, \
            open(output_file, 'w') as output_file_object:
        csv_input = csv.DictReader(in_file)
        csv_output = csv.DictWriter(
            output_file_object,
            list(csv_input.fieldnames) + ['uCalc m/z'],
            **csv_kwargs
        )
        csv_output.writeheader()

        for line_dict in csv_input:
            #
            # Spectrum Title / Spectrum ID block
            #
            if line_dict['Spectrum Title'] != '':
                # Valid for: OMSSA, MSGF+, X!Tandem
                if 'RTINSECONDS=' in line_dict['Spectrum Title']:
                    line_2_split = line_dict['Spectrum Title'].split(' ')[0]
                else:
                    line_2_split = line_dict['Spectrum Title']
                line_dict['Spectrum Title'] = line_2_split
                title_fields = line_2_split.split('.')
                if len(title_fields) != 4:
                    raise ValueError(
                        'New csv format present for engine {0}, cannot '
                        'parse Spectrum Title {1}'.format(
                            search_engine,
                            line_2_split
                        )
                    )
                input_file_basename, spectrum_id = title_fields[:2]
            elif 'scan=' in line_dict['Spectrum ID']:
                # Valid for: myrimatch
                pure_input_file_name = os.path.basename(
                    line_dict['Raw data location']
                )
                # not using os.path.splitext because we could have multiple
                # file extensions (i.e. ".mzml.gz")
                input_file_basename = pure_input_file_name.split(".")[0]
                spectrum_id = line_dict['Spectrum ID'].split('=')[-1]
                line_dict['Spectrum Title'] = '{0}.{1}.{1}.{2}'.format(
                    input_file_basename,
                    spectrum_id,
                    line_dict['Charge']
                )
            else:
                # Valid for: Novor (empty Spectrum Title, plain Spectrum ID)
                pure_input_file_name = os.path.basename(
                    line_dict['Raw data location']
                )
                input_file_basename = pure_input_file_name.split(".")[0]
                spectrum_id = line_dict['Spectrum ID']
                line_dict['Spectrum Title'] = '{0}.{1}.{1}.{2}'.format(
                    input_file_basename,
                    spectrum_id,
                    line_dict['Charge']
                )
            line_dict['Spectrum ID'] = spectrum_id

            #
            # Retention time block
            #
            try:
                retention_time_in_minutes = \
                    scan_rt_lookup[input_file_basename]['scan_2_rt'][
                        spectrum_id
                    ]
            except KeyError as e:
                error_msg = '''
Could not find scan ID {0} in scan_rt_lookup[ {1} ]
                '''.format(
                    spectrum_id,
                    input_file_basename
                )
                raise KeyError(error_msg) from e

            if scan_rt_lookup[input_file_basename]['unit'] == 'second':
                rt_corr_factor = 1
            else:
                rt_corr_factor = 60
            line_dict['Retention Time (s)'] = \
                float(retention_time_in_minutes) * rt_corr_factor

            #
            # Modification block
            #
            # some engines do not report fixed modifications
            # include in unified csv
            for aa, name in fixed_mods.items():
                for pos, aminoacid in enumerate(
                        line_dict['Sequence'], start=1):
                    if aminoacid != aa:
                        continue
                    fixed_mod = '{0}:{1}'.format(name, pos)
                    reported_mods = line_dict['Modifications'].split(';')
                    # Compare whole entries: a substring test would treat
                    # 'Name:1' as present when only 'Name:12' was reported.
                    if fixed_mod not in reported_mods:
                        reported_mods.append(fixed_mod)
                        line_dict['Modifications'] = ';'.join(reported_mods)

            # Myrimatch and msgf+ can not handle 15N that easily
            # report all AAs moded with unknown modification
            # Note: masses are checked below to avoid any mismatch
            if use15N:
                if 'myrimatch' in engine_name or 'msgf' in engine_name:
                    line_dict['Modifications'] = re.sub(
                        'unknown modification:[0-9]*',
                        '',
                        line_dict['Modifications']
                    )
                if 'myrimatch' in engine_name:
                    if 'Carboxymethyl' in line_dict['Modifications'] \
                            and cam:
                        line_dict['Modifications'] = \
                            line_dict['Modifications'].replace(
                                'Carboxymethyl',
                                'Carbamidomethyl'
                            )
                    elif 'Delta:H(6)C(3)O(1)' in line_dict['Modifications']:
                        line_dict['Modifications'] = \
                            line_dict['Modifications'].replace(
                                'Delta:H(6)C(3)O(1)',
                                'Carbamidomethyl'
                            )

            tmp_mods = []
            for modification in line_dict['Modifications'].split(';'):
                if modification == '':
                    continue
                match = mod_pattern.search(modification)
                if match is None:
                    raise AssertionError('''
The format of the modification {0}
is not recognized by ursgal'''.format(modification))
                pos = int(match.group('pos'))
                mod = modification[:match.start()]

                # Position 0 and 1 both point to the first residue.
                Nterm = pos in (0, 1)
                if Nterm:
                    pos = 1
                aa = line_dict['Sequence'][pos - 1]

                if mod in modname2aa:
                    correct_mod = aa in modname2aa[mod] or (
                        Nterm and '*' in modname2aa[mod]
                    )
                    if not correct_mod:
                        raise AssertionError('''
A modification was reported for an aminoacid for which it was not defined
unify_csv cannot deal with this, please check your parameters and engine output
reported modification: {0} on {1}
modifications in parameters: {2}
'''.format(mod, aa, params['modifications']))
                elif 'unknown modification' == mod:
                    # fixed mods are corrected/added already
                    if aa not in opt_mods:
                        raise AssertionError('''
unify csv does not work for the given unknown modification for
{0} {1}
maybe an unknown modification with terminal position was given?
'''.format(line_dict['Sequence'], modification))
                    modification = '{0}:{1}'.format(opt_mods[aa], pos)
                else:
                    # The engine reported a mass instead of a name.
                    try:
                        name_list = \
                            ursgal.GlobalUnimodMapper.appMass2name_list(
                                round(float(mod), 4),
                                decimal_places=4
                            )
                    except Exception as error:
                        print('''
A modification was reported that was not included in the search parameters
unify_csv cannot deal with this, please check your parameters and engine output
reported modification: {0}
modifications in parameters: {1}
'''.format(mod, params['modifications']))
                        raise Exception(
                            'unify_csv failed because a '
                            'modification was reported that was not '
                            'given in params.'
                        ) from error
                    mapped_mod = False
                    for name in name_list:
                        if name not in modname2aa:
                            continue
                        if aa in modname2aa[name]:
                            modification = '{0}:{1}'.format(name, pos)
                            mapped_mod = True
                        elif Nterm and '*' in modname2aa[name]:
                            modification = '{0}:{1}'.format(name, 0)
                            mapped_mod = True
                    if not mapped_mod:
                        raise AssertionError('''
A mass was reported that does not map on any unimod or userdefined modification
or the modified aminoacid is no the specified one
unify_csv cannot deal with this, please check your parameters and engine output
reported mass: {0}
maps on: {1}
reported modified aminoacid: {2}
modifications in parameters: {3}
'''.format(mod, name_list, aa, params['modifications']))
                tmp_mods.append(modification)
            line_dict['Modifications'] = ';'.join(tmp_mods)

            # N-terminal modifications reported on position 1 are moved to
            # position 0, unless the modification was defined for the first
            # residue itself. Whole entries are compared and rewritten so
            # that 'Name:12' is never turned into 'Name:02'.
            for unimod_name in n_term_replacement:
                nterm_entry = '{0}:1'.format(unimod_name)
                reported_mods = line_dict['Modifications'].split(';')
                if nterm_entry not in reported_mods:
                    continue
                if line_dict['Sequence'][0] in modname2aa.get(
                        unimod_name, []):
                    continue
                line_dict['Modifications'] = ';'.join(
                    '{0}:0'.format(unimod_name)
                    if entry == nterm_entry else entry
                    for entry in reported_mods
                )

            for aa_to_replace, replace_dict in aa_exception_dict.items():
                if aa_to_replace not in line_dict['Sequence']:
                    continue
                # change mods only if unimod has to be changed...
                if 'unimod_name' in replace_dict:
                    for r_pos, aa in enumerate(line_dict['Sequence']):
                        if aa != aa_to_replace:
                            continue
                        unimod_name = replace_dict['unimod_name']
                        if cam:
                            unimod_name = replace_dict[
                                'unimod_name_with_cam'
                            ]
                        new_mod = '{0}:{1}'.format(unimod_name, r_pos + 1)
                        if line_dict['Modifications'] == '':
                            line_dict['Modifications'] += new_mod
                        else:
                            line_dict['Modifications'] += \
                                ';{0}'.format(new_mod)
                line_dict['Sequence'] = line_dict['Sequence'].replace(
                    aa_to_replace,
                    replace_dict['original_aa']
                )

            # remove empty and duplicated entries, sort by position
            if line_dict['Modifications'] != '':
                unique_mods = set()
                for entry in line_dict['Modifications'].split(';'):
                    if entry == '':
                        continue
                    match = mod_pattern.search(entry)
                    if match is None:
                        continue
                    unique_mods.add(
                        (int(entry[match.start() + 1:]),
                         entry[:match.start()])
                    )
                line_dict['Modifications'] = ';'.join(
                    '{m}:{p}'.format(m=mod, p=pos)
                    for pos, mod in sorted(unique_mods)
                )

            #
            # calculate m/z
            #
            upep = line_dict['Sequence'] + '#' + line_dict['Modifications']
            buffer_key = (upep, line_dict['Charge'], params['label'])
            if buffer_key not in mz_buffer:
                cc.use(upep)
                if use15N:
                    number_N = dc(cc['N'])
                    cc['15N'] = number_N
                    del cc['N']
                    if cam:
                        # one 14N per C is kept, see 'cam' above
                        c_count = line_dict['Sequence'].count('C')
                        cc['14N'] = c_count
                        cc['15N'] -= c_count
                mass = cc._mass()
                mz_buffer[buffer_key] = ursgal.ucore.calculate_mz(
                    mass,
                    line_dict['Charge']
                )
            calc_mz = mz_buffer[buffer_key]
            line_dict['uCalc m/z'] = calc_mz
            if 'msamanda' in engine_name:
                # ms amanda does not return calculated mz values
                line_dict['Calc m/z'] = calc_mz

            #
            # protein block, only for database search engine
            #
            if database_search:
                # mzidentml-lib does not always set 'Is decoy' correctly
                # (it's always 'false' for MS-GF+ results), this is fixed
                # here:
                tmp_decoy = set()
                proteins = line_dict[
                    'proteinacc_start_stop_pre_post_;'
                ].split('<|>')
                for protein in proteins:
                    if params['decoy_tag'] in protein:
                        tmp_decoy.add('true')
                    else:
                        tmp_decoy.add('false')
                if len(tmp_decoy) >= 2:
                    print(
                        '''
[ WARNING ] The following peptide occurs in a target as well as decoy protein
[ WARNING ] {0}
[ WARNING ] 'Is decoy' has been set to 'True' '''.format(
                            line_dict['Sequence'],
                        )
                    )
                    line_dict['Is decoy'] = 'true'
                else:
                    line_dict['Is decoy'] = list(tmp_decoy)[0]

            # count each PSM occurence to check whether row-merging is
            # needed:
            psm = tuple([line_dict[x] for x in psm_defining_colnames])
            psm_counter[psm] += 1

            csv_output.writerow(line_dict)

    # if there are multiple rows for a PSM, we have to merge them aka
    # rewrite the csv...
    if psm_counter and max(psm_counter.values()) > 1:
        merge_duplicate_psm_rows(
            output_file, psm_counter, psm_defining_colnames
        )
    return peptide_regex_lookup
}, }, { "composition_1": { "N": 1, "14N": 1, "12C": 1 }, "composition_2": { "N": 2, "C": 1 }, }, ] cc_1 = ursgal.ChemicalComposition() cc_2 = ursgal.ChemicalComposition() def pepitde_with_unimod_test(): for test_id, test_dict in enumerate(TESTS): yield mass_checker, test_dict def mass_checker(test_dict): cc_1.add_chemical_formula(test_dict["composition_1"]) cc_2.add_chemical_formula(test_dict["composition_2"]) assert cc_1._mass() == cc_2._mass() cc_1.clear() cc_2.clear()
def preflight(self):
    '''
    Formatting the command line and writing the param input file via
    self.params

    The translated ursgal params are converted into the MSFragger param
    names/values (self.params_to_write) and written with
    self.write_params_file(). self.mass_shift_lookup and
    self.mass_glycan_lookup are filled with the masses given as mass
    offsets or Y type masses (key: mass * 1e5 rounded to an integer,
    value: set of the names that have this mass).

    Returns:
        dict: self.params

    Raises:
        Exception: if the input file is neither mzML, mzML.gz nor MGF
        SystemExit: if an optional modification has an unknown position
    '''
    self.input_file = os.path.join(
        self.params['input_dir_path'],
        self.params['input_file']
    )
    self.param_file_name = os.path.join(
        self.params['output_dir_path'],
        '{0}_msfragger.params'.format(self.input_file)
    )
    self.created_tmp_files.append(self.param_file_name)

    # further prepare and translate params
    grouped_params = self.params['translations'][
        '_grouped_by_translated_key']

    self.params_to_write = {
        'output_file_extension': 'tsv',  # tsv or pepXML we fix it...
        'output_format': 'tsv',  # pepXML or tsv
        'digest_mass_range': '{0} {1}'.format(
            grouped_params['precursor_min_mass']['precursor_min_mass'],
            grouped_params['precursor_max_mass']['precursor_max_mass']
        )
    }

    # These params are not written as they are, they are either not
    # MSFragger params or combined with other params.
    write_exclusion_list = [
        'precursor_min_mass',
        'precursor_max_mass',
        'precursor_min_charge',
        'precursor_max_charge',
        'label',
        '-Xmx',
        'header_translations',
        'validation_score_field'
    ]

    allowed_ions = (
        'a', 'b', 'c', 'y~', 'x', 'y', 'z', 'b~', 'y-18', 'b-18', 'Y',
    )

    if grouped_params['label']['label'] == '15N':
        self.print_info(
            'Search with label=15N may still be errorprone. Evaluate with care!',
            caller='WARNING'
        )
        for aminoacid, N15_Diff in ursgal.ukb.DICT_15N_DIFF.items():
            existing = False
            for mod_dict in self.params['mods']['fix']:
                if aminoacid == mod_dict['aa']:
                    # the 15N shift is added to the fixed modification
                    mod_dict['mass'] += N15_Diff
                    mod_dict['name'] += '_15N_{0}'.format(aminoacid)
                    existing = True
            if existing:
                continue
            mod_key = 'add_{0}_{1}'.format(
                aminoacid,
                ursgal.chemical_composition_kb.aa_names[aminoacid]
            )
            self.params_to_write[mod_key] = N15_Diff

    self.mass_shift_lookup = {}
    self.mass_glycan_lookup = {}

    for msfragger_param_name in grouped_params.keys():
        for ursgal_param_name, param_value in grouped_params[
                msfragger_param_name].items():
            if msfragger_param_name in write_exclusion_list:
                continue
            elif msfragger_param_name == 'enzyme':
                # search_enzyme_name = Trypsin
                # search_enzyme_cutafter = KR
                # search_enzyme_butnotafter = P
                aa_site, term, inhibitor = param_value.split(';')
                self.params_to_write['search_enzyme_name'] = \
                    self.params['enzyme']
                self.params_to_write['search_enzyme_cutafter'] = aa_site
                self.params_to_write['search_enzyme_butnotafter'] = \
                    inhibitor
            elif msfragger_param_name == 'num_enzyme_termini':
                # num_enzyme_termini = 2 # 2 for enzymatic, 1 for
                # semi-enzymatic, 0 for nonspecific digestion
                if grouped_params['enzyme']['enzyme'] == 'nonspecific':
                    self.params_to_write[msfragger_param_name] = 0
                else:
                    self.params_to_write[msfragger_param_name] = \
                        param_value
            elif msfragger_param_name == 'clear_mz_range':
                min_mz, max_mz = param_value
                self.params_to_write[msfragger_param_name] = \
                    '{0} {1}'.format(min_mz, max_mz)
            elif msfragger_param_name == 'remove_precursor_range':
                min_mz, max_mz = param_value
                self.params_to_write[msfragger_param_name] = \
                    '{0},{1}'.format(min_mz, max_mz)
            elif msfragger_param_name == 'delta_mass_exclude_ranges':
                min_mz, max_mz = param_value
                self.params_to_write[msfragger_param_name] = \
                    '({0},{1})'.format(min_mz, max_mz)
            elif msfragger_param_name == 'precursor_mass_lower':
                self.params_to_write[msfragger_param_name] = \
                    -1 * param_value
            elif msfragger_param_name == 'modifications':
                # maximum of 7 mods - amino acid codes, * for any amino
                # acid, [ and ] specifies protein termini, n and c
                # specifies peptide termini
                # variable_mod_01 = 15.9949 M
                # variable_mod_02 = 42.0106 [*
                # #variable_mod_03 = 79.96633 STY
                # #variable_mod_03 = -17.0265 nQnC
                # #variable_mod_04 = -18.0106 nE
                mass_to_mod_aa = ddict(list)
                for mod_dict in self.params['mods']['opt']:
                    # Example of a mod_dict:
                    # {'_id': 0,
                    #  'aa': '*',
                    #  'composition': {'C': 2, 'H': 2, 'O': 1},
                    #  'id': '1',
                    #  'mass': 42.010565,
                    #  'name': 'Acetyl',
                    #  'org': '*,opt,Prot-N-term,Acetyl',
                    #  'pos': 'Prot-N-term',
                    #  'unimod': True},
                    aa_to_append = mod_dict['aa']
                    pos_modifier = None
                    if mod_dict['pos'] == 'Prot-N-term':
                        pos_modifier = '['
                    elif mod_dict['pos'] == 'Prot-C-term':
                        pos_modifier = ']'
                    elif mod_dict['pos'] == 'N-term':
                        pos_modifier = 'n'
                    elif mod_dict['pos'] == 'C-term':
                        pos_modifier = 'c'
                    elif mod_dict['pos'] == 'any':
                        pass
                    else:
                        print('''
Unknown positional argument for given modification:
{0}
MSFragger cannot deal with this, please use one of the follwing:
any, Prot-N-term, Prot-C-term, N-term, C-term
'''.format(mod_dict['org']))
                        sys.exit(1)
                    if pos_modifier is not None:
                        aa_to_append = '{0}{1}'.format(
                            pos_modifier, aa_to_append)
                    # modifications with the same mass share one line
                    mass_to_mod_aa[mod_dict['mass']].append(aa_to_append)
                for mod_number, (mod_mass, aa_list) in enumerate(
                        mass_to_mod_aa.items(), start=1):
                    self.params_to_write[
                        'variable_mod_0{0}'.format(mod_number)
                    ] = '{0} {1}'.format(mod_mass, ''.join(aa_list))
                for mod_dict in self.params['mods']['fix']:
                    # add_C_cysteine = 57.021464 # added to C - avg.
                    # 103.1429, mono. 103.00918
                    if mod_dict['pos'] == 'Prot-N-term':
                        mod_key = 'add_Nterm_protein'
                    elif mod_dict['pos'] == 'Prot-C-term':
                        mod_key = 'add_Cterm_protein'
                    elif mod_dict['pos'] == 'N-term':
                        mod_key = 'add_Nterm_peptide'
                    elif mod_dict['pos'] == 'C-term':
                        mod_key = 'add_Cterm_peptide'
                    else:
                        mod_key = 'add_{0}_{1}'.format(
                            mod_dict['aa'],
                            ursgal.chemical_composition_kb.aa_names[
                                mod_dict['aa']]
                        )
                    self.params_to_write[mod_key] = mod_dict['mass']
            elif msfragger_param_name == 'override_charge':
                self.params_to_write[msfragger_param_name] = param_value
                if param_value == 1:
                    self.params_to_write['precursor_charge'] = \
                        '{0} {1}'.format(
                            grouped_params['precursor_min_charge'][
                                'precursor_min_charge'],
                            grouped_params['precursor_max_charge'][
                                'precursor_max_charge']
                        )
            elif msfragger_param_name == 'fragment_ion_series':
                ion_list = []
                for ion in param_value:
                    if ion not in allowed_ions:
                        print('''
[ WARNING ] MSFragger does not allow the following ion:
{0}
This ion will be skipped, i.e. not included in the search.
'''.format(ion))
                        continue
                    ion_list.append(ion)
                self.params_to_write[msfragger_param_name] = \
                    ','.join(ion_list)
            elif msfragger_param_name in ('mass_offsets', 'Y_type_masses'):
                cc = ursgal.ChemicalComposition()
                umama = ursgal.UnimodMapper()
                masses = [str(m) for m in param_value['masses']]
                for glycan in param_value['glycans']:
                    cc.clear()
                    cc.add_glycan(glycan)
                    glycan_mass = cc._mass()
                    masses.append(str(glycan_mass))
                    self.mass_glycan_lookup.setdefault(
                        round(glycan_mass * 1e5), set()
                    ).add(glycan)
                for formula in param_value['chemical_formulas']:
                    cc.clear()
                    cc.add_chemical_formula(formula)
                    formula_mass = cc._mass()
                    masses.append(str(formula_mass))
                    self.mass_shift_lookup.setdefault(
                        round(formula_mass * 1e5), set()
                    ).add(formula)
                for unimod_name in param_value['unimods']:
                    unimod_mass = umama.name2mass(unimod_name)
                    masses.append(str(unimod_mass))
                    # The lookup key has to be derived from the mass of
                    # this unimod, not from the mass of a previously
                    # processed glycan/formula.
                    self.mass_shift_lookup.setdefault(
                        round(unimod_mass * 1e5), set()
                    ).add(unimod_name)
                self.params_to_write[msfragger_param_name] = \
                    '/'.join(masses)
            elif msfragger_param_name == 'diagnostic_fragments':
                cc = ursgal.ChemicalComposition()
                umama = ursgal.UnimodMapper()
                masses = list(param_value['masses'])
                for glycan in param_value['glycans']:
                    cc.clear()
                    cc.add_glycan(glycan)
                    masses.append(cc._mass())
                for formula in param_value['chemical_formulas']:
                    cc.clear()
                    cc.add_chemical_formula(formula)
                    masses.append(cc._mass())
                for unimod_name in param_value['unimods']:
                    masses.append(umama.name2mass(unimod_name))
                # diagnostic fragments are written as m/z of charge 1
                mzs = [
                    str(ursgal.ucore.calculate_mz(fragment_mass, 1))
                    for fragment_mass in masses
                ]
                self.params_to_write[msfragger_param_name] = '/'.join(mzs)
            else:
                self.params_to_write[msfragger_param_name] = param_value

    self.write_params_file()

    if self.input_file.lower().endswith(('.mzml', '.mzml.gz', '.mgf')):
        self.params['translations']['mzml_input_file'] = self.input_file
    else:
        raise Exception(
            'MSFragger input spectrum file must be in mzML or MGF format!')

    self.params['command_list'] = [
        'java',
        '-Xmx{0}'.format(grouped_params['-Xmx']['-xmx']),
        '-jar',
        self.exe,
        self.param_file_name,
        self.params['translations']['mzml_input_file']
    ]
    self.params['translations']['output_file_incl_path'] = os.path.join(
        self.params['output_dir_path'],
        self.params['output_file']
    )
    return self.params
def main(input_file=None, output_file=None, scan_rt_lookup=None,
         params=None, search_engine=None, score_colname=None,
         upeptide_mapper=None):
    '''
    Convert the result csv of a search engine into the unified csv format.

    Arguments:
        input_file (str): input filename of csv which should be unified
        output_file (str): output filename of csv after unifying
        scan_rt_lookup (dict): dictionary with entries of scanID to
            retention time under key 'scan_2_rt'
        params (dict): params as passed by ursgal
        search_engine(str): the search engine the csv file stems from
        score_colname (str): the column names of the search engine's
            score (i.e. 'OMSSA:pvalue')
        upeptide_mapper: optional peptide mapper object that is used
            instead of a freshly created ursgal.UPeptideMapper()

    Returns:
        list: temporary files created by this function (the
            '<output_file>_full_protein_names.txt' file, unless a protein
            ID string had to be truncated and the file is therefore kept)

    List of fixes

    All engines
        * Retention Time (s) is correctly set using _ursgal_lookup.pkl
          During mzML conversion to mgf the retention time for every spec
          is stored in a internal lookup and used later for setting the RT.
        * All modifications are checked if they were given in
          params['modifications'], converted to the name that was given
          there and sorted according to their position.
        * Fixed modifications are added in 'Modifications', if not reported
          by the engine.
        * The monoisotopic m/z for for each line is calculated (uCalc m/z),
          since not all engines report the monoisotopic m/z
        * Mass accuracy calculation (in ppm), also taking into account that
          not always the monoisotopic peak is picked
        * All peptide Sequences are remapped to their corresponding protein,
          assuring correct start, stop, pre and post aminoacid. Thereby,
          also correct enzymatic cleavage is checked.
        * Rows describing the same PSM (i.e. when two proteins share the
          same peptide) are merged to one row.

    X!Tandem
        * 'RTINSECONDS=' is stripped from Spectrum Title if present in .mgf
          or in search result.

    Myrimatch
        * Spectrum Title is corrected
        * 15N label is not formatted correctly these modifications are
          removed for further analysis.
        * When using 15N modifications on amino acids and Carbamidomethyl
          myrimatch reports sometimes Carboxymethylation on Cystein.

    MS-GF+
        * 15N label is not formatted correctly these modifications are
          removed for further analysis.
        * 'Is decoy' column is properly set to true/false
        * Carbamidomethyl is updated and set if label is 15N

    OMSSA
        * Carbamidomethyl is updated and set
        * Selenocystein is not reported with the correct unimod modification

    MS-Amanda
        * Selenocystein is not reported with the correct unimod modification
        * multiple protein ID per peptide are splitted in two entries.
          (is done in MS-Amanda postflight)
        * short protein IDs are mapped to the full protein ID, it is
          checked which peptides map on which protein ID
          (is done in MS-Amanda postflight)
    '''
    print('''
[ unifycsv ] Converting {0} of engine {1} to unified CSV format...
    '''.format(
        os.path.basename(input_file),
        search_engine,
    ))

    # get the rows which define a unique PSM (i.e. sequence+spec+score...)
    psm_defining_colnames = get_psm_defining_colnames(score_colname)
    joinchar = params['translations']['protein_delimiter']
    do_not_delete = False
    created_tmp_files = []
    use15N = False

    if 'label' in params.keys():
        if params['label'] == '15N':
            use15N = True
    else:
        # the label is part of the buffer key below, so it has to exist
        params['label'] = '14N'

    aa_exception_dict = params['translations']['aa_exception_dict']
    n_term_replacement = {
        'Ammonia-loss': None,
        'Trimethyl': None,
        'Gly->Val': None,
    }
    fixed_mods = {}
    opt_mods = {}
    modname2aa = {}
    cam = False

    # mod pattern: '<name or mass>:<position>'
    mod_pattern = re.compile(r''':(?P<pos>[0-9]*$)''')

    for modification in params['translations']['modifications']:
        aa = modification.split(',')[0]
        mod_type = modification.split(',')[1]
        pos = modification.split(',')[2]
        name = modification.split(',')[3]
        if name not in modname2aa.keys():
            modname2aa[name] = []
        modname2aa[name].append(aa)
        if 'N-term' in pos:
            n_term_replacement[name] = aa
        if mod_type == 'fix':
            fixed_mods[aa] = name
        if mod_type == 'opt':
            opt_mods[aa] = name
        if 'C,fix,any,Carbamidomethyl' in modification:
            cam = True

    cc = ursgal.ChemicalComposition()
    ursgal.GlobalUnimodMapper._reparseXML()

    # only results of database search engines are remapped to proteins
    database_search_engines = [
        'msamanda',
        'msgf',
        'myrimatch',
        'omssa',
        'xtandem'
    ]
    database_search = False
    for db_se in database_search_engines:
        if db_se in search_engine.lower():
            database_search = True

    if upeptide_mapper is None:
        upapa = ursgal.UPeptideMapper()
    else:
        upapa = upeptide_mapper

    if database_search is True:
        target_decoy_peps = set()
        non_enzymatic_peps = set()
        pep_map_lookup = {}
        fasta_lookup_name = upapa.build_lookup_from_file(
            params['translations']['database'],
            force=False,
        )

    # if a PSM with multiple rows is found (i.e. in omssa results), the psm
    # rows are merged afterwards
    psm_counter = Counter()

    csv_kwargs = {'extrasaction': 'ignore'}
    if sys.platform == 'win32':
        csv_kwargs['lineterminator'] = '\n'
    else:
        csv_kwargs['lineterminator'] = '\r\n'

    # number of csv records, only used for the progress message
    with open(input_file, 'r') as line_count_file:
        total_lines = sum(1 for _ in csv.reader(line_count_file))

    # maps '<Sequence> || <Charge> || <Modifications> || <label>' to a tuple
    # of (row independent column updates, monoisotopic mass)
    ze_only_buffer = {}

    if params['translations']['enzyme'] != 'nonspecific':
        allowed_aa, cleavage_site, inhibitor_aa = params['translations'][
            'enzyme'].split(';')
    else:
        allowed_aa = ''.join(list(ursgal.ursgal_kb.NITROGENS.keys()))
        cleavage_site = 'C'
        inhibitor_aa = ''
    # '-' marks the protein terminus in pre/post amino acid columns
    allowed_aa += '-'

    with open(output_file, 'w') as output_file_object, \
            open(output_file + '_full_protein_names.txt', 'w') \
            as protein_id_output, \
            open(input_file, 'r') as in_file:
        csv_input = csv.DictReader(in_file)

        output_fieldnames = list(csv_input.fieldnames)
        for remove_fieldname in [
            'proteinacc_start_stop_pre_post_;',
            'Start',
            'Stop',
            'NIST score',
            'gi',
            'Accession',
        ]:
            if remove_fieldname not in output_fieldnames:
                continue
            output_fieldnames.remove(remove_fieldname)
        new_fieldnames = [
            'uCalc m/z',
            'Accuracy (ppm)',
            'Protein ID',
            'Sequence Start',
            'Sequence Stop',
            'Sequence Pre AA',
            'Sequence Post AA',
        ]
        for new_fieldname in new_fieldnames:
            if new_fieldname not in output_fieldnames:
                output_fieldnames.insert(-5, new_fieldname)

        csv_output = csv.DictWriter(
            output_file_object,
            output_fieldnames,
            **csv_kwargs
        )
        csv_output.writeheader()
        print('''[ unify_cs ] parsing csv''')

        for line_nr, line_dict in enumerate(csv_input):
            if line_nr % 500 == 0:
                print(
                    '[ unify_cs ] Processing line number: {0}/{1} .. '.format(
                        line_nr,
                        total_lines,
                    ),
                    end='\r'
                )

            # -------------------------------------------------------------
            # Spectrum Title / Spectrum ID
            # -------------------------------------------------------------
            if line_dict['Spectrum Title'] != '':
                # Valid for: OMSSA, MSGF+, X!Tandem
                if 'RTINSECONDS=' in line_dict['Spectrum Title']:
                    line_2_split = line_dict['Spectrum Title'].split(
                        ' ')[0].strip()
                else:
                    line_2_split = line_dict['Spectrum Title']
                line_dict['Spectrum Title'] = line_2_split

                input_file_basename, spectrum_id, _spectrum_id, charge = \
                    line_2_split.split('.')
                pure_input_file_name = ''

            elif 'scan=' in line_dict['Spectrum ID']:
                # Valid for: myrimatch
                pure_input_file_name = os.path.basename(
                    line_dict['Raw data location'])
                # not using os.path.splitext because we could have multiple
                # file extensions (i.e. ".mzml.gz")
                input_file_basename = pure_input_file_name.split(".")[0]
                spectrum_id = line_dict['Spectrum ID'].split('=')[-1]
                line_dict['Spectrum Title'] = '{0}.{1}.{1}.{2}'.format(
                    input_file_basename,
                    spectrum_id,
                    line_dict['Charge']
                )

            elif line_dict['Spectrum Title'] == '':
                # Valid for: Novor
                pure_input_file_name = os.path.basename(
                    line_dict['Raw data location'])
                input_file_basename = pure_input_file_name.split(".")[0]
                spectrum_id = line_dict['Spectrum ID']
                line_dict['Spectrum Title'] = '{0}.{1}.{1}.{2}'.format(
                    input_file_basename,
                    spectrum_id,
                    line_dict['Charge']
                )
            else:
                raise Exception(
                    'New csv format present for engine {0}'.format(
                        search_engine))

            # update spectrum ID from block above
            line_dict['Spectrum ID'] = spectrum_id

            # -------------------------------------------------------------
            # Retention time
            # -------------------------------------------------------------
            # now check for the basename in the scan rt lookup
            # possible cases:
            #   - input_file_basename
            #   - input_file_basename + prefix
            #   - input_file_basename - prefix
            input_file_basename_for_rt_lookup = None
            if input_file_basename in scan_rt_lookup.keys():
                input_file_basename_for_rt_lookup = input_file_basename
            else:
                basename_with_prefix = '{0}_{1}'.format(
                    params['prefix'], input_file_basename)
                basename_without_prefix = input_file_basename.replace(
                    params['prefix'], '')
                if basename_with_prefix in scan_rt_lookup.keys():
                    input_file_basename_for_rt_lookup = basename_with_prefix
                elif basename_without_prefix in scan_rt_lookup.keys():
                    input_file_basename_for_rt_lookup = \
                        basename_without_prefix
                else:
                    not_found_message = '''
Could not find scan ID {0} in scan_rt_lookup[ {1} ]
                    '''.format(spectrum_id, input_file_basename)
                    print(not_found_message)
                    # the lookup below cannot succeed, fail with the
                    # same exception type but a meaningful message
                    raise KeyError(not_found_message)

            rt_lookup = scan_rt_lookup[input_file_basename_for_rt_lookup]
            retention_time_in_minutes = rt_lookup['scan_2_rt'][spectrum_id]
            # the lookup states if the data has minute or second format;
            # the resolved basename has to be used here as well
            if rt_lookup['unit'] == 'second':
                rt_corr_factor = 1
            else:
                rt_corr_factor = 60
            line_dict['Retention Time (s)'] = float(
                retention_time_in_minutes) * rt_corr_factor

            # -------------------------------------------------------------
            # Buffered block: everything that only depends on sequence,
            # charge, modifications and label
            # -------------------------------------------------------------
            _ze_ultra_buffer_key_ = \
                '{Sequence} || {Charge} || {Modifications} || '.format(
                    **line_dict) + params['label']
            if _ze_ultra_buffer_key_ not in ze_only_buffer.keys():
                line_dict_update = {}

                # Modification block
                # some engines do not report fixed modifications
                # include in unified csv
                if fixed_mods != {}:
                    # compare whole entries, a substring test would treat
                    # 'Name:1' as present if only 'Name:12' was reported
                    reported_mods = line_dict['Modifications'].split(';')
                    for pos, aminoacid in enumerate(line_dict['Sequence']):
                        if aminoacid not in fixed_mods.keys():
                            continue
                        tmp = '{0}:{1}'.format(
                            fixed_mods[aminoacid], pos + 1)
                        if tmp not in reported_mods:
                            reported_mods.append(tmp)
                    line_dict['Modifications'] = ';'.join(reported_mods)

                # Myrimatch and msgf+ can not handle 15N that easily
                # report all AAs moded with unknown modification
                # Note: masses are checked below to avoid any mismatch
                if use15N:
                    if 'myrimatch' in search_engine.lower() or \
                            'msgfplus_v9979' in search_engine.lower():
                        for p in range(1, len(line_dict['Sequence']) + 1):
                            line_dict['Modifications'] = \
                                line_dict['Modifications'].replace(
                                    'unknown modification:{0}'.format(p),
                                    '',
                                    1,
                                )
                    if 'myrimatch' in search_engine.lower():
                        if 'Carboxymethyl' in line_dict['Modifications'] \
                                and cam == True:
                            line_dict['Modifications'] = \
                                line_dict['Modifications'].replace(
                                    'Carboxymethyl',
                                    'Carbamidomethyl'
                                )
                        elif 'Delta:H(6)C(3)O(1)' in \
                                line_dict['Modifications']:
                            line_dict['Modifications'] = \
                                line_dict['Modifications'].replace(
                                    'Delta:H(6)C(3)O(1)',
                                    'Carbamidomethyl'
                                )

                tmp_mods = []
                for modification in line_dict['Modifications'].split(';'):
                    Nterm = False
                    Cterm = False
                    skip_mod = False
                    if modification == '':
                        continue

                    match = mod_pattern.search(modification)
                    # check before the match is dereferenced, otherwise
                    # this message could never be shown
                    assert match is not None \
                        and match.group('pos') != '', '''
The format of the modification {0}
is not recognized by ursgal'''.format(modification)
                    pos = int(match.group('pos'))
                    mod = modification[:match.start()]

                    if pos <= 1:
                        Nterm = True
                        new_pos = 1
                    elif pos > len(line_dict['Sequence']):
                        Cterm = True
                        new_pos = len(line_dict['Sequence'])
                    else:
                        new_pos = pos
                    aa = line_dict['Sequence'][new_pos - 1].upper()

                    if mod in modname2aa.keys():
                        # modification was reported by name
                        correct_mod = False
                        if aa in modname2aa[mod]:
                            # everything is ok
                            correct_mod = True
                        elif Nterm or Cterm:
                            if '*' in modname2aa[mod]:
                                correct_mod = True
                                # still is ok
                        assert correct_mod is True, '''
A modification was reported for an aminoacid for which it was not defined
unify_csv cannot deal with this, please check your parameters and engine output
reported modification: {0} on {1}
modifications in parameters: {2}
'''.format(
                            mod,
                            aa,
                            params['translations']['modifications']
                        )
                    elif 'unknown modification' == mod:
                        modification_known = False
                        if aa in opt_mods.keys():
                            # fixed mods are corrected/added already
                            modification = '{0}:{1}'.format(
                                opt_mods[aa], new_pos)
                            modification_known = True
                        assert modification_known == True, '''
unify csv does not work for the given unknown modification for
{0} {1} aa: {2}
maybe an unknown modification with terminal position was given?
'''.format(
                            line_dict['Sequence'],
                            modification,
                            aa
                        )
                    else:
                        # modification was reported as mass
                        if aa in fixed_mods.keys() and use15N \
                                and 'msgfplus' in search_engine.lower():
                            if pos != 0:
                                mod = float(mod) - \
                                    ursgal.ursgal_kb.DICT_15N_DIFF[aa]
                        try:
                            name_list = \
                                ursgal.GlobalUnimodMapper.appMass2name_list(
                                    round(float(mod), 3),
                                    decimal_places=3
                                )
                        except Exception:
                            print('''
A modification was reported that was not included in the search parameters
unify_csv cannot deal with this, please check your parameters and engine output
reported modification: {0}
modifications in parameters: {1}
'''.format(
                                mod,
                                params['translations']['modifications']
                            ))
                            raise Exception(
                                'unify_csv failed because a '
                                'modification was reported that was not '
                                'given in params.'
                                '{0}'.format(modification)
                            )
                        mapped_mod = False
                        for name in name_list:
                            if name in modname2aa.keys():
                                if aa in modname2aa[name]:
                                    modification = '{0}:{1}'.format(
                                        name, new_pos)
                                    mapped_mod = True
                                elif Nterm and '*' in modname2aa[name]:
                                    modification = '{0}:{1}'.format(name, 0)
                                    mapped_mod = True
                                else:
                                    continue
                            elif use15N and name in [
                                'Label:15N(1)',
                                'Label:15N(2)',
                                'Label:15N(3)',
                                'Label:15N(4)'
                            ]:
                                mapped_mod = True
                                skip_mod = True
                                break
                        assert mapped_mod is True, '''
A mass was reported that does not map on any unimod or userdefined modification
or the modified aminoacid is not the specified one
unify_csv cannot deal with this, please check your parameters and engine output
reported mass: {0}
maps on: {1}
reported modified aminoacid: {2}
modifications in parameters: {3}
'''.format(
                            mod,
                            name_list,
                            aa,
                            params['translations']['modifications']
                        )
                    if modification in tmp_mods or skip_mod is True:
                        continue
                    tmp_mods.append(modification)
                line_dict_update['Modifications'] = ';'.join(tmp_mods)
                #
                # ^^--------- REPLACED MODIFICATIONS! ---------------^
                #
                # N-terminal modifications reported on position 1 are moved
                # to position 0, unless they are defined for the first aa
                for unimod_name in n_term_replacement.keys():
                    first_pos_entry = '{0}:1'.format(unimod_name)
                    current_mods = \
                        line_dict_update['Modifications'].split(';')
                    if first_pos_entry not in current_mods:
                        continue
                    if unimod_name in modname2aa.keys():
                        defined_aas = modname2aa[unimod_name]
                        if defined_aas != ['*']:
                            if line_dict['Sequence'][0] in defined_aas:
                                continue
                    # replace whole entries only, a plain string replace
                    # would also turn 'Name:12' into 'Name:02'
                    n_term_entry = '{0}:0'.format(unimod_name)
                    line_dict_update['Modifications'] = ';'.join([
                        n_term_entry if entry == first_pos_entry else entry
                        for entry in current_mods
                    ])

                for aa_to_replace, replace_dict in \
                        aa_exception_dict.items():
                    if aa_to_replace in line_dict['Sequence']:
                        # change mods only if unimod has to be changed...
                        if 'unimod_name' in replace_dict.keys():
                            for r_pos, aa in enumerate(
                                    line_dict['Sequence']):
                                if aa == aa_to_replace:
                                    index_of_U = r_pos + 1
                                    unimod_name = \
                                        replace_dict['unimod_name']
                                    if cam and \
                                            replace_dict['original_aa'] \
                                            == 'C':
                                        unimod_name = replace_dict[
                                            'unimod_name_with_cam']
                                    new_mod = '{0}:{1}'.format(
                                        unimod_name,
                                        index_of_U
                                    )
                                    if line_dict_update[
                                            'Modifications'] == '':
                                        line_dict_update[
                                            'Modifications'] += new_mod
                                    else:
                                        line_dict_update[
                                            'Modifications'] += \
                                            ';{0}'.format(new_mod)
                        line_dict['Sequence'] = \
                            line_dict['Sequence'].replace(
                                aa_to_replace,
                                replace_dict['original_aa']
                            )
                line_dict_update['Sequence'] = line_dict['Sequence']
                #
                # ^^--------- REPLACED SEQUENCE! ---------------^
                #
                # remove the double ';' and sort by position
                if line_dict_update['Modifications'] != '':
                    tmp = []
                    for e in line_dict_update['Modifications'].split(';'):
                        if e == '':
                            # that remove the doubles ....
                            continue
                        for match in mod_pattern.finditer(e):
                            mod = e[:match.start()]
                            mod_pos = e[match.start() + 1:]
                            m = (int(mod_pos), mod)
                            if m not in tmp:
                                tmp.append(m)
                    tmp.sort()
                    line_dict_update['Modifications'] = ';'.join(
                        [
                            '{m}:{p}'.format(m=mod, p=pos)
                            for pos, mod in tmp
                        ]
                    )

                # calculate m/z
                cc.use('{Sequence}#{Modifications}'.format(
                    **line_dict_update))
                if use15N:
                    number_N = dc(cc['N'])
                    cc['15N'] = number_N
                    del cc['N']
                    if cam:
                        c_count = line_dict_update['Sequence'].count('C')
                        cc['14N'] = c_count
                        cc['15N'] -= c_count
                mass = cc._mass()
                line_dict_update['uCalc m/z'] = ursgal.ucore.calculate_mz(
                    mass,
                    line_dict['Charge']
                )
                # ------------
                # BUFFER END
                # -----------
                ze_only_buffer[_ze_ultra_buffer_key_] = (
                    line_dict_update, mass)

            line_dict_update, mass = ze_only_buffer[_ze_ultra_buffer_key_]
            line_dict.update(line_dict_update)

            # -------------------------------------------------------------
            # Row specific values: these depend on the experimental m/z of
            # the row and must therefore not be taken from the buffer
            # -------------------------------------------------------------
            # some engines (i.e. ms amanda) do not return calculated mz
            if line_dict['Calc m/z'] == '':
                line_dict['Calc m/z'] = line_dict['uCalc m/z']

            exp_mz = float(line_dict['Exp m/z'])
            accuracy_ppm = \
                (exp_mz - line_dict['uCalc m/z']) \
                / line_dict['uCalc m/z'] * 1e6
            prec_m_accuracy = (
                params['translations']['precursor_mass_tolerance_minus'] +
                params['translations']['precursor_mass_tolerance_plus']
            ) / 2
            # if the accuracy is out of tolerance, check whether another
            # isotope peak was picked; the first entry of the isotope
            # range is not evaluated
            isotope_range = params['translations'][
                'precursor_isotope_range'].split(',')
            for isotope in isotope_range[1:]:
                if abs(accuracy_ppm) <= prec_m_accuracy:
                    break
                isotope = int(isotope)
                if isotope == 0:
                    continue
                calc_mz = ursgal.ucore.calculate_mz(
                    mass + isotope * 1.008664904,
                    line_dict['Charge']
                )
                accuracy_ppm = (exp_mz - calc_mz) / calc_mz * 1e6
            line_dict['Accuracy (ppm)'] = accuracy_ppm

            # -------------------------------------------------------------
            # protein block, only for database search engine
            # -------------------------------------------------------------
            if database_search is True:
                # remap peptides to proteins, check correct enzymatic
                # cleavage and decoy assignment
                lookup_identifier = '{0}><{1}'.format(
                    line_dict['Sequence'], fasta_lookup_name)
                if lookup_identifier not in pep_map_lookup.keys():
                    tmp_decoy = set()
                    upeptide_maps = upapa.map_peptide(
                        peptide=line_dict['Sequence'],
                        fasta_name=fasta_lookup_name
                    )
                    if upeptide_maps == []:
                        print('''
[ WARNING ] The peptide {0} could not be mapped to the
[ WARNING ] given database {1}
[ WARNING ] {2}
[ WARNING ] This PSM will be skipped.
'''.format(
                            line_dict['Sequence'],
                            fasta_lookup_name,
                            ''
                        ))
                        continue

                    sorted_upeptide_maps = sorted(
                        upeptide_maps, key=lambda x: x['id'])
                    protein_mapping_dict = None
                    last_protein_id = None
                    for protein in sorted_upeptide_maps:
                        add_protein = False
                        nterm_correct = False
                        cterm_correct = False
                        if params['translations'][
                                'keep_asp_pro_broken_peps'] is True:
                            if line_dict['Sequence'][-1] == 'D' and \
                                    protein['post'] == 'P':
                                cterm_correct = True
                            if line_dict['Sequence'][0] == 'P' and \
                                    protein['pre'] == 'D':
                                nterm_correct = True

                        if cleavage_site == 'C':
                            if protein['pre'] in allowed_aa \
                                    or protein['start'] in [1, 2, 3]:
                                if line_dict['Sequence'][0] \
                                        not in inhibitor_aa \
                                        or protein['start'] in [1, 2, 3]:
                                    nterm_correct = True
                            if protein['post'] not in inhibitor_aa:
                                if line_dict['Sequence'][-1] in allowed_aa \
                                        or protein['post'] == '-':
                                    cterm_correct = True
                        elif cleavage_site == 'N':
                            if protein['post'] in allowed_aa:
                                if line_dict['Sequence'][-1] \
                                        not in inhibitor_aa \
                                        or protein['post'] == '-':
                                    cterm_correct = True
                            if protein['pre'] not in inhibitor_aa \
                                    or protein['start'] in [1, 2, 3]:
                                if line_dict['Sequence'][0] in allowed_aa \
                                        or protein['start'] in [1, 2, 3]:
                                    nterm_correct = True

                        if params['translations']['semi_enzyme'] is True:
                            if cterm_correct is True \
                                    or nterm_correct is True:
                                add_protein = True
                        elif cterm_correct is True \
                                and nterm_correct is True:
                            add_protein = True

                        if add_protein is True:
                            protein_columns = (
                                ('Protein ID', protein['id']),
                                ('Sequence Start', str(protein['start'])),
                                ('Sequence Stop', str(protein['end'])),
                                ('Sequence Pre AA', protein['pre']),
                                ('Sequence Post AA', protein['post']),
                            )
                            if protein_mapping_dict is None:
                                protein_mapping_dict = dict(protein_columns)
                            else:
                                # several hits within the same protein are
                                # joined by ';', different proteins by the
                                # protein delimiter
                                if protein['id'] == last_protein_id:
                                    tmp_join_char = ';'
                                else:
                                    tmp_join_char = joinchar
                                for column, value in protein_columns:
                                    protein_mapping_dict[column] += \
                                        '{0}{1}'.format(
                                            tmp_join_char, value)
                            last_protein_id = protein['id']
                            # mzidentml-lib does not always set 'Is decoy'
                            # correctly (it's always 'false' for MS-GF+
                            # results), this is fixed here:
                            if params['translations']['decoy_tag'] in \
                                    protein['id']:
                                tmp_decoy.add('true')
                            else:
                                tmp_decoy.add('false')

                    if protein_mapping_dict is None:
                        non_enzymatic_peps.add(line_dict['Sequence'])
                        continue

                    if len(protein_mapping_dict['Protein ID']) >= 2000:
                        # very long protein ID strings are written to a
                        # separate file and truncated in the csv
                        print(
                            '{0}: {1}'.format(
                                line_dict['Sequence'],
                                protein_mapping_dict['Protein ID']
                            ),
                            file=protein_id_output
                        )
                        protein_mapping_dict['Protein ID'] = \
                            protein_mapping_dict['Protein ID'][:1990] \
                            + ' ...'
                        do_not_delete = True

                    if len(tmp_decoy) >= 2:
                        target_decoy_peps.add(line_dict['Sequence'])
                        protein_mapping_dict['Is decoy'] = 'true'
                    else:
                        protein_mapping_dict['Is decoy'] = \
                            list(tmp_decoy)[0]

                    pep_map_lookup[lookup_identifier] = protein_mapping_dict

                buffered_protein_mapping_dict = pep_map_lookup[
                    lookup_identifier]
                line_dict.update(buffered_protein_mapping_dict)

            # count each PSM occurence to check whether row-merging is
            # needed:
            psm = tuple([line_dict[x] for x in psm_defining_colnames])
            psm_counter[psm] += 1

            csv_output.writerow(line_dict)

    if database_search is True:
        if len(non_enzymatic_peps) != 0:
            print('''
[ WARNING ] The following peptides could not be mapped to the
[ WARNING ] given database {0}
[ WARNING ] with correct enzymatic cleavage sites:
[ WARNING ] {1}
[ WARNING ] These PSMs were skipped.'''.format(
                params['translations']['database'],
                non_enzymatic_peps
            ))
        if len(target_decoy_peps) != 0:
            print('''
[ WARNING ] The following peptides occured in a target as well as decoy protein
[ WARNING ] {0}
[ WARNING ] 'Is decoy' has been set to 'True' '''.format(
                target_decoy_peps,
            ))

    # if there are multiple rows for a PSM, we have to merge them aka
    # rewrite the csv...
    if psm_counter != Counter():
        if max(psm_counter.values()) > 1:
            merge_duplicate_psm_rows(
                output_file,
                psm_counter,
                psm_defining_colnames,
                params['translations']['psm_merge_delimiter']
            )

    if do_not_delete is False:
        created_tmp_files.append(output_file + '_full_protein_names.txt')
    return created_tmp_files
def check_hill_notation(aa, chemformula):
    '''
    Assert that the Hill notation of a sequence minus water is as expected.

    Arguments:
        aa (str): input that is handed to ursgal.ChemicalComposition
        chemformula (str): expected Hill notation after 'H2O' was
            subtracted from the composition
    '''
    composition = ursgal.ChemicalComposition(aa)
    composition.subtract_chemical_formula('H2O')
    # printed to ease debugging of a failing check
    print(aa, chemformula)
    print(composition)
    observed_formula = composition.hill_notation()
    assert observed_formula == chemformula
def preflight(self):
    """
    Formatting the command line and writing the param input file via
    self.params

    The translated ursgal parameters are collected in
    self.params_to_write and written by self.write_params_file().
    Masses of glycans, chemical formulas and unimods given as mass
    offsets / Y type masses are additionally stored in
    self.mass_glycan_lookup and self.mass_shift_lookup, keyed by
    round(mass * 1e5).

    Returns:
        dict: self.params

    Raises:
        Exception: if the input file is neither mzML, mzML.gz nor MGF
    """
    grouped_params = self.params["translations"][
        "_grouped_by_translated_key"]

    self.input_file = os.path.join(
        self.params["input_dir_path"], self.params["input_file"])
    self.param_file_name = os.path.join(
        self.params["output_dir_path"],
        "{0}_msfragger.params".format(self.input_file),
    )
    self.created_tmp_files.append(self.param_file_name)

    # further prepare and translate params
    self.params_to_write = {
        "output_file_extension": "tsv",  # tsv or pepXML we fix it...
        "output_format": "tsv",  # pepXML or tsv
        "digest_mass_range": "{0} {1}".format(
            grouped_params["precursor_min_mass"]["precursor_min_mass"],
            grouped_params["precursor_max_mass"]["precursor_max_mass"],
        ),
    }

    # these parameters are handled separately and not written as they are
    write_exclusion_list = [
        "precursor_min_mass",
        "precursor_max_mass",
        "precursor_min_charge",
        "precursor_max_charge",
        "label",
        "-Xmx",
        "header_translations",
        "validation_score_field",
    ]

    if grouped_params["label"]["label"] == "15N":
        self.print_info(
            "Search with label=15N may still be errorprone. Evaluate with care!",
            caller="WARNING",
        )
        for aminoacid, N15_Diff in ursgal.ukb.DICT_15N_DIFF.items():
            existing = False
            for mod_dict in self.params["mods"]["fix"]:
                if aminoacid == mod_dict["aa"]:
                    # the fixed modification itself carries the 15N shift
                    mod_dict["mass"] += N15_Diff
                    mod_dict["name"] += "_15N_{0}".format(aminoacid)
                    existing = True
            if existing:
                continue
            mod_key = "add_{0}_{1}".format(
                aminoacid,
                ursgal.chemical_composition_kb.aa_names[aminoacid],
            )
            self.params_to_write[mod_key] = N15_Diff

    self.mass_shift_lookup = {}
    self.mass_glycan_lookup = {}

    for msfragger_param_name in grouped_params.keys():
        for ursgal_param_name, param_value in grouped_params[
                msfragger_param_name].items():
            if msfragger_param_name in write_exclusion_list:
                continue

            elif msfragger_param_name == "enzyme":
                # search_enzyme_name = Trypsin
                # search_enzyme_cutafter = KR
                # search_enzyme_butnotafter = P
                aa_site, term, inhibitor = param_value.split(";")
                self.params_to_write["search_enzyme_name"] = \
                    self.params["enzyme"]
                self.params_to_write["search_enzyme_cutafter"] = aa_site
                self.params_to_write["search_enzyme_butnotafter"] = \
                    inhibitor

            elif msfragger_param_name == "num_enzyme_termini":
                # num_enzyme_termini = 2 # 2 for enzymatic, 1 for
                # semi-enzymatic, 0 for nonspecific digestion
                if grouped_params["enzyme"]["enzyme"] == "nonspecific":
                    self.params_to_write[msfragger_param_name] = 0
                else:
                    self.params_to_write[msfragger_param_name] = \
                        param_value

            elif msfragger_param_name == "clear_mz_range":
                min_mz, max_mz = param_value
                self.params_to_write[msfragger_param_name] = \
                    "{0} {1}".format(min_mz, max_mz)

            elif msfragger_param_name == "remove_precursor_range":
                min_mz, max_mz = param_value
                self.params_to_write[msfragger_param_name] = \
                    "{0},{1}".format(min_mz, max_mz)

            elif msfragger_param_name == "delta_mass_exclude_ranges":
                min_mz, max_mz = param_value
                self.params_to_write[msfragger_param_name] = \
                    "({0},{1})".format(min_mz, max_mz)

            elif msfragger_param_name == "precursor_mass_lower":
                self.params_to_write[msfragger_param_name] = \
                    -1 * param_value

            elif msfragger_param_name == "modifications":
                # MSFragger format:
                # amino acid codes, * for any amino acid, [ and ] specifies
                # protein termini, n and c specifies peptide termini
                # variable_mod_01 = 15.9949 M
                # variable_mod_02 = 42.0106 [*
                position_modifiers = {
                    "Prot-N-term": "[",
                    "Prot-C-term": "]",
                    "N-term": "n",
                    "C-term": "c",
                    "any": None,
                }
                # modifications with the same mass share one entry
                mass_to_mod_aa = ddict(list)
                for mod_dict in self.params["mods"]["opt"]:
                    # example mod_dict:
                    # {'_id': 0, 'aa': '*',
                    #  'composition': {'C': 2, 'H': 2, 'O': 1},
                    #  'id': '1', 'mass': 42.010565, 'name': 'Acetyl',
                    #  'org': '*,opt,Prot-N-term,Acetyl',
                    #  'pos': 'Prot-N-term', 'unimod': True}
                    if mod_dict["pos"] not in position_modifiers:
                        print("""
Unknown positional argument for given modification:
{0}
MSFragger cannot deal with this, please use one of the follwing:
any, Prot-N-term, Prot-C-term, N-term, C-term
""".format(mod_dict["org"]))
                        sys.exit(1)
                    aa_to_append = mod_dict["aa"]
                    pos_modifier = position_modifiers[mod_dict["pos"]]
                    if pos_modifier is not None:
                        aa_to_append = "{0}{1}".format(
                            pos_modifier, aa_to_append)
                    mass_to_mod_aa[mod_dict["mass"]].append(aa_to_append)

                for pos, (mass, aa_list) in enumerate(
                        mass_to_mod_aa.items()):
                    # two digit index: variable_mod_01 ... variable_mod_10
                    mod_key = "variable_mod_{0:02d}".format(pos + 1)
                    self.params_to_write[mod_key] = "{0} {1}".format(
                        mass, "".join(aa_list))

                fixed_terminal_keys = {
                    "Prot-N-term": "add_Nterm_protein",
                    "Prot-C-term": "add_Cterm_protein",
                    "N-term": "add_Nterm_peptide",
                    "C-term": "add_Cterm_peptide",
                }
                for mod_dict in self.params["mods"]["fix"]:
                    # i.e. add_C_cysteine = 57.021464
                    # added to C - avg. 103.1429, mono. 103.00918
                    if mod_dict["pos"] in fixed_terminal_keys:
                        mod_key = fixed_terminal_keys[mod_dict["pos"]]
                    else:
                        mod_key = "add_{0}_{1}".format(
                            mod_dict["aa"],
                            ursgal.chemical_composition_kb.aa_names[
                                mod_dict["aa"]],
                        )
                    self.params_to_write[mod_key] = mod_dict["mass"]

            elif msfragger_param_name == "override_charge":
                self.params_to_write[msfragger_param_name] = param_value
                if param_value == 1:
                    self.params_to_write["precursor_charge"] = \
                        "{0} {1}".format(
                            grouped_params["precursor_min_charge"][
                                "precursor_min_charge"],
                            grouped_params["precursor_max_charge"][
                                "precursor_max_charge"],
                        )

            elif msfragger_param_name == "fragment_ion_series":
                ion_list = []
                for ion in param_value:
                    if ion not in [
                        "a",
                        "b",
                        "c",
                        "y~",
                        "x",
                        "y",
                        "z",
                        "b~",
                        "y-18",
                        "b-18",
                        "Y",
                    ]:
                        print("""
[ WARNING ] MSFragger does not allow the following ion:
{0}
This ion will be skipped, i.e. not included in the search.
""".format(ion))
                        continue
                    ion_list.append(ion)
                self.params_to_write[msfragger_param_name] = ",".join(
                    ion_list)

            elif msfragger_param_name in [
                "mass_offsets",
                "Y_type_masses",
            ]:
                cc = ursgal.ChemicalComposition()
                umama = ursgal.UnimodMapper()
                masses = []
                for m in param_value["masses"]:
                    masses.append(str(m))
                for m in param_value["glycans"]:
                    cc.clear()
                    cc.add_glycan(m)
                    mass = cc._mass()
                    masses.append(str(mass))
                    tm = round(mass * 1e5)
                    self.mass_glycan_lookup.setdefault(tm, set()).add(m)
                for m in param_value["chemical_formulas"]:
                    cc.clear()
                    cc.add_chemical_formula(m)
                    mass = cc._mass()
                    masses.append(str(mass))
                    tm = round(mass * 1e5)
                    self.mass_shift_lookup.setdefault(tm, set()).add(m)
                for m in param_value["unimods"]:
                    unimod_mass = umama.name2mass(m)
                    masses.append(str(unimod_mass))
                    # the key has to be derived from the unimod mass, not
                    # from the mass left over from a previous loop
                    tm = round(unimod_mass * 1e5)
                    self.mass_shift_lookup.setdefault(tm, set()).add(m)
                self.params_to_write[msfragger_param_name] = "/".join(
                    masses)

            elif msfragger_param_name == "diagnostic_fragments":
                cc = ursgal.ChemicalComposition()
                umama = ursgal.UnimodMapper()
                masses = []
                for m in param_value["masses"]:
                    masses.append(m)
                for m in param_value["glycans"]:
                    cc.clear()
                    cc.add_glycan(m)
                    masses.append(cc._mass())
                for m in param_value["chemical_formulas"]:
                    cc.clear()
                    cc.add_chemical_formula(m)
                    masses.append(cc._mass())
                for m in param_value["unimods"]:
                    masses.append(umama.name2mass(m))
                # diagnostic fragments are written as m/z of charge 1
                mzs = [
                    str(ursgal.ucore.calculate_mz(mass, 1))
                    for mass in masses
                ]
                self.params_to_write[msfragger_param_name] = "/".join(mzs)

            else:
                self.params_to_write[msfragger_param_name] = param_value

    self.write_params_file()

    # mzML (optionally gzipped) and MGF files are passed on unchanged
    if self.input_file.lower().endswith((".mzml", ".mzml.gz", ".mgf")):
        self.params["translations"]["mzml_input_file"] = self.input_file
    else:
        raise Exception(
            "MSFragger input spectrum file must be in mzML or MGF format!")

    self.params["command_list"] = [
        "java",
        "-Xmx{0}".format(grouped_params["-Xmx"]["-xmx"]),
        "-jar",
        self.exe,
        self.param_file_name,
        self.params["translations"]["mzml_input_file"],
    ]
    self.params["translations"]["output_file_incl_path"] = os.path.join(
        self.params["output_dir_path"], self.params["output_file"])
    return self.params
def preflight(self):
    '''
    Formatting the command line and writing the param input file via
    self.params

    The mgf input path and the csv output path are stored in
    self.params['translations'], all pNovo parameters are collected in
    self.params_to_write and written by self.write_params_file().
    Modifications that cannot be expressed in the pNovo parameter format
    terminate the run via sys.exit(1).

    Returns:
        dict: self.params
    '''
    import string

    translations = self.params['translations']
    translations['mgf_input_file'] = os.path.join(
        self.params['input_dir_path'],
        self.params['input_file']
    )
    translations['output_file_incl_path'] = os.path.join(
        self.params['output_dir_path'],
        self.params['output_file']
    )

    # str.strip('.csv') removes every leading/trailing '.', 'c', 's' and 'v'
    # character (e.g. 'results_vcs.csv' -> 'results_'), it does not remove
    # a suffix. Cut the extension explicitly instead.
    param_file_base = translations['output_file_incl_path']
    if param_file_base.endswith('.csv'):
        param_file_base = param_file_base[:-len('.csv')]
    self.param_file_name = param_file_base + '_pnovo.param'
    # self.created_tmp_files.append(self.param_file_name)

    self.params_to_write = {
        'output_dir_path': self.params['output_dir_path'],
        'input_file': translations['mgf_input_file'],
    }

    print('''
        [ WARNING ] precursor_mass_tolerance_plus and precursor_mass_tolerance_minus
        [ WARNING ] need to be combined for pNovo (use of symmetric tolerance window).
        [ WARNING ] The arithmetic mean is used.
        ''')
    grouped_params = translations['_grouped_by_translated_key']
    grouped_params['pep_tol'] = {
        'precursor_mass_tolerance': (
            float(self.params['precursor_mass_tolerance_plus'])
            + float(self.params['precursor_mass_tolerance_minus'])
        ) / 2.0
    }

    opt_mods = []
    fix_mods = []
    self.mod_lookup = {}
    for pnovo_param_name, ursgal_params in grouped_params.items():
        for param_value in ursgal_params.values():
            if pnovo_param_name == 'spec_path1':
                self.params_to_write[pnovo_param_name] = translations[
                    'mgf_input_file'
                ].replace('.mgf', '.ms2')
                self.params_to_write['out_path'] = os.path.dirname(
                    translations['output_file_incl_path']
                )
            elif pnovo_param_name == 'modifications':
                # pNovo parameter file conventions:
                #   variable modification: a letter from a-z is used as
                #       placeholder, e.g. a=147.035405 Oxidation[M]
                #       where 147.0354 = mass(M) + mass(Oxidation)
                #   N-/C-terminal variable modification, e.g.
                #       c-term=0.984016
                #   fixed modification: the amino acid itself is used, e.g.
                #       C=160.030654 Carbamidomethyl[C]
                #       where 160.030654 = mass(C) + mass(Carbamidomethyl)
                alphabet = string.ascii_lowercase
                sum_opt_mods = 0
                for mod_dict in self.params['mods']['opt']:
                    # mod_dict example:
                    # {'_id': 0, 'aa': '*',
                    #  'composition': {'C': 2, 'H': 2, 'O': 1},
                    #  'id': '1', 'mass': 42.010565, 'name': 'Acetyl',
                    #  'org': '*,opt,Prot-N-term,Acetyl',
                    #  'pos': 'Prot-N-term', 'unimod': True}
                    if 'Prot' in mod_dict['pos']:
                        print('''
                            Protein N/C-terminal modifications are not supported by pNovo
                            Please change or delete the following modification:
                            {0}
                            '''.format(mod_dict['org']))
                        sys.exit(1)
                    elif mod_dict['pos'] == 'N-term':
                        mod_dict['pos'] = 'n-term'
                    elif mod_dict['pos'] == 'C-term':
                        mod_dict['pos'] = 'c-term'
                    elif mod_dict['pos'] == 'any':
                        pass
                    else:
                        # message used to name pGlyco and to suggest the
                        # Prot-* positions that are rejected above
                        print('''
                            Unknown positional argument for given modification:
                            {0}
                            pNovo (or Ursgal) cannot deal with this, please use one of the following:
                            any, N-term, C-term
                            '''.format(mod_dict['org']))
                        sys.exit(1)

                    if 'term' in mod_dict['pos']:
                        if mod_dict['aa'] != '*':
                            print('''
                                Specific amino acids are not supported with terminal modifications
                                in pNovo. Please change or delete the following modification:
                                {0}
                                '''.format(mod_dict['org']))
                            sys.exit(1)
                        opt_mods.append('{0}={1}'.format(
                            mod_dict['pos'],
                            mod_dict['mass'],
                        ))
                    else:
                        if mod_dict['aa'] == '*':
                            print('''
                                Not sure how to handle this modification in pNovo:
                                {0}
                                '''.format(mod_dict['org']))
                            sys.exit(1)
                        if sum_opt_mods >= len(alphabet):
                            # only a-z are available as placeholders
                            print('''
                                pNovo supports at most {0} amino acid specific variable modifications.
                                Please change or delete the following modification:
                                {1}
                                '''.format(len(alphabet), mod_dict['org']))
                            sys.exit(1)
                        # mass written to the param file is the mass of
                        # the modified residue, not of the modification
                        cc = ursgal.ChemicalComposition()
                        cc.use('{0}#{1}:1'.format(
                            mod_dict['aa'], mod_dict['name']
                        ))
                        mod_dict['mass'] = cc._mass()
                        placeholder = alphabet[sum_opt_mods]
                        opt_mods.append('{0}={1} {2}[{3}]'.format(
                            placeholder,
                            mod_dict['mass'],
                            mod_dict['name'],
                            mod_dict['aa'],
                        ))
                        self.mod_lookup[placeholder] = (
                            mod_dict['name'],
                            mod_dict['aa'],
                        )
                        sum_opt_mods += 1

                for mod_dict in self.params['mods']['fix']:
                    if 'term' in mod_dict['pos']:
                        print('''
                            Fixed N/C-terminal modifications are not supported by pNovo
                            Please change or delete the following modification:
                            {0}
                            '''.format(mod_dict['org']))
                        sys.exit(1)
                    cc = ursgal.ChemicalComposition()
                    cc.use('{0}#{1}:1'.format(
                        mod_dict['aa'], mod_dict['name']
                    ))
                    mod_dict['mass'] = cc._mass()
                    # NOTE(review): fixed modifications are collected in
                    # opt_mods, so 'FixMod' below is always empty. Kept
                    # as is because the param file template is not
                    # visible here - confirm whether fix_mods was meant.
                    opt_mods.append('{0}={1} {2}[{3}]'.format(
                        mod_dict['aa'],
                        mod_dict['mass'],
                        mod_dict['name'],
                        mod_dict['aa'],
                    ))
            else:
                self.params_to_write[pnovo_param_name] = param_value

    self.params_to_write['FixMod'] = '\n'.join(fix_mods)
    self.params_to_write['VarMod'] = '\n'.join(opt_mods)

    self.write_params_file()

    self.params['command_list'] = [
        self.exe,
        self.param_file_name,
    ]
    print(' '.join(self.params['command_list']))
    return self.params
def preflight(self):
    """
    Formatting the command line and writing the param input file via
    self.params

    The mgf input path and the csv output path are stored in
    self.params["translations"], all pNovo parameters are collected in
    self.params_to_write and written by self.write_params_file().
    Modifications that cannot be expressed in the pNovo parameter format
    terminate the run via sys.exit(1).

    Returns:
        dict: self.params
    """
    import string

    translations = self.params["translations"]
    translations["mgf_input_file"] = os.path.join(
        self.params["input_dir_path"], self.params["input_file"]
    )
    translations["output_file_incl_path"] = os.path.join(
        self.params["output_dir_path"], self.params["output_file"]
    )

    # str.strip(".csv") removes every leading/trailing ".", "c", "s" and "v"
    # character (e.g. "csv_out/vcs_scan.csv" -> "_out/vcs_scan"), it does
    # not remove a suffix. Cut the extension explicitly instead.
    csv_suffix = ".csv"
    param_file_base = translations["output_file_incl_path"]
    if param_file_base.endswith(csv_suffix):
        param_file_base = param_file_base[: -len(csv_suffix)]
    self.param_file_name = param_file_base + "_pnovo.param"
    # self.created_tmp_files.append(self.param_file_name)

    self.params_to_write = {
        "output_dir_path": self.params["output_dir_path"],
        "input_file": translations["mgf_input_file"],
    }

    print(
        """
        [ WARNING ] precursor_mass_tolerance_plus and precursor_mass_tolerance_minus
        [ WARNING ] need to be combined for pNovo (use of symmetric tolerance window).
        [ WARNING ] The arithmetic mean is used.
        """
    )
    grouped_params = translations["_grouped_by_translated_key"]
    grouped_params["pep_tol"] = {
        "precursor_mass_tolerance": (
            float(self.params["precursor_mass_tolerance_plus"])
            + float(self.params["precursor_mass_tolerance_minus"])
        )
        / 2.0
    }

    opt_mods = []
    fix_mods = []
    self.mod_lookup = {}
    for pnovo_param_name, ursgal_params in grouped_params.items():
        for param_value in ursgal_params.values():
            if pnovo_param_name == "spec_path1":
                self.params_to_write[pnovo_param_name] = translations[
                    "mgf_input_file"
                ].replace(".mgf", ".ms2")
                self.params_to_write["out_path"] = os.path.dirname(
                    translations["output_file_incl_path"]
                )
            elif pnovo_param_name == "modifications":
                # pNovo parameter file conventions:
                #   variable modification: a letter from a-z is used as
                #       placeholder, e.g. a=147.035405 Oxidation[M]
                #       where 147.0354 = mass(M) + mass(Oxidation)
                #   N-/C-terminal variable modification, e.g.
                #       c-term=0.984016
                #   fixed modification: the amino acid itself is used, e.g.
                #       C=160.030654 Carbamidomethyl[C]
                #       where 160.030654 = mass(C) + mass(Carbamidomethyl)
                alphabet = string.ascii_lowercase
                sum_opt_mods = 0
                for mod_dict in self.params["mods"]["opt"]:
                    # mod_dict example:
                    # {'_id': 0, 'aa': '*',
                    #  'composition': {'C': 2, 'H': 2, 'O': 1},
                    #  'id': '1', 'mass': 42.010565, 'name': 'Acetyl',
                    #  'org': '*,opt,Prot-N-term,Acetyl',
                    #  'pos': 'Prot-N-term', 'unimod': True}
                    if "Prot" in mod_dict["pos"]:
                        print(
                            """
                            Protein N/C-terminal modifications are not supported by pNovo
                            Please change or delete the following modification:
                            {0}
                            """.format(
                                mod_dict["org"]
                            )
                        )
                        sys.exit(1)
                    elif mod_dict["pos"] == "N-term":
                        mod_dict["pos"] = "n-term"
                    elif mod_dict["pos"] == "C-term":
                        mod_dict["pos"] = "c-term"
                    elif mod_dict["pos"] == "any":
                        pass
                    else:
                        # message used to name pGlyco and to suggest the
                        # Prot-* positions that are rejected above
                        print(
                            """
                            Unknown positional argument for given modification:
                            {0}
                            pNovo (or Ursgal) cannot deal with this, please use one of the following:
                            any, N-term, C-term
                            """.format(
                                mod_dict["org"]
                            )
                        )
                        sys.exit(1)

                    if "term" in mod_dict["pos"]:
                        if mod_dict["aa"] != "*":
                            print(
                                """
                                Specific amino acids are not supported with terminal modifications
                                in pNovo. Please change or delete the following modification:
                                {0}
                                """.format(
                                    mod_dict["org"]
                                )
                            )
                            sys.exit(1)
                        opt_mods.append(
                            "{0}={1}".format(
                                mod_dict["pos"],
                                mod_dict["mass"],
                            )
                        )
                    else:
                        if mod_dict["aa"] == "*":
                            print(
                                """
                                Not sure how to handle this modification in pNovo:
                                {0}
                                """.format(
                                    mod_dict["org"]
                                )
                            )
                            sys.exit(1)
                        if sum_opt_mods >= len(alphabet):
                            # only a-z are available as placeholders
                            print(
                                """
                                pNovo supports at most {0} amino acid specific variable modifications.
                                Please change or delete the following modification:
                                {1}
                                """.format(
                                    len(alphabet), mod_dict["org"]
                                )
                            )
                            sys.exit(1)
                        # mass written to the param file is the mass of
                        # the modified residue, not of the modification
                        cc = ursgal.ChemicalComposition()
                        cc.use(
                            "{0}#{1}:1".format(mod_dict["aa"], mod_dict["name"])
                        )
                        mod_dict["mass"] = cc._mass()
                        placeholder = alphabet[sum_opt_mods]
                        opt_mods.append(
                            "{0}={1} {2}[{3}]".format(
                                placeholder,
                                mod_dict["mass"],
                                mod_dict["name"],
                                mod_dict["aa"],
                            )
                        )
                        self.mod_lookup[placeholder] = (
                            mod_dict["name"],
                            mod_dict["aa"],
                        )
                        sum_opt_mods += 1

                for mod_dict in self.params["mods"]["fix"]:
                    if "term" in mod_dict["pos"]:
                        print(
                            """
                            Fixed N/C-terminal modifications are not supported by pNovo
                            Please change or delete the following modification:
                            {0}
                            """.format(
                                mod_dict["org"]
                            )
                        )
                        sys.exit(1)
                    cc = ursgal.ChemicalComposition()
                    cc.use("{0}#{1}:1".format(mod_dict["aa"], mod_dict["name"]))
                    mod_dict["mass"] = cc._mass()
                    # NOTE(review): fixed modifications are collected in
                    # opt_mods, so "FixMod" below is always empty. Kept
                    # as is because the param file template is not
                    # visible here - confirm whether fix_mods was meant.
                    opt_mods.append(
                        "{0}={1} {2}[{3}]".format(
                            mod_dict["aa"],
                            mod_dict["mass"],
                            mod_dict["name"],
                            mod_dict["aa"],
                        )
                    )
            else:
                self.params_to_write[pnovo_param_name] = param_value

    self.params_to_write["FixMod"] = "\n".join(fix_mods)
    self.params_to_write["VarMod"] = "\n".join(opt_mods)

    self.write_params_file()

    self.params["command_list"] = [
        self.exe,
        self.param_file_name,
    ]
    print(" ".join(self.params["command_list"]))
    return self.params
def build_combinations(
    self,
    max_tree_length=None,
    monosaccharides=None,
    mode='replacement',
):
    '''
    Builds and returns a dictionary containing chemical compositions of
    all combinations (with replacement, not ordered) of a given dict of
    monosaccharides and a maximal length of the tree.

    Keyword arguments:
        max_tree_length (int): Maximum number of monosaccharides in one
            combination
        monosaccharides (dict): Dictionary containing name and chemical
            composition of monosaccharides, defaults to
            self.monosaccharides
        mode (str): one of
            'replacement': combinations with replacement, grouped by the
                hill notation of their summed chemical formula (default)
            'combinations': combinations without replacement, grouped by
                the number of monosaccharides
            'sugarqb': writes one line per combination to
                'sugarqb_glycan_db.txt' in the working directory, the
                returned dict stays empty

    Returns:
        dict:
            mode 'replacement': keys are hill notations, values are sets
                of monosaccharide name tuples with that composition
            mode 'combinations': keys are combination lengths, values
                are lists of {monosaccharide name: count} dicts
            any other mode: empty dict (an unknown mode additionally
                prints an error message)

    ToDo: change monosaccharides to list and get compositions from
        ursgal.ChemicalComposition(), keyword argument for
        calculate_formula?
    '''
    if monosaccharides is None:
        monosaccharides = self.monosaccharides

    # the mode does not change while looping, so reject unknown modes once
    # instead of printing the error for every tree length
    if mode not in ('combinations', 'replacement', 'sugarqb'):
        print('''
            [ SugarPy ] ERROR! the mode for build_combinations
            [ SugarPy ] is not available: {0}
            '''.format(mode))
        return {}

    if mode == 'replacement':
        print('[ SugarPy ] Building combinations for:')
        print(
            '[ SugarPy ]',
            len(monosaccharides),
            'given monosaccharides and a max tree length of',
            max_tree_length
        )

    glycan_combinations = {}
    tree_lengths = range(1, max_tree_length + 1)

    if mode == 'combinations':
        for nr_repeats in tree_lengths:
            # dict.fromkeys() drops duplicated combinations like the set
            # used before, but keeps the order deterministic
            unique_combos = dict.fromkeys(
                combinations(monosaccharides, nr_repeats)
            )
            glycan_combinations[nr_repeats] = [
                {
                    monosacch: combo.count(monosacch)
                    for monosacch in set(combo)
                }
                for combo in unique_combos
            ]
    elif mode == 'replacement':
        for nr_repeats in tree_lengths:
            for combo in combinations_with_replacement(
                monosaccharides, nr_repeats
            ):
                cc = ursgal.ChemicalComposition()
                for monosacch in combo:
                    cc.add_chemical_formula(monosaccharides[monosacch])
                hill_notation = cc.hill_notation_unimod()
                glycan_combinations.setdefault(hill_notation, set()).add(
                    combo
                )
    else:
        # mode == 'sugarqb'
        # The output handle used to be referenced without being opened
        # (NameError); it is now opened for the duration of the export.
        with open('sugarqb_glycan_db.txt', 'w') as sugarqb_glycan_db:
            for nr_repeats in tree_lengths:
                for combo in combinations_with_replacement(
                    monosaccharides, nr_repeats
                ):
                    cc = ursgal.ChemicalComposition()
                    glycan_dict = {}
                    for monosacch in combo:
                        cc.add_glycan(monosacch)
                        glycan_dict[monosacch] = combo.count(monosacch)
                    sugarqb_list = sorted(
                        '{0}{1}'.format(monosacch, count)
                        for monosacch, count in glycan_dict.items()
                    )
                    sugarqb_str = '{0}_N-Glycan {1}'.format(
                        ','.join(sugarqb_list),
                        round(cc._mass(), 6)
                    )
                    print(sugarqb_str, file=sugarqb_glycan_db)

    if mode == 'replacement':
        print('[ SugarPy ] built', len(glycan_combinations), 'combinations')
    return glycan_combinations
def add_glycans2peptide(
    self,
    peptide_list=None,
    max_tree_length=None,
    monosaccharides=None
):
    '''
    Adds chemical composition of glycans to a given list of peptides.
    Peptides need to be in unimod style (Peptide#Modifications).
    The chemical composition of the original peptidoform is returned
    as well.

    Keyword Arguments:
        peptide_list (list): List of peptides in unimod style, duplicated
            entries are processed only once (default: no peptides)
        max_tree_length (int): maximum number of monosaccharides in one
            combination
        monosaccharides (dict): dictionary containing name and chemical
            composition of monosaccharides, defaults to
            self.monosaccharides

    Returns:
        dict: {'Sequence#Modifications': {hill_notation: [names]}}
            The hill notation of the unglycosylated peptide maps to
            [peptide], each glycosylated composition maps to a list of
            'peptide|Name(count)Name(count)...' strings.
    '''
    if peptide_list is None:
        peptide_list = []
    if monosaccharides is None:
        # this used to be a comparison ('=='), so the default was never set
        monosaccharides = self.monosaccharides

    monosacch_combinations = self.build_combinations(
        max_tree_length=max_tree_length,
        monosaccharides=monosaccharides
    )

    peptides_with_glycans = {}
    for peptide in peptide_list:
        if peptide in peptides_with_glycans:
            continue
        cc = ursgal.ChemicalComposition()
        cc.use(peptide)
        glycoforms = {cc.hill_notation_unimod(): [peptide]}
        peptides_with_glycans[peptide] = glycoforms

        for composition, combos in monosacch_combinations.items():
            # the glycan is added to the peptide composition only for the
            # time needed to read the combined hill notation
            cc.add_chemical_formula(composition)
            hill_notation = cc.hill_notation_unimod()
            for combo in combos:
                counts = {}
                for monosacch in combo:
                    counts[monosacch] = counts.get(monosacch, 0) + 1
                combo_name = ''.join(
                    '{0}({1})'.format(monosacch, counts[monosacch])
                    for monosacch in sorted(counts)
                )
                glycoforms.setdefault(hill_notation, []).append(
                    '{0}|{1}'.format(peptide, combo_name)
                )
            cc.subtract_chemical_formula(composition)

    print('[ SugarPy ] Added glycans to peptides.')
    return peptides_with_glycans