def use(self, sequence):
    '''Load a new sequence into this already existing instance.

    Re-using one instance for several sequences avoids the overhead of
    instantiating the class again for each of them.

    Args:
        sequence (str): See top for possible input formats. A sequence
            containing ``#`` is parsed as unimod style format, anything
            else as old style format.
    '''
    # drop whatever the previous sequence left behind
    self.clear()

    if '#' not in sequence:
        self._parse_sequence_old_style(sequence)
        return

    # Unimod style format: the mapper is only created on first use
    if self._unimod_parser is None:
        self._unimod_parser = ursgal.UnimodMapper()
    self._parse_sequence_unimod_style(sequence)
def preflight(self):
    '''
    Formatting the command line and writing the param input file via
    self.params

    The translated ursgal parameters are converted into the key/value pairs
    collected in ``self.params_to_write``, ``self.write_params_file()`` is
    called and the java command line is stored in
    ``self.params['command_list']``. ``self.mass_shift_lookup`` and
    ``self.mass_glycan_lookup`` are reset and filled with
    ``round(mass * 1e5) -> set of names`` entries for the configured
    mass offsets / Y type masses.

    Calls ``sys.exit(1)`` if an optional modification uses a position other
    than any, Prot-N-term, Prot-C-term, N-term or C-term.

    Returns:
        dict: self.params

    Raises:
        Exception: if the input file is neither mzML(.gz) nor MGF.
    '''
    self.input_file = os.path.join(
        self.params['input_dir_path'], self.params['input_file'])
    # NOTE(review): self.input_file already includes the input directory, so
    # with an absolute input path os.path.join() drops output_dir_path here.
    # Left unchanged because the resulting location may be relied upon.
    self.param_file_name = os.path.join(
        self.params['output_dir_path'],
        '{0}_msfragger.params'.format(self.input_file))
    self.created_tmp_files.append(self.param_file_name)

    grouped = self.params['translations']['_grouped_by_translated_key']

    self.params_to_write = {
        'output_file_extension': 'tsv',  # tsv or pepXML we fix it...
        'output_format': 'tsv',  # pepXML or tsv
        'digest_mass_range': '{0} {1}'.format(
            grouped['precursor_min_mass']['precursor_min_mass'],
            grouped['precursor_max_mass']['precursor_max_mass']),
    }

    # translated keys that are never written verbatim to the param file
    write_exclusion_list = [
        'precursor_min_mass',
        'precursor_max_mass',
        'precursor_min_charge',
        'precursor_max_charge',
        'label',
        '-Xmx',
        'header_translations',
        'validation_score_field',
    ]
    allowed_ions = (
        'a', 'b', 'c', 'y~', 'x', 'y', 'z', 'b~', 'y-18', 'b-18', 'Y',
    )
    # prefix MSFragger puts in front of the amino acid of a variable mod
    opt_pos_prefix = {
        'Prot-N-term': '[',
        'Prot-C-term': ']',
        'N-term': 'n',
        'C-term': 'c',
        'any': None,
    }
    # param keys used for terminal fixed modifications
    fix_pos_key = {
        'Prot-N-term': 'add_Nterm_protein',
        'Prot-C-term': 'add_Cterm_protein',
        'N-term': 'add_Nterm_peptide',
        'C-term': 'add_Cterm_peptide',
    }

    if grouped['label']['label'] == '15N':
        self.print_info(
            'Search with label=15N may still be errorprone. '
            'Evaluate with care!',
            caller='WARNING')
        for aminoacid, N15_Diff in ursgal.ukb.DICT_15N_DIFF.items():
            existing = False
            for mod_dict in self.params['mods']['fix']:
                if aminoacid == mod_dict['aa']:
                    # fold the 15N shift into the already fixed modification
                    mod_dict['mass'] += N15_Diff
                    mod_dict['name'] += '_15N_{0}'.format(aminoacid)
                    existing = True
            if existing:
                continue
            mod_key = 'add_{0}_{1}'.format(
                aminoacid,
                ursgal.chemical_composition_kb.aa_names[aminoacid])
            self.params_to_write[mod_key] = N15_Diff

    self.mass_shift_lookup = {}
    self.mass_glycan_lookup = {}

    for msfragger_param_name, translated in grouped.items():
        if msfragger_param_name in write_exclusion_list:
            continue
        for param_value in translated.values():
            if msfragger_param_name == 'enzyme':
                # search_enzyme_name = Trypsin
                # search_enzyme_cutafter = KR
                # search_enzyme_butnotafter = P
                aa_site, term, inhibitor = param_value.split(';')
                self.params_to_write['search_enzyme_name'] = \
                    self.params['enzyme']
                self.params_to_write['search_enzyme_cutafter'] = aa_site
                self.params_to_write['search_enzyme_butnotafter'] = inhibitor

            elif msfragger_param_name == 'num_enzyme_termini':
                # 2 for enzymatic, 1 for semi-enzymatic,
                # 0 for nonspecific digestion
                if grouped['enzyme']['enzyme'] == 'nonspecific':
                    self.params_to_write[msfragger_param_name] = 0
                else:
                    self.params_to_write[msfragger_param_name] = param_value

            elif msfragger_param_name == 'clear_mz_range':
                min_mz, max_mz = param_value
                self.params_to_write[msfragger_param_name] = \
                    '{0} {1}'.format(min_mz, max_mz)

            elif msfragger_param_name == 'remove_precursor_range':
                min_mz, max_mz = param_value
                self.params_to_write[msfragger_param_name] = \
                    '{0},{1}'.format(min_mz, max_mz)

            elif msfragger_param_name == 'delta_mass_exclude_ranges':
                min_mz, max_mz = param_value
                self.params_to_write[msfragger_param_name] = \
                    '({0},{1})'.format(min_mz, max_mz)

            elif msfragger_param_name == 'precursor_mass_lower':
                self.params_to_write[msfragger_param_name] = -1 * param_value

            elif msfragger_param_name == 'modifications':
                # amino acid codes, * for any amino acid, [ and ] specifies
                # protein termini, n and c specifies peptide termini, e.g.
                #   variable_mod_01 = 15.9949 M
                #   variable_mod_02 = 42.0106 [*
                mass_to_mod_aa = ddict(list)
                for mod_dict in self.params['mods']['opt']:
                    if mod_dict['pos'] not in opt_pos_prefix:
                        print('''
            Unknown positional argument for given modification:
            {0}
            MSFragger cannot deal with this, please use one of the follwing:
            any, Prot-N-term, Prot-C-term, N-term, C-term
            '''.format(mod_dict['org']))
                        sys.exit(1)
                    aa_to_append = mod_dict['aa']
                    pos_modifier = opt_pos_prefix[mod_dict['pos']]
                    if pos_modifier is not None:
                        aa_to_append = '{0}{1}'.format(
                            pos_modifier, aa_to_append)
                    # mods sharing one mass end up on the same param line
                    mass_to_mod_aa[mod_dict['mass']].append(aa_to_append)
                for pos, (mass, aa_list) in enumerate(
                        mass_to_mod_aa.items(), start=1):
                    # two digit zero padding: ..._09, ..._10 (not ..._010)
                    self.params_to_write[
                        'variable_mod_{0:02d}'.format(pos)
                    ] = '{0} {1}'.format(mass, ''.join(aa_list))

                for mod_dict in self.params['mods']['fix']:
                    # e.g. add_C_cysteine = 57.021464
                    mod_key = fix_pos_key.get(mod_dict['pos'])
                    if mod_key is None:
                        mod_key = 'add_{0}_{1}'.format(
                            mod_dict['aa'],
                            ursgal.chemical_composition_kb.aa_names[
                                mod_dict['aa']])
                    self.params_to_write[mod_key] = mod_dict['mass']

            elif msfragger_param_name == 'override_charge':
                self.params_to_write[msfragger_param_name] = param_value
                if param_value == 1:
                    self.params_to_write['precursor_charge'] = \
                        '{0} {1}'.format(
                            grouped['precursor_min_charge'][
                                'precursor_min_charge'],
                            grouped['precursor_max_charge'][
                                'precursor_max_charge'])

            elif msfragger_param_name == 'fragment_ion_series':
                ion_list = []
                for ion in param_value:
                    if ion not in allowed_ions:
                        print('''
            [ WARNING ] MSFragger does not allow the following ion:
            {0}
            This ion will be skipped, i.e. not included in the search.
            '''.format(ion))
                        continue
                    ion_list.append(ion)
                self.params_to_write[msfragger_param_name] = \
                    ','.join(ion_list)

            elif msfragger_param_name in ('mass_offsets', 'Y_type_masses'):
                cc = ursgal.ChemicalComposition()
                umama = ursgal.UnimodMapper()
                masses = [str(m) for m in param_value['masses']]
                for m in param_value['glycans']:
                    cc.clear()
                    cc.add_glycan(m)
                    mass = cc._mass()
                    masses.append(str(mass))
                    tm = round(mass * 1e5)
                    self.mass_glycan_lookup.setdefault(tm, set()).add(m)
                for m in param_value['chemical_formulas']:
                    cc.clear()
                    cc.add_chemical_formula(m)
                    mass = cc._mass()
                    masses.append(str(mass))
                    tm = round(mass * 1e5)
                    self.mass_shift_lookup.setdefault(tm, set()).add(m)
                for m in param_value['unimods']:
                    unimod_mass = umama.name2mass(m)
                    masses.append(str(unimod_mass))
                    # key on the unimod's own mass; the former code reused
                    # the stale (or undefined) `mass` of the loops above
                    tm = round(unimod_mass * 1e5)
                    self.mass_shift_lookup.setdefault(tm, set()).add(m)
                self.params_to_write[msfragger_param_name] = \
                    '/'.join(masses)

            elif msfragger_param_name == 'diagnostic_fragments':
                cc = ursgal.ChemicalComposition()
                umama = ursgal.UnimodMapper()
                masses = list(param_value['masses'])
                for m in param_value['glycans']:
                    cc.clear()
                    cc.add_glycan(m)
                    masses.append(cc._mass())
                for m in param_value['chemical_formulas']:
                    cc.clear()
                    cc.add_chemical_formula(m)
                    masses.append(cc._mass())
                for m in param_value['unimods']:
                    masses.append(umama.name2mass(m))
                # written as m/z values computed with charge 1
                mzs = [
                    str(ursgal.ucore.calculate_mz(mass, 1))
                    for mass in masses
                ]
                self.params_to_write[msfragger_param_name] = '/'.join(mzs)

            else:
                self.params_to_write[msfragger_param_name] = param_value

    self.write_params_file()

    if self.input_file.lower().endswith(('.mzml', '.mzml.gz', '.mgf')):
        self.params['translations']['mzml_input_file'] = self.input_file
    else:
        raise Exception(
            'MSFragger input spectrum file must be in mzML or MGF format!')

    self.params['command_list'] = [
        'java',
        '-Xmx{0}'.format(
            self.params['translations']['_grouped_by_translated_key'][
                '-Xmx']['-xmx']),
        '-jar',
        self.exe,
        self.param_file_name,
        self.params['translations']['mzml_input_file'],
    ]
    self.params['translations']['output_file_incl_path'] = os.path.join(
        self.params['output_dir_path'], self.params['output_file'])
    return self.params
def preflight(self):
    """
    Formatting the command line and writing the param input file via
    self.params

    The translated ursgal parameters are converted into the key/value pairs
    collected in ``self.params_to_write``, ``self.write_params_file()`` is
    called and the java command line is stored in
    ``self.params["command_list"]``. ``self.mass_shift_lookup`` and
    ``self.mass_glycan_lookup`` are reset and filled with
    ``round(mass * 1e5) -> set of names`` entries for the configured
    mass offsets / Y type masses.

    Calls ``sys.exit(1)`` if an optional modification uses a position other
    than any, Prot-N-term, Prot-C-term, N-term or C-term.

    Returns:
        dict: self.params

    Raises:
        Exception: if the input file is neither mzML(.gz) nor MGF.
    """
    self.input_file = os.path.join(
        self.params["input_dir_path"], self.params["input_file"])
    # NOTE(review): self.input_file already includes the input directory, so
    # with an absolute input path os.path.join() drops output_dir_path here.
    # Left unchanged because the resulting location may be relied upon.
    self.param_file_name = os.path.join(
        self.params["output_dir_path"],
        "{0}_msfragger.params".format(self.input_file),
    )
    self.created_tmp_files.append(self.param_file_name)

    grouped = self.params["translations"]["_grouped_by_translated_key"]

    self.params_to_write = {
        "output_file_extension": "tsv",  # tsv or pepXML we fix it...
        "output_format": "tsv",  # pepXML or tsv
        "digest_mass_range": "{0} {1}".format(
            grouped["precursor_min_mass"]["precursor_min_mass"],
            grouped["precursor_max_mass"]["precursor_max_mass"],
        ),
    }

    # translated keys that are never written verbatim to the param file
    write_exclusion_list = [
        "precursor_min_mass",
        "precursor_max_mass",
        "precursor_min_charge",
        "precursor_max_charge",
        "label",
        "-Xmx",
        "header_translations",
        "validation_score_field",
    ]
    allowed_ions = (
        "a", "b", "c", "y~", "x", "y", "z", "b~", "y-18", "b-18", "Y",
    )
    # prefix MSFragger puts in front of the amino acid of a variable mod
    opt_pos_prefix = {
        "Prot-N-term": "[",
        "Prot-C-term": "]",
        "N-term": "n",
        "C-term": "c",
        "any": None,
    }
    # param keys used for terminal fixed modifications
    fix_pos_key = {
        "Prot-N-term": "add_Nterm_protein",
        "Prot-C-term": "add_Cterm_protein",
        "N-term": "add_Nterm_peptide",
        "C-term": "add_Cterm_peptide",
    }

    if grouped["label"]["label"] == "15N":
        self.print_info(
            "Search with label=15N may still be errorprone. "
            "Evaluate with care!",
            caller="WARNING",
        )
        for aminoacid, N15_Diff in ursgal.ukb.DICT_15N_DIFF.items():
            existing = False
            for mod_dict in self.params["mods"]["fix"]:
                if aminoacid == mod_dict["aa"]:
                    # fold the 15N shift into the already fixed modification
                    mod_dict["mass"] += N15_Diff
                    mod_dict["name"] += "_15N_{0}".format(aminoacid)
                    existing = True
            if existing:
                continue
            mod_key = "add_{0}_{1}".format(
                aminoacid,
                ursgal.chemical_composition_kb.aa_names[aminoacid],
            )
            self.params_to_write[mod_key] = N15_Diff

    self.mass_shift_lookup = {}
    self.mass_glycan_lookup = {}

    for msfragger_param_name, translated in grouped.items():
        if msfragger_param_name in write_exclusion_list:
            continue
        for param_value in translated.values():
            if msfragger_param_name == "enzyme":
                # search_enzyme_name = Trypsin
                # search_enzyme_cutafter = KR
                # search_enzyme_butnotafter = P
                aa_site, term, inhibitor = param_value.split(";")
                self.params_to_write["search_enzyme_name"] = \
                    self.params["enzyme"]
                self.params_to_write["search_enzyme_cutafter"] = aa_site
                self.params_to_write["search_enzyme_butnotafter"] = inhibitor

            elif msfragger_param_name == "num_enzyme_termini":
                # 2 for enzymatic, 1 for semi-enzymatic,
                # 0 for nonspecific digestion
                if grouped["enzyme"]["enzyme"] == "nonspecific":
                    self.params_to_write[msfragger_param_name] = 0
                else:
                    self.params_to_write[msfragger_param_name] = param_value

            elif msfragger_param_name == "clear_mz_range":
                min_mz, max_mz = param_value
                self.params_to_write[msfragger_param_name] = \
                    "{0} {1}".format(min_mz, max_mz)

            elif msfragger_param_name == "remove_precursor_range":
                min_mz, max_mz = param_value
                self.params_to_write[msfragger_param_name] = \
                    "{0},{1}".format(min_mz, max_mz)

            elif msfragger_param_name == "delta_mass_exclude_ranges":
                min_mz, max_mz = param_value
                self.params_to_write[msfragger_param_name] = \
                    "({0},{1})".format(min_mz, max_mz)

            elif msfragger_param_name == "precursor_mass_lower":
                self.params_to_write[msfragger_param_name] = -1 * param_value

            elif msfragger_param_name == "modifications":
                # amino acid codes, * for any amino acid, [ and ] specifies
                # protein termini, n and c specifies peptide termini, e.g.
                #   variable_mod_01 = 15.9949 M
                #   variable_mod_02 = 42.0106 [*
                mass_to_mod_aa = ddict(list)
                for mod_dict in self.params["mods"]["opt"]:
                    if mod_dict["pos"] not in opt_pos_prefix:
                        print("""
            Unknown positional argument for given modification:
            {0}
            MSFragger cannot deal with this, please use one of the follwing:
            any, Prot-N-term, Prot-C-term, N-term, C-term
            """.format(mod_dict["org"]))
                        sys.exit(1)
                    aa_to_append = mod_dict["aa"]
                    pos_modifier = opt_pos_prefix[mod_dict["pos"]]
                    if pos_modifier is not None:
                        aa_to_append = "{0}{1}".format(
                            pos_modifier, aa_to_append)
                    # mods sharing one mass end up on the same param line
                    mass_to_mod_aa[mod_dict["mass"]].append(aa_to_append)
                for pos, (mass, aa_list) in enumerate(
                        mass_to_mod_aa.items(), start=1):
                    # two digit zero padding: ..._09, ..._10 (not ..._010)
                    self.params_to_write[
                        "variable_mod_{0:02d}".format(pos)
                    ] = "{0} {1}".format(mass, "".join(aa_list))

                for mod_dict in self.params["mods"]["fix"]:
                    # e.g. add_C_cysteine = 57.021464
                    mod_key = fix_pos_key.get(mod_dict["pos"])
                    if mod_key is None:
                        mod_key = "add_{0}_{1}".format(
                            mod_dict["aa"],
                            ursgal.chemical_composition_kb.aa_names[
                                mod_dict["aa"]],
                        )
                    self.params_to_write[mod_key] = mod_dict["mass"]

            elif msfragger_param_name == "override_charge":
                self.params_to_write[msfragger_param_name] = param_value
                if param_value == 1:
                    self.params_to_write["precursor_charge"] = \
                        "{0} {1}".format(
                            grouped["precursor_min_charge"][
                                "precursor_min_charge"],
                            grouped["precursor_max_charge"][
                                "precursor_max_charge"],
                        )

            elif msfragger_param_name == "fragment_ion_series":
                ion_list = []
                for ion in param_value:
                    if ion not in allowed_ions:
                        print("""
            [ WARNING ] MSFragger does not allow the following ion:
            {0}
            This ion will be skipped, i.e. not included in the search.
            """.format(ion))
                        continue
                    ion_list.append(ion)
                self.params_to_write[msfragger_param_name] = \
                    ",".join(ion_list)

            elif msfragger_param_name in ("mass_offsets", "Y_type_masses"):
                cc = ursgal.ChemicalComposition()
                umama = ursgal.UnimodMapper()
                masses = [str(m) for m in param_value["masses"]]
                for m in param_value["glycans"]:
                    cc.clear()
                    cc.add_glycan(m)
                    mass = cc._mass()
                    masses.append(str(mass))
                    tm = round(mass * 1e5)
                    self.mass_glycan_lookup.setdefault(tm, set()).add(m)
                for m in param_value["chemical_formulas"]:
                    cc.clear()
                    cc.add_chemical_formula(m)
                    mass = cc._mass()
                    masses.append(str(mass))
                    tm = round(mass * 1e5)
                    self.mass_shift_lookup.setdefault(tm, set()).add(m)
                for m in param_value["unimods"]:
                    unimod_mass = umama.name2mass(m)
                    masses.append(str(unimod_mass))
                    # key on the unimod's own mass; the former code reused
                    # the stale (or undefined) `mass` of the loops above
                    tm = round(unimod_mass * 1e5)
                    self.mass_shift_lookup.setdefault(tm, set()).add(m)
                self.params_to_write[msfragger_param_name] = \
                    "/".join(masses)

            elif msfragger_param_name == "diagnostic_fragments":
                cc = ursgal.ChemicalComposition()
                umama = ursgal.UnimodMapper()
                masses = list(param_value["masses"])
                for m in param_value["glycans"]:
                    cc.clear()
                    cc.add_glycan(m)
                    masses.append(cc._mass())
                for m in param_value["chemical_formulas"]:
                    cc.clear()
                    cc.add_chemical_formula(m)
                    masses.append(cc._mass())
                for m in param_value["unimods"]:
                    masses.append(umama.name2mass(m))
                # written as m/z values computed with charge 1
                mzs = [
                    str(ursgal.ucore.calculate_mz(mass, 1))
                    for mass in masses
                ]
                self.params_to_write[msfragger_param_name] = "/".join(mzs)

            else:
                self.params_to_write[msfragger_param_name] = param_value

    self.write_params_file()

    if self.input_file.lower().endswith((".mzml", ".mzml.gz", ".mgf")):
        self.params["translations"]["mzml_input_file"] = self.input_file
    else:
        raise Exception(
            "MSFragger input spectrum file must be in mzML or MGF format!")

    self.params["command_list"] = [
        "java",
        "-Xmx{0}".format(
            self.params["translations"]["_grouped_by_translated_key"][
                "-Xmx"]["-xmx"]),
        "-jar",
        self.exe,
        self.param_file_name,
        self.params["translations"]["mzml_input_file"],
    ]
    self.params["translations"]["output_file_incl_path"] = os.path.join(
        self.params["output_dir_path"], self.params["output_file"])
    return self.params
def write_input_tsv(self, input_csv, tmp_dir):
    '''
    convert ursgal csv into PTM-Shepherd input tsv
    (same format as Philosopher output psms tsv)

    Args:
        input_csv (str): only its basename is used, with ``.csv`` replaced
            by ``.tsv``, to name the output file. The rows are read from
            ``self.params['translations']['csv_input_file']``.
        tmp_dir (str): directory the tsv is written to.

    Returns:
        str: path of the written tsv file.
    '''
    import pickle  # only used by the retention time fallback below

    print('[ PREFLGHT ] writing PTM-Shepherd config file ...')
    ptmshep_input = os.path.join(
        tmp_dir, os.path.basename(input_csv).replace('.csv', '.tsv'))

    # splits "<unimod name>:<position>"
    mod_pattern = re.compile(r'''(?P<name>.*):(?P<pos>[0-9]*$)''')

    fieldnames_list = [
        'Spectrum',
        'Peptide',
        'Modified Peptide',
        'Peptide Length',
        'Charge',
        'Retention',
        'Calculated Peptide Mass',
        'Calculated M/Z',
        'Delta Mass',
        'Assigned Modifications',
        'Is Unique',
    ]

    umama = ursgal.UnimodMapper()

    scan_rt_lookup_path = os.path.join(
        os.path.dirname(self.params['translations']['mzml_input_files'][0]),
        '_ursgal_lookup.pkl')
    # Loaded lazily: the lookup is only required for rows that carry no
    # retention time. Previously the name was never defined at all, so such
    # rows raised a NameError.
    scan_rt_lookup_dict = None

    with open(ptmshep_input, 'w', newline='') as new_csvfile, \
            open(self.params['translations']['csv_input_file'], 'r') as csv_file:
        writer = csv.DictWriter(
            new_csvfile, fieldnames=fieldnames_list, delimiter='\t')
        writer.writeheader()
        csv_reader = csv.DictReader(csv_file)
        for n, row in enumerate(csv_reader):
            if n % 500 == 0:
                print(
                    '[ PREFLGHT ] Processing line number: {0}'.format(n),
                    end='\r')

            mass_diffs_sum = 0.0
            for mass in row['Mass Difference'].split(';'):
                if mass == '':
                    continue
                if 'moda' in row['Search Engine'] and mass.startswith('+'):
                    # for these entries the difference is recomputed from
                    # the experimental and the calculated mass
                    exp_mass = ursgal.ucore.calculate_mass(
                        float(row['Exp m/z']), int(row['Charge']))
                    mass = '{}'.format(exp_mass - float(row['uCalc Mass']))
                mass_diffs_sum += float(mass.split(':')[0])

            is_unique = 'false' if '<|>' in row['Protein ID'] else 'true'

            rt = row.get('Retention Time (s)', '')
            if rt == '':
                if scan_rt_lookup_dict is None:
                    # NOTE: pickle.load executes arbitrary code contained in
                    # the file; the lookup must come from a trusted source.
                    with open(scan_rt_lookup_path, 'rb') as scan_rt_in:
                        scan_rt_lookup_dict = pickle.load(scan_rt_in)
                spectrum_id = int(row['Spectrum ID'])
                raw_file_name = os.path.basename(row['Raw data location'])
                input_file_basename_for_rt_lookup = raw_file_name.replace(
                    '.mgf', '')
                retention_time_in_minutes = scan_rt_lookup_dict[
                    input_file_basename_for_rt_lookup]['scan_2_rt'][
                        spectrum_id]
                row['Retention Time (s)'] = retention_time_in_minutes * 60

            new_mod_list = []
            for mod in row['Modifications'].split(';'):
                if mod == '':
                    continue
                match = mod_pattern.search(mod)
                pos = int(match.group('pos'))
                # position 0 is reported on the first residue, every other
                # position is 1-based
                if pos == 0:
                    aa = row['Sequence'][pos]
                else:
                    aa = row['Sequence'][pos - 1]
                mass = umama.name2mass(match.group('name'))
                new_mod_list.append('{0}{1}({2})'.format(pos, aa, mass))

            writer.writerow({
                'Spectrum': row['Spectrum Title'],
                'Peptide': row['Sequence'],
                'Modified Peptide': row['Sequence'],
                'Peptide Length': len(row['Sequence']),
                'Charge': row['Charge'],
                'Retention': row['Retention Time (s)'],
                'Calculated Peptide Mass': row['uCalc Mass'],
                'Calculated M/Z': row['uCalc m/z'],
                'Delta Mass': mass_diffs_sum,
                'Is Unique': is_unique,
                'Assigned Modifications': ', '.join(new_mod_list),
            })
    print('[ PREFLGHT ] Processing done')
    return ptmshep_input
def main(input_file=None,
         output_file=None,
         score_column_name=None,
         score_type=None):
    '''
    Convert csvs to ssl

    Every row of the input csv becomes one tab separated row holding file,
    scan, charge, the sequence with summed modification masses in square
    brackets behind the modified residue, score-type, score and the
    retention time divided by 60.

    Args:
        input_file (str): path of the csv to read.
        output_file (str): path of the ssl file to write.
        score_column_name (str): csv column copied into ``score``.
        score_type (str): value written to ``score-type`` for every row.

    Returns:
        str: output_file

    The function terminates through ``sys.exit`` when a modification has no
    positional information or its name can not be mapped to a mass.
    '''
    umama = ursgal.UnimodMapper()

    # the output file is opened in text mode, which already translates
    # '\n' on Windows, hence the platform dependent terminator
    csv_kwargs = {}
    if sys.platform == 'win32':
        csv_kwargs['lineterminator'] = '\n'
    else:
        csv_kwargs['lineterminator'] = '\r\n'

    new_fieldnames = [
        'file',
        'scan',
        'charge',
        'sequence',
        'score-type',
        'score',
        'retention-time',
    ]
    # compiled once instead of once per csv row
    pattern = re.compile(r''':(?P<pos>[0-9]*$)''')

    # both handles are closed even if sys.exit() or an error ends the loop
    with open(output_file, 'w') as output_file_object, \
            open(input_file, 'r') as in_file:
        csv_input = csv.DictReader(in_file)
        csv_output = csv.DictWriter(
            output_file_object, new_fieldnames, delimiter='\t', **csv_kwargs)
        csv_output.writeheader()
        for csv_line_dict in csv_input:
            rt = round(float(csv_line_dict['Retention Time (s)']) / 60, 10)
            sequence = csv_line_dict['Sequence']

            pos2mass = {}
            for mod in csv_line_dict['Modifications'].split(';'):
                if mod == '':
                    continue
                if ':' not in mod:
                    sys.exit(
                        'This unimod: {0} requires positional information'.
                        format(mod))
                for occ, match in enumerate(pattern.finditer(mod)):
                    try:
                        unimod_mass = umama.name2mass(mod[:match.start()])
                    except Exception:
                        # a bare except would also swallow SystemExit and
                        # KeyboardInterrupt
                        sys.exit(
                            'Can not map unimod {0}. extracted position argument: {1}'
                            .format(mod, match.start()))
                    position = int(match.group('pos'))
                    # terminal positions are attached to the first / last
                    # residue of the sequence
                    if position == 0:
                        position = 1
                    elif position > len(sequence):
                        position = len(sequence)
                    pos2mass[position] = \
                        pos2mass.get(position, 0) + unimod_mass
                    if occ >= 1:
                        sys.exit(
                            'Incorrect regex pattern for mod: {0}'.format(mod))

            seq_parts = []
            for p, aa in enumerate(sequence, start=1):
                seq_parts.append(aa)
                if p in pos2mass:
                    mass = pos2mass[p]
                    # non-negative masses get an explicit plus sign
                    if mass >= 0:
                        seq_parts.append('[+{0}]'.format(mass))
                    else:
                        seq_parts.append('[{0}]'.format(mass))

            csv_output.writerow({
                'file': csv_line_dict['Raw data location'],
                'scan': csv_line_dict['Spectrum ID'],
                'charge': csv_line_dict['Charge'],
                'sequence': ''.join(seq_parts),
                'score-type': score_type,
                'score': csv_line_dict[score_column_name],
                'retention-time': rt,
            })
    return output_file
def main(input_file=None,
         output_file=None,
         score_column_name=None,
         score_type=None):
    """
    Convert csvs to ssl

    Every row of the input csv becomes one tab separated row holding file,
    scan, charge, the sequence with summed modification masses in square
    brackets behind the modified residue, score-type, score and the
    retention time divided by 60.

    Args:
        input_file (str): path of the csv to read.
        output_file (str): path of the ssl file to write.
        score_column_name (str): csv column copied into ``score``.
        score_type (str): value written to ``score-type`` for every row.

    Returns:
        str: output_file

    The function terminates through ``sys.exit`` when a modification has no
    positional information or its name can not be mapped to a mass.
    """
    umama = ursgal.UnimodMapper()

    # the output file is opened in text mode, which already translates
    # "\n" on Windows, hence the platform dependent terminator
    csv_kwargs = {}
    if sys.platform == "win32":
        csv_kwargs["lineterminator"] = "\n"
    else:
        csv_kwargs["lineterminator"] = "\r\n"

    new_fieldnames = [
        "file",
        "scan",
        "charge",
        "sequence",
        "score-type",
        "score",
        "retention-time",
    ]
    # compiled once instead of once per csv row
    pattern = re.compile(r""":(?P<pos>[0-9]*$)""")

    # both handles are closed even if sys.exit() or an error ends the loop
    with open(output_file, "w") as output_file_object, \
            open(input_file, "r") as in_file:
        csv_input = csv.DictReader(in_file)
        csv_output = csv.DictWriter(
            output_file_object, new_fieldnames, delimiter="\t", **csv_kwargs)
        csv_output.writeheader()
        for csv_line_dict in csv_input:
            rt = round(float(csv_line_dict["Retention Time (s)"]) / 60, 10)
            sequence = csv_line_dict["Sequence"]

            pos2mass = {}
            for mod in csv_line_dict["Modifications"].split(";"):
                if mod == "":
                    continue
                if ":" not in mod:
                    sys.exit(
                        "This unimod: {0} requires positional information".
                        format(mod))
                for occ, match in enumerate(pattern.finditer(mod)):
                    try:
                        unimod_mass = umama.name2mass(mod[:match.start()])
                    except Exception:
                        # a bare except would also swallow SystemExit and
                        # KeyboardInterrupt
                        sys.exit(
                            "Can not map unimod {0}. extracted position argument: {1}"
                            .format(mod, match.start()))
                    position = int(match.group("pos"))
                    # terminal positions are attached to the first / last
                    # residue of the sequence
                    if position == 0:
                        position = 1
                    elif position > len(sequence):
                        position = len(sequence)
                    pos2mass[position] = \
                        pos2mass.get(position, 0) + unimod_mass
                    if occ >= 1:
                        sys.exit(
                            "Incorrect regex pattern for mod: {0}".format(mod))

            seq_parts = []
            for p, aa in enumerate(sequence, start=1):
                seq_parts.append(aa)
                if p in pos2mass:
                    mass = pos2mass[p]
                    # non-negative masses get an explicit plus sign
                    if mass >= 0:
                        seq_parts.append("[+{0}]".format(mass))
                    else:
                        seq_parts.append("[{0}]".format(mass))

            csv_output.writerow({
                "file": csv_line_dict["Raw data location"],
                "scan": csv_line_dict["Spectrum ID"],
                "charge": csv_line_dict["Charge"],
                "sequence": "".join(seq_parts),
                "score-type": score_type,
                "score": csv_line_dict[score_column_name],
                "retention-time": rt,
            })
    return output_file
def write_input_tsv(self, input_csv, tmp_dir):
    """
    convert ursgal csv into PTM-Shepherd input tsv
    (same format as Philosopher output psms tsv)

    Args:
        input_csv (str): only its basename is used, with ``.csv`` replaced
            by ``.tsv``, to name the output file. The rows are read from
            ``self.params["translations"]["csv_input_file"]``.
        tmp_dir (str): directory the tsv is written to.

    Returns:
        str: path of the written tsv file.
    """
    import pickle  # only used by the retention time fallback below

    print("[ PREFLGHT ] writing PTM-Shepherd config file ...")
    ptmshep_input = os.path.join(
        tmp_dir, os.path.basename(input_csv).replace(".csv", ".tsv"))

    # splits "<unimod name>:<position>"
    mod_pattern = re.compile(r"""(?P<name>.*):(?P<pos>[0-9]*$)""")

    fieldnames_list = [
        "Spectrum",
        "Peptide",
        "Modified Peptide",
        "Peptide Length",
        "Charge",
        "Retention",
        "Calculated Peptide Mass",
        "Calculated M/Z",
        "Delta Mass",
        "Assigned Modifications",
        "Is Unique",
    ]

    umama = ursgal.UnimodMapper()

    scan_rt_lookup_path = os.path.join(
        os.path.dirname(self.params["translations"]["mzml_input_files"][0]),
        "_ursgal_lookup.pkl",
    )
    # Loaded lazily: the lookup is only required for rows that carry no
    # retention time. Previously the name was never defined at all, so such
    # rows raised a NameError.
    scan_rt_lookup_dict = None

    with open(ptmshep_input, "w", newline="") as new_csvfile, open(
            self.params["translations"]["csv_input_file"], "r") as csv_file:
        writer = csv.DictWriter(
            new_csvfile, fieldnames=fieldnames_list, delimiter="\t")
        writer.writeheader()
        csv_reader = csv.DictReader(csv_file)
        for n, row in enumerate(csv_reader):
            if n % 500 == 0:
                print(
                    "[ PREFLGHT ] Processing line number: {0}".format(n),
                    end="\r")

            mass_diffs_sum = 0.0
            for mass in row["Mass Difference"].split(";"):
                if mass == "":
                    continue
                if "moda" in row["Search Engine"] and mass.startswith("+"):
                    # for these entries the difference is recomputed from
                    # the experimental and the calculated mass
                    exp_mass = ursgal.ucore.calculate_mass(
                        float(row["Exp m/z"]), int(row["Charge"]))
                    mass = "{}".format(exp_mass - float(row["uCalc Mass"]))
                mass_diffs_sum += float(mass.split(":")[0])

            is_unique = "false" if "<|>" in row["Protein ID"] else "true"

            rt = row.get("Retention Time (s)", "")
            if rt == "":
                if scan_rt_lookup_dict is None:
                    # NOTE: pickle.load executes arbitrary code contained in
                    # the file; the lookup must come from a trusted source.
                    with open(scan_rt_lookup_path, "rb") as scan_rt_in:
                        scan_rt_lookup_dict = pickle.load(scan_rt_in)
                spectrum_id = int(row["Spectrum ID"])
                raw_file_name = os.path.basename(row["Raw data location"])
                input_file_basename_for_rt_lookup = raw_file_name.replace(
                    ".mgf", "")
                retention_time_in_minutes = scan_rt_lookup_dict[
                    input_file_basename_for_rt_lookup]["scan_2_rt"][
                        spectrum_id]
                row["Retention Time (s)"] = retention_time_in_minutes * 60

            new_mod_list = []
            for mod in row["Modifications"].split(";"):
                if mod == "":
                    continue
                match = mod_pattern.search(mod)
                pos = int(match.group("pos"))
                # position 0 is reported on the first residue, every other
                # position is 1-based
                if pos == 0:
                    aa = row["Sequence"][pos]
                else:
                    aa = row["Sequence"][pos - 1]
                mass = umama.name2mass(match.group("name"))
                new_mod_list.append("{0}{1}({2})".format(pos, aa, mass))

            writer.writerow({
                "Spectrum": row["Spectrum Title"],
                "Peptide": row["Sequence"],
                "Modified Peptide": row["Sequence"],
                "Peptide Length": len(row["Sequence"]),
                "Charge": row["Charge"],
                "Retention": row["Retention Time (s)"],
                "Calculated Peptide Mass": row["uCalc Mass"],
                "Calculated M/Z": row["uCalc m/z"],
                "Delta Mass": mass_diffs_sum,
                "Is Unique": is_unique,
                "Assigned Modifications": ", ".join(new_mod_list),
            })
    print("[ PREFLGHT ] Processing done")
    return ptmshep_input