def setUp(self):
    self.uc = ursgal.UController()
    self.uc.params['translations'] = {}
    self.uc.UNODE_UPARAMS.update({
        'test_ions': {
            'style': 'test_style_1',
            'ukey': 'test_ions',
            'ukey_translated': '__test_00000_ions',
            'default_value': 'Yes',
            'default_value_translated': True,
            'uvalue_style_translation': {
                'Yes': True,
                'No': False
            },
            'triggers_rerun': True
        },
        'score_test_ions': {
            'style': 'test_style_1',
            'ukey': 'score_test_ions',
            'ukey_translated': '__test_00000_ions',
            'default_value': True,
            'default_value_translated': 'Please yes translate',
            'uvalue_style_translation': {
                True: 'Please yes translate',
                False: 'No please leave me alone'
            },
            'triggers_rerun': True
        },
        'list_of_things': {
            'style': 'test_style_1',
            'ukey': 'list_of_things',
            'ukey_translated': 'list_of_things',
            'default_value': [True, True, True],
            'default_value_translated': [True, True, True],
            'uvalue_style_translation': {},
            'triggers_rerun': True
        }
    })
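# Illustration (plain Python, no ursgal API assumed): how a
# 'uvalue_style_translation' table like the ones registered above maps a
# human-readable default value to its engine-level counterpart.
uparam = {
    'default_value': 'Yes',
    'uvalue_style_translation': {'Yes': True, 'No': False},
}
# fall back to the raw value when no translation is defined,
# as for 'list_of_things' with its empty translation table
translated = uparam['uvalue_style_translation'].get(
    uparam['default_value'], uparam['default_value']
)
assert translated is True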
def run(self):
    '''
    Download all resources from our webpage to ursgal/resources.
    '''
    working_directory = os.path.dirname(os.path.realpath(__file__))
    if os.path.exists(os.path.join(working_directory, 'ursgal')) is False:
        print('Could not find ursgal directory')
        sys.exit(1)
    import ursgal
    uc = ursgal.UController()
    downloaded_zips = uc.download_resources(resources=None)
    if len(downloaded_zips) == 0:
        print('[ INFO ] No engines were downloaded, all should be available')
    else:
        print('[ INFO ] Downloaded and installed {0} engine(s)'.format(
            len(downloaded_zips)))
        for engine, zip_file in downloaded_zips:
            print('[ INFO ] Engine: {0} has been installed from {1}'.format(
                engine, zip_file))
def main():
    '''
    Simple example script showing how to generate a target decoy database.

    Note:
        By default a 'shuffled peptide preserving cleavage sites' database
        is generated. For this script a 'reverse protein' database is
        generated.

    usage:
        ./target_decoy_generation_example.py
    '''
    params = {
        'enzyme': 'trypsin',
        'decoy_generation_mode': 'reverse_protein',
    }
    fasta_database_list = [
        os.path.join(
            os.pardir,
            'example_data',
            'BSA.fasta'
        )
    ]
    uc = ursgal.UController(params=params)
    new_target_decoy_db_name = uc.execute_misc_engine(
        input_file=fasta_database_list,
        engine='generate_target_decoy_1_0_0',
        output_file_name='my_BSA_target_decoy.fasta',
    )
    print('Generated target decoy database: {0}'.format(
        new_target_decoy_db_name))
def main():
    """
    Example for plotting a simple Venn diagram with single ursgal csv files.

    usage:
        ./simple_venn_example.py
    """
    uc = ursgal.UController(
        profile="LTQ XL low res",
        params={
            "visualization_label_positions": {
                "0": "omssa",
                "1": "xtandem"
            }
        },
    )
    file_list = [
        os.path.join(
            os.pardir, "tests", "data", "omssa_2_1_9",
            "test_BSA1_omssa_2_1_9.csv"
        ),
        os.path.join(
            os.pardir, "tests", "data", "xtandem_sledgehammer",
            "test_BSA1_xtandem_sledgehammer.csv",
        ),
    ]
    uc.visualize(
        input_files=file_list,
        engine="venndiagram_1_1_0",
        force=True,
    )
    return
def main():
    '''
    Download all resources from our webpage
    '''
    uc = ursgal.UController()
    zip_files_list, update_kb_list = uc.prepare_resources(
        root_zip_target_folder='/tmp'
    )
    print()
    print('<<<Summary>>>')
    if len(zip_files_list) == 0:
        print('[ INFO ] All files are correctly stored in online repository')
    else:
        for zip_file, md5 in zip_files_list:
            print(
                '[ INFO ] File: {0} was created with md5: {1}'.format(
                    zip_file,
                    md5
                )
            )
    print()
    print()
    if len(update_kb_list) == 0:
        print('[ INFO ] No kb information has to be updated')
    else:
        for engine, message in update_kb_list:
            print('[ INFO ] Please update kb for {0}'.format(engine))
            print(message)
    print()
    return
def main():
    '''
    Example script for filtering validated results for a PEP <= 0.01
    and removing all decoys.

    usage:
        ./filter_csv_validation_example.py

    Will produce a file containing only target sequences with a posterior
    error probability of less than or equal to 1 percent.
    '''
    params = {
        'csv_filter_rules': [
            ['PEP', 'lte', 0.01],
            ['Is decoy', 'equals', 'false']
        ]
    }
    csv_file_to_filter = os.path.join(
        os.pardir,
        'example_data',
        'misc',
        'filter_csv_validation_example_omssa_2_1_9_unified_percolator_2_08_validated.csv'
    )
    uc = ursgal.UController(params=params)
    filtered_csv = uc.filter_csv(input_file=csv_file_to_filter)
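# A minimal sketch of what the two rules above express, assuming the
# documented rule semantics ('lte' = less than or equal, 'equals' = exact
# string match); filter_csv itself applies these rules internally.
def passes_filter_rules(row):
    return float(row['PEP']) <= 0.01 and row['Is decoy'] == 'false'

assert passes_filter_rules({'PEP': '0.003', 'Is decoy': 'false'}) is True
assert passes_filter_rules({'PEP': '0.05', 'Is decoy': 'false'}) is False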
def main(class_version):
    '''
    Example script to demonstrate speed and memory efficiency of the new
    upeptide_mapper.

    All tryptic peptides (n=1,094,395, 6 < len(peptide) < 40) are mapped
    to the Chlamydomonas reinhardtii (38876 entries) target-decoy database.

    usage:
        ./complete_chlamydomonas_proteome_match.py <class_version>

    Class versions
        * UPeptideMapper_v2
        * UPeptideMapper_v3
        * UPeptideMapper_v4
    '''
    input_params = {
        'database': os.path.join(
            os.pardir,
            'example_data',
            'Creinhardtii_281_v5_5_CP_MT_with_contaminants_target_decoy.fasta'
        ),
        'http_url': 'https://www.sas.upenn.edu/~sschulze/Creinhardtii_281_v5_5_CP_MT_with_contaminants_target_decoy.fasta',
        'http_output_folder': os.path.join(
            os.pardir,
            'example_data',
        )
    }
    uc = ursgal.UController(params=input_params)
    if os.path.exists(input_params['database']) is False:
        uc.fetch_file(engine='get_http_files_1_0_0')
    print('Parsing fasta and digesting sequences')
    peptides = set()
    digest_start = time.time()
    for fastaID, sequence in ursgal.ucore.parse_fasta(
            open(input_params['database'], 'r')):
        tryptic_peptides = ursgal.ucore.digest(
            sequence,
            ('KR', 'C'),
            no_missed_cleavages=True
        )
        for p in tryptic_peptides:
            if 6 <= len(p) <= 40:
                peptides.add(p)
    print('Parsing fasta and digesting sequences took {0:1.2f} seconds'.format(
        time.time() - digest_start))
    if sys.platform == 'win32':
        print(
            '[ WARNING ] pyahocorasick can not be installed via pip on Windows at the moment\n'
            '[ WARNING ] Falling back to UPeptideMapper_v2'
        )
        class_version = 'UPeptideMapper_v2'
    upapa_class = uc.unodes['upeptide_mapper_1_0_0'][
        'class'].import_engine_as_python_function(class_version)
    print('Buffering fasta and mapping {0} peptides'.format(len(peptides)))
    map_start = time.time()
    if class_version == 'UPeptideMapper_v2':
        peptide_mapper = upapa_class(word_len=6)
        fasta_lookup_name = peptide_mapper.build_lookup_from_file(
            input_params['database'],
            force=False,
        )
        args = [list(peptides), fasta_lookup_name]
    elif class_version == 'UPeptideMapper_v3':
        peptide_mapper = upapa_class(input_params['database'])
        fasta_lookup_name = peptide_mapper.fasta_name
        args = [list(peptides), fasta_lookup_name]
    elif class_version == 'UPeptideMapper_v4':
        peptide_mapper = upapa_class(input_params['database'])
        args = [list(peptides)]
    p2p_mappings = peptide_mapper.map_peptides(*args)
    print('Buffering fasta and mapping {0} peptides took {1:1.2f} seconds'.format(
        len(peptides), time.time() - map_start))
    if len(p2p_mappings.keys()) == len(peptides):
        print('All peptides have been mapped!')
    else:
        print('WARNING: Not all peptides have been mapped')
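# Orientation for the result structure (inferred from the v3 vs v4
# comparison script elsewhere in this collection, which compares exactly
# the keys 'start', 'end', 'pre', 'post' and 'id'): p2p_mappings maps each
# peptide to a list of hit dicts, e.g.
# p2p_mappings['ELVISK'] == [
#     {'id': 'Cre01.g000150.t1.2', 'start': 12, 'end': 17,
#      'pre': 'K', 'post': 'L'},
# ]
# where the protein id and coordinates here are made-up placeholders.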
def main(folder=None, enzyme=None, target_decoy_database=None):
    '''
    Workflow for the analysis of a dataset with multiple runs per sample,
    in which fully and semi-enzymatically cleaved peptides are validated
    separately.
    '''
    # define folder with mzML_files as sys.argv[1]
    mzML_files = []
    offset_files = []
    for sample in offsets.keys():
        for mzml in glob.glob(os.path.join(folder, sample, '*.mzML')):
            mzML_files.append(mzml)
        for offset_file in offsets[sample].keys():
            offset_files.append(offset_file)
    for mzml in mzML_files:
        if os.path.basename(mzml) not in offset_files:
            print('mzML file in folder but NOT in offset dict: {0}'.format(mzml))
            sys.exit(1)
    mass_spectrometer = 'QExactive+'
    search_engines = [
        'xtandem_vengeance',
        'msfragger_20190222',
        'msgfplus_v2019_04_18',
    ]
    validation_engine = 'percolator_3_4_0'
    params = {
        'database': target_decoy_database,
        'enzyme': enzyme,
        'csv_filter_rules': [
            ['Is decoy', 'equals', 'false'],
            ['PEP', 'lte', 0.01],
            ['Conflicting uparam', 'contains_not', 'enzyme'],
        ],
        'precursor_mass_tolerance_minus': 8,
        'precursor_mass_tolerance_plus': 8,
        'frag_mass_tolerance': 0.4,
        'frag_mass_tolerance_unit': 'da',
        'rounded_mass_decimals': 2,
        '-xmx': '32g',
        'peptide_mapper_class_version': 'UPeptideMapper_v4',
        'use_pyqms_for_mz_calculation': True,
        'semi_enzyme': True,
        'precursor_min_charge': 1,
        'precursor_max_charge': 5,
        'percolator_post_processing': 'mix-max',
        'psm_defining_colnames': [
            'Spectrum Title',
            'Sequence',
            'Modifications',
            'Charge',
            'Is decoy',
        ],
    }
    uc = ursgal.UController(profile=mass_spectrometer, params=params)
    all_result_files = []
    semi_result_files = []
    full_result_files = []
    for n, sample in enumerate(offsets.keys()):
        all_validated_result_files = []
        semi_validated_result_files = []
        full_validated_result_files = []
        combined_pep_result_files = []
        for search_engine in search_engines:
            results = []
            for spec_file in offsets[sample].keys():
                offset = offsets[sample][spec_file]
                if offset == 'skip':
                    continue
                uc.params['machine_offset_in_ppm'] = offset
                dirname = folder
                mzml_file = os.path.join(dirname, spec_file)
                mgf_file = uc.convert(
                    input_file=mzml_file,
                    engine='mzml2mgf_2_0_0',
                )
                uc.params['modifications'] = [
                    'C,fix,any,Carbamidomethyl',
                    'M,opt,any,Oxidation',
                    '*,opt,Prot-N-term,Acetyl',
                ]
                search_result = uc.search_mgf(
                    input_file=mgf_file,
                    engine=search_engine,
                )
                uc.params['prefix'] = ''
                converted_result = uc.convert(
                    input_file=search_result,
                    guess_engine=True,
                )
                mapped_results = uc.execute_misc_engine(
                    input_file=converted_result,
                    engine='upeptide_mapper',
                )
                unified_search_results = uc.execute_misc_engine(
                    input_file=mapped_results,
                    engine='unify_csv',
                    # force=True,
                )
                results.append(unified_search_results)
            results_one_engine = uc.execute_misc_engine(
                input_file=results,
                engine='merge_csvs',
                # merge_duplicates=True,
                # force=True,
            )
            all_validated_csv = uc.validate(
                input_file=results_one_engine,
                engine=validation_engine,
            )
            all_validated_result_files.append(all_validated_csv)
            uc.params.update({
                'csv_filter_rules': [
                    ['Enzyme Specificity', 'contains_not', 'full'],
                ],
                'prefix': 'Semi',
            })
            semi_filtered_csv = uc.execute_misc_engine(
                input_file=results_one_engine,
                engine='filter_csv',
            )
            semi_validated_csv = uc.validate(
                input_file=semi_filtered_csv,
                engine=validation_engine,
            )
            semi_validated_result_files.append(semi_validated_csv)
            uc.params.update({
                'csv_filter_rules': [
                    ['Enzyme Specificity', 'contains', 'full'],
                ],
                'prefix': 'Full',
            })
            full_filtered_csv = uc.execute_misc_engine(
                input_file=results_one_engine,
                engine='filter_csv',
            )
            full_validated_csv = uc.validate(
                input_file=full_filtered_csv,
                engine=validation_engine,
            )
            full_validated_result_files.append(full_validated_csv)
            uc.params.update({
                'csv_filter_rules': [
                    ['Is decoy', 'equals', 'false'],
                    ['PEP', 'lte', 0.01],
                    ['Conflicting uparam', 'contains_not', 'enzyme'],
                ],
                'prefix': '',
            })
        all_combined_results = uc.combine_search_results(
            input_files=all_validated_result_files,
            engine='combine_pep_1_0_0',
        )
        semi_combined_results = uc.combine_search_results(
            input_files=semi_validated_result_files,
            engine='combine_pep_1_0_0',
        )
        full_combined_results = uc.combine_search_results(
            input_files=full_validated_result_files,
            engine='combine_pep_1_0_0',
        )
        uc.params['csv_filter_rules'] = [
            # ['Is decoy', 'equals', 'false'],
            ['combined PEP', 'lte', 0.01],
            ['Conflicting uparam', 'contains_not', 'enzyme'],
        ]
        all_filtered_combined_results = uc.execute_misc_engine(
            input_file=all_combined_results,
            engine='filter_csv',
        )
        all_result_files.append(all_filtered_combined_results)
        semi_filtered_combined_results = uc.execute_misc_engine(
            input_file=semi_combined_results,
            engine='filter_csv',
        )
        semi_result_files.append(semi_filtered_combined_results)
        full_filtered_combined_results = uc.execute_misc_engine(
            input_file=full_combined_results,
            engine='filter_csv',
        )
        full_result_files.append(full_filtered_combined_results)
    for result_file_list in [all_result_files, semi_result_files, full_result_files]:
        uc.params.update({
            'psm_defining_colnames': [
                'Spectrum Title',
                'Sequence',
                'Modifications',
                'Charge',
                'Is decoy',
            ],
        })
        all_files = uc.execute_misc_engine(
            input_file=result_file_list,
            engine='merge_csvs',
            merge_duplicates=True,
        )
        uc.params.update({
            'validation_score_field': 'combined PEP',
            'bigger_scores_better': False,
            'num_compared_psms': 10,
            'accept_conflicting_psms': False,
            'threshold_is_log10': True,
            'score_diff_threshold': 1,
            'psm_defining_colnames': [
                'Spectrum Title',
                'Sequence',
            ],
        })
        sanitized_combined_results = uc.execute_misc_engine(
            input_file=all_files,
            engine='sanitize_csv',
        )
def unify_csv(file, engine):
    uc = ursgal.UController(params=params, profile="QExactive+", verbose=False)
    uc.scan_rt_lookup_path = "tests/data/_ursgal_test_lookup.pkl"
    uc.map_mods()
    unify_csv_main = uc.unodes["unify_csv_1_0_0"][
        "class"].import_engine_as_python_function()
    output_csv = os.path.join(
        "tests", "data", os.path.splitext(file)[0] + "_unified.csv")
    input_csv = os.path.join("tests", "data", file)
    scan_rt_lookup = pickle.load(
        open(os.path.join("tests", "data", "_ursgal_test_pickle.pkl"), "rb"))
    unify_csv_main(
        input_file=input_csv,
        output_file=output_csv,
        scan_rt_lookup=scan_rt_lookup,
        params={
            "translations": {
                "decoy_tag": "decoy_",
                "enzyme": "KR;C;P",
                "semi_enzyme": False,
                "database": os.path.join("tests", "data", "P0ADZ4.fasta"),
                "protein_delimiter": "<|>",
                "psm_merge_delimiter": ";",
                "keep_asp_pro_broken_peps": True,
                "precursor_mass_tolerance_minus": 5,
                "precursor_mass_tolerance_plus": 5,
                "precursor_isotope_range": "0,1",
                "max_missed_cleavages": 2,
                "rounded_mass_decimals": 3,
                "use_pyqms_for_mz_calculation": False,
                "aa_exception_dict": {
                    "J": {
                        "original_aa": ["L", "I"],
                    },
                    "O": {
                        "original_aa": ["K"],
                        "unimod_name": "Methylpyrroline",
                    },
                },
            },
            "label": "",
            "mods": uc.params["mods"],
            "prefix": "",
            "psm_defining_colnames": [
                "Spectrum Title",
                "Sequence",
                "Modifications",
                "Mass Difference",
                "Charge",
                "Is decoy",
            ],
        },
        search_engine=engine,
    )
    reader_produced = [line for line in csv.DictReader(open(output_csv))]
    reader_expected = [
        line for line in csv.DictReader(open(output_csv + "_expected.csv"))
    ]
    for pos, line in enumerate(reader_produced):
        print("#{pos:0>5} Produced: {mod}".format(
            pos=pos, mod=line["Modifications"]))
        print("#{pos:0>5} Expected: {mod}".format(
            pos=pos, mod=reader_expected[pos]["Modifications"]))
        assert line["Modifications"] == reader_expected[pos]["Modifications"]
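# Reading aid for the 'aa_exception_dict' above (my paraphrase of its
# intent, not an ursgal docstring): non-standard one-letter codes are
# resolved to standard amino acids during unification; 'J' stands for Leu
# or Ile, and 'O' is treated as 'K' carrying the Unimod modification
# 'Methylpyrroline'.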
def main(folder=None, enzyme=None, target_decoy_database=None):
    '''
    Workflow for the analysis of a dataset with multiple runs per sample.

    Usage:
        python <script_name.py> <folder_with_mzML> <enzyme> <path_to_database>
    '''
    # define folder with mzML_files as sys.argv[1]
    mzML_files = []
    offset_files = []
    for sample in offsets.keys():
        for mzml in glob.glob(os.path.join(folder, sample, '*.mzML')):
            mzML_files.append(mzml)
        for offset_file in offsets[sample].keys():
            offset_files.append(offset_file)
    for mzml in mzML_files:
        if os.path.basename(mzml) not in offset_files:
            print('mzML file in folder but NOT in offset dict: {0}'.format(mzml))
            sys.exit(1)
    mass_spectrometer = 'QExactive+'
    search_engines = [
        'xtandem_vengeance',
        'msfragger_20190222',
        'msgfplus_v2019_04_18',
    ]
    validation_engine = 'percolator_3_4_0'
    params = {
        'database': target_decoy_database,
        'enzyme': enzyme,
        'precursor_mass_tolerance_minus': 10,
        'precursor_mass_tolerance_plus': 10,
        'frag_mass_tolerance': 10,
        'frag_mass_tolerance_unit': 'ppm',
        'rounded_mass_decimals': 2,
        '-xmx': '32g',
        'peptide_mapper_class_version': 'UPeptideMapper_v4',
        'use_pyqms_for_mz_calculation': True,
        'percolator_post_processing': 'mix-max',
        'psm_defining_colnames': [
            'Spectrum Title',
            'Sequence',
            'Modifications',
            'Charge',
            'Is decoy',
        ],
        'max_missed_cleavages': 3,
    }
    uc = ursgal.UController(profile=mass_spectrometer, params=params)
    all_result_files = []
    for n, sample in enumerate(offsets.keys()):
        validated_result_files = []
        combined_pep_result_files = []
        for search_engine in search_engines:
            results = []
            for spec_file in offsets[sample].keys():
                basename = spec_file
                dirname = os.path.join(folder)
                offset = offsets[sample][basename]
                spec_file_path = os.path.join(dirname, basename)
                if offset == 'skip':
                    continue
                uc.params['machine_offset_in_ppm'] = offset
                mgf_file = uc.convert(
                    input_file=spec_file_path,
                    engine='mzml2mgf_2_0_0',
                )
                uc.params['modifications'] = [
                    'C,fix,any,Carbamidomethyl',
                    'M,opt,any,Oxidation',
                    '*,opt,Prot-N-term,Acetyl',
                ]
                search_result = uc.search_mgf(
                    input_file=mgf_file,
                    engine=search_engine,
                )
                converted_result = uc.convert(
                    input_file=search_result,
                    guess_engine=True,
                )
                mapped_results = uc.execute_misc_engine(
                    input_file=converted_result,
                    engine='upeptide_mapper',
                )
                unified_search_results = uc.execute_misc_engine(
                    input_file=mapped_results,
                    engine='unify_csv'
                )
                results.append(unified_search_results)
                # validated_single_csv = uc.validate(
                #     input_file=unified_search_results,
                #     engine=validation_engine,
                # )
                # uc.params['csv_filter_rules'] = [
                #     # ['Is decoy', 'equals', 'false'],
                #     ['combined PEP', 'lte', 0.01],
                #     ['Conflicting uparam', 'contains_not', 'enzyme'],
                # ]
                # filtered_combined_results = uc.execute_misc_engine(
                #     input_file=validated_single_csv,
                #     engine='filter_csv',
                # )
            uc.params['prefix'] = sample
            results_one_engine = uc.execute_misc_engine(
                input_file=results,
                engine='merge_csvs',
                # merge_duplicates=True,
            )
            uc.params['prefix'] = ''
            validated_csv = uc.validate(
                input_file=results_one_engine,
                engine=validation_engine,
            )
            # filtered_combined_results = uc.execute_misc_engine(
            #     input_file=validated_csv,
            #     engine='filter_csv',
            # )
            validated_result_files.append(validated_csv)
        combined_results = uc.combine_search_results(
            input_files=validated_result_files,
            engine='combine_pep_1_0_0',
        )
        uc.params['csv_filter_rules'] = [
            ['combined PEP', 'lte', 0.01],
            ['Conflicting uparam', 'contains_not', 'enzyme'],
        ]
        filtered_combined_results = uc.execute_misc_engine(
            input_file=combined_results,
            engine='filter_csv',
        )
        all_result_files.append(filtered_combined_results)
    results_all_files = uc.execute_misc_engine(
        input_file=all_result_files,
        engine='merge_csvs',
        merge_duplicates=True,
    )
    uc.params.update({
        'validation_score_field': 'combined PEP',
        'bigger_scores_better': False,
        'num_compared_psms': 10,
        'accept_conflicting_psms': False,
        'threshold_is_log10': True,
        'score_diff_threshold': 1,
        'psm_defining_colnames': [
            'Spectrum Title',
            'Sequence',
        ],
    })
    sanitized_combined_results = uc.execute_misc_engine(
        input_file=results_all_files,
        engine='sanitize_csv',
    )
def main():
    '''
    Example script performing a simple machine ppm offset parameter sweep.
    The m/z values in the example mgf file are changed stepwise and the
    total number of identified peptides is reported in the final output.

    usage:
        ./bsa_ppm_offset_test.py

    Note:
        As expected, if the offset becomes too big, no peptides can be
        found anymore.
    '''
    ppm_offsets = [
        (-10, '-10_ppm_offset'),
        (-9, '-9_ppm_offset'),
        (-8, '-8_ppm_offset'),
        (-7, '-7_ppm_offset'),
        (-6, '-6_ppm_offset'),
        (-5, '-5_ppm_offset'),
        (-4, '-4_ppm_offset'),
        (-3, '-3_ppm_offset'),
        (-2, '-2_ppm_offset'),
        (-1, '-1_ppm_offset'),
        (None, '0_ppm_offset'),
        (1, '1_ppm_offset'),
        (2, '2_ppm_offset'),
        (3, '3_ppm_offset'),
        (4, '4_ppm_offset'),
        (5, '5_ppm_offset'),
        (6, '6_ppm_offset'),
        (7, '7_ppm_offset'),
        (8, '8_ppm_offset'),
        (9, '9_ppm_offset'),
        (10, '10_ppm_offset'),
    ]
    engine_list = ['xtandem_vengeance']
    R = ursgal.UController(
        profile='LTQ XL low res',
        params={
            'database': os.path.join(os.pardir, 'example_data', 'BSA.fasta'),
            'modifications': [
                'M,opt,any,Oxidation',        # Met oxidation
                'C,fix,any,Carbamidomethyl',  # Carbamidomethylation
                '*,opt,Prot-N-term,Acetyl',   # N-terminal acetylation
            ],
        }
    )
    mzML_file = os.path.join(
        os.pardir,
        'example_data',
        'BSA_machine_ppm_offset_example',
        'BSA1.mzML'
    )
    if os.path.exists(mzML_file) is False:
        R.params['http_url'] = 'http://sourceforge.net/p/open-ms/code/HEAD/tree/OpenMS/share/OpenMS/examples/BSA/BSA1.mzML?format=raw'
        R.params['http_output_folder'] = os.path.dirname(mzML_file)
        R.fetch_file(engine='get_http_files_1_0_0')
        try:
            shutil.move('{0}?format=raw'.format(mzML_file), mzML_file)
        except:
            shutil.move('{0}format=raw'.format(mzML_file), mzML_file)
    for engine in engine_list:
        for (ppm_offset, prefix) in ppm_offsets:
            R.params['machine_offset_in_ppm'] = ppm_offset
            R.params['prefix'] = prefix
            unified_search_result_file = R.search(
                input_file=mzML_file,
                engine=engine,
                force=False,
            )
    collector = ddict(set)
    for csv_path in glob.glob('{0}/*/*unified.csv'.format(
            os.path.dirname(mzML_file))):
        for line_dict in csv.DictReader(open(csv_path, 'r')):
            collector[csv_path].add(line_dict['Sequence'])
    for csv_path, peptide_set in sorted(collector.items()):
        file_name = os.path.basename(csv_path)
        offset = file_name.split('_')[0]
        print('Search with {0: >3} ppm offset found {1: >2} peptides'.format(
            offset, len(peptide_set)))
    return
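# Background for the sweep above: a ppm offset scales every m/z value
# multiplicatively. A minimal sketch of that correction, assuming the
# common definition of parts-per-million (the exact formula inside
# ursgal's mzml2mgf converter may differ):
def apply_ppm_offset(mz, offset_in_ppm):
    # +5 ppm shifts an m/z of 1000.0 up by 0.005 Th
    return mz * (1 + offset_in_ppm * 1e-6)

assert abs(apply_ppm_offset(1000.0, 5) - 1000.005) < 1e-9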
#!/usr/bin/env python3.4
# encoding: utf-8
'''
Test the unify_csv function for msgfplus engine
'''
import ursgal
import csv
import pickle
import os

R = ursgal.UController()

scan_rt_lookup = pickle.load(
    open(
        os.path.join(
            'tests',
            'data',
            '_test_ursgal_lookup.pkl'
        ),
        'rb'
    )
)

unify_csv_main = R.unodes['unify_csv_1_0_0'][
    'class'].import_engine_as_python_function()

input_csv = os.path.join(
    'tests',
    'data',
    'novor_1_1beta',
def main(dirpath, skip_old=False, num_specs=1):
    uc = ursgal.UController()
    uc.params.update(
        {
            "bigger_scores_better": False,
            "num_compared_psms": 10,
            "accept_conflicting_psms": False,
            "threshold_is_log10": True,
            "score_diff_threshold": 1,
            "psm_defining_colnames": [
                "Spectrum Title",
                "Sequence",
            ],
        }
    )
    pkl_name = os.path.join(dirpath, "datasets_result.pkl")
    fdr_pkl_name = os.path.join(dirpath, "fdr_result.pkl")
    old_exists = False
    if os.path.exists(pkl_name) and skip_old is True:
        # load results from previous analysis;
        # will only add datasets that are not part of it already
        print(">>>>>>>> loading pkl <<<<<<<<<<<")
        results_dict = pickle.load(open(pkl_name, "rb"))
        fdr_dict = pickle.load(open(fdr_pkl_name, "rb"))
        old_exists = True
    else:
        # collect proteins and peptides from result csv,
        # store in dict with all important data
        results_dict = {
            "all": {
                "num_spectra": 0,
                "instrument": set(),
                "lab": set(),
                # protein_groups, proteins and peptides are dicts that
                # contain sets for each level of confidence
                "protein_groups": {
                    "all": set(),
                    "safe_psm": set(),
                    "safe_seq": set(),
                    "safe_seq_num_spec": set(),
                    "safe_seq_num_spec_0005": set(),
                },
                "proteins": {
                    "all": set(),
                    "safe_psm": set(),
                    "safe_seq": set(),
                    "safe_seq_num_spec": set(),
                    "safe_seq_num_spec_0005": set(),
                },
                "peptides": {"all": set(), "safe": set(), "safe_num_specs": set()},
                "spectra": {"all": set()},
                # protein_dict in contrast is a nested dict with
                # protein/protein_group --> peptide sequence --> spectral
                # information (containing lists of 'spec_title', 'bayes_pep',
                # 'modifications', 'charge', 'psm_q_value', 'start_stop')
                "protein_dict": {},
            }
        }
        fdr_dict = {
            "peptides_seq_level": {},
            "peptides_psm_level": {},
            "peptides_seq_level_2specs": {},
            "glycopeptides_psm_level": {},
            "glycopeptides_seq_level": {},
            "glycopeptides_seq_level_2specs": {},
            "proteins_seq_level": {},
            "proteins_psm_level": {},
            "proteins_seq_level_2specs": {},
        }
    result_file_list = []
    for PRIDE_ID in datasets.keys():
        if skip_old is True and old_exists is True and PRIDE_ID in results_dict:
            continue
        print("reading:", PRIDE_ID)
        instrument = datasets[PRIDE_ID]["instrument"]
        results_dict["all"]["instrument"].add(instrument)
        lab = datasets[PRIDE_ID]["lab"]
        results_dict["all"]["lab"].add(lab)
        results_dict["all"]["num_spectra"] += datasets[PRIDE_ID]["num_spectra"]
        if PRIDE_ID not in results_dict.keys():
            results_dict[PRIDE_ID] = {
                "num_spectra": datasets[PRIDE_ID]["num_spectra"],
                "instrument": instrument,
                "lab": lab,
                "protein_groups": {
                    "all": set(),
                    "safe_psm": set(),
                    "safe_seq": set(),
                    "safe_seq_num_spec": set(),
                    "safe_seq_num_spec_0005": set(),
                },
                "proteins": {
                    "all": set(),
                    "safe_psm": set(),
                    "safe_seq": set(),
                    "safe_seq_num_spec": set(),
                    "safe_seq_num_spec_0005": set(),
                },
                "peptides": {"all": set(), "safe": set(), "safe_num_specs": set()},
                "spectra": {"all": set()},
                "protein_dict": {},
            }
        # collect proteins, peptides and corresponding spectrum_titles
        if datasets[PRIDE_ID]["folders"] != "":
            PRIDE_folder = os.path.join(PRIDE_ID, datasets[PRIDE_ID]["folders"])
        else:
            PRIDE_folder = PRIDE_ID
        merged_file = os.path.join(
            PRIDE_folder,
            datasets[PRIDE_ID]["result_file"],
        )
        result_file_list.append(merged_file)
        protein_ids = set()
        protein_groups = set()
        with open(merged_file, "r") as in_file:
            result_csv = csv.DictReader(in_file)
            for line_dict in result_csv:
                seq = line_dict["Sequence"]  # + line_dict['Modifications']
                mod = line_dict["Modifications"]
                mods = []
                # In contrast to the original ArcPP analysis, modifications
                # are taken into account, except for optional modifications
                # that depend on the sample preparation.
                # In the following, commented out sections indicate the use
                # of "seq" that has now been changed to "seq_mod".
                for m in line_dict["Modifications"].split(";"):
                    if "iTRAQ4plex" in m or "Label:" in m or "Oxidation" in m:
                        continue
                    mods.append(m)
                charge = line_dict["Charge"]
                # seq_mod = '{0}#{1}'.format(seq, mod)
                seq_mod = "{0}#{1}".format(seq, ";".join(mods))
                seq_length = len(seq)
                spec_title = line_dict["Spectrum Title"]
                sample = spec_title.split(".")[0]
                is_decoy = line_dict["Is decoy"]
                prot = line_dict["Protein ID"]
                start = line_dict["Sequence Start"]
                stop = line_dict["Sequence Stop"]
                pre = line_dict["Sequence Pre AA"]
                post = line_dict["Sequence Post AA"]
                psm_q_value = float(line_dict["combined PEP"])
                bayes_pep = float(line_dict["Bayes PEP"])
                if psm_q_value <= 0.01:
                    if seq_length not in fdr_dict["peptides_psm_level"].keys():
                        fdr_dict["peptides_psm_level"][seq_length] = {}
                    if seq not in fdr_dict["peptides_psm_level"][seq_length].keys():
                        fdr_dict["peptides_psm_level"][seq_length][seq] = (
                            psm_q_value,
                            is_decoy,
                        )
                    elif (
                        psm_q_value
                        < fdr_dict["peptides_psm_level"][seq_length][seq][0]
                    ):
                        fdr_dict["peptides_psm_level"][seq_length][seq] = (
                            psm_q_value,
                            is_decoy,
                        )
                    if (
                        "Hex" in mod
                        and seq_length
                        not in fdr_dict["glycopeptides_psm_level"].keys()
                    ):
                        fdr_dict["glycopeptides_psm_level"][seq_length] = {}
                    if (
                        "Hex" in mod
                        and seq_mod
                        not in fdr_dict["glycopeptides_psm_level"][seq_length].keys()
                    ):
                        fdr_dict["glycopeptides_psm_level"][seq_length][seq_mod] = (
                            psm_q_value,
                            is_decoy,
                        )
                    elif (
                        "Hex" in mod
                        and psm_q_value
                        < fdr_dict["glycopeptides_psm_level"][seq_length][seq_mod][0]
                    ):
                        fdr_dict["glycopeptides_psm_level"][seq_length][seq_mod] = (
                            psm_q_value,
                            is_decoy,
                        )
                else:
                    print(
                        "Results should be filtered by combined PEP <= 1% "
                        "(but should contain targets and decoys)"
                    )
                    sys.exit(1)
                # differentiate between protein groups and proteins
                # and remove contaminants
                if len(prot.split("<|>")) > 1:
                    contaminants = True
                    for p in prot.split("<|>"):
                        prot_id = p.split(" ")[0]
                        if "HVO" not in prot_id:
                            continue
                        else:
                            contaminants = False
                    # contaminants = False
                    if contaminants is False and is_decoy == "false":
                        results_dict[PRIDE_ID]["protein_groups"]["all"].add(
                            line_dict["Protein ID"]
                        )
                        results_dict[PRIDE_ID]["peptides"]["all"].add(seq_mod)
                        # results_dict[PRIDE_ID]['peptides']['all'].add(seq)
                        results_dict[PRIDE_ID]["spectra"]["all"].add(spec_title)
                else:
                    contaminants = False
                    prot_id = prot.split(" ")[0]
                    if "HVO" not in prot_id:
                        contaminants = True
                    if contaminants is False and is_decoy == "false":
                        results_dict[PRIDE_ID]["proteins"]["all"].add(
                            line_dict["Protein ID"]
                        )
                        results_dict[PRIDE_ID]["peptides"]["all"].add(seq_mod)
                        # results_dict[PRIDE_ID]['peptides']['all'].add(seq)
                        results_dict[PRIDE_ID]["spectra"]["all"].add(spec_title)
                # add info to protein_dict
                if prot not in results_dict[PRIDE_ID]["protein_dict"].keys():
                    results_dict[PRIDE_ID]["protein_dict"][prot] = {}
                # if seq not in results_dict[PRIDE_ID]['protein_dict'][prot].keys():
                #     results_dict[PRIDE_ID]['protein_dict'][prot][seq] = {
                if seq_mod not in results_dict[PRIDE_ID]["protein_dict"][prot].keys():
                    results_dict[PRIDE_ID]["protein_dict"][prot][seq_mod] = {
                        "spec_title": [],
                        "bayes_pep": [],
                        "modifications": [],
                        "charge": [],
                        "psm_q_value": [],
                        "start_stop": (start, stop, pre, post),
                    }
                results_dict[PRIDE_ID]["protein_dict"][prot][seq_mod][
                    "spec_title"
                ].append(spec_title)
                results_dict[PRIDE_ID]["protein_dict"][prot][seq_mod][
                    "bayes_pep"
                ].append(bayes_pep)
                results_dict[PRIDE_ID]["protein_dict"][prot][seq_mod][
                    "psm_q_value"
                ].append(psm_q_value)
                results_dict[PRIDE_ID]["protein_dict"][prot][seq_mod][
                    "modifications"
                ].append(mod)
                results_dict[PRIDE_ID]["protein_dict"][prot][seq_mod][
                    "charge"
                ].append(charge)
                # results_dict[PRIDE_ID]['protein_dict'][prot][seq]['spec_title'].append(spec_title)
                # results_dict[PRIDE_ID]['protein_dict'][prot][seq]['bayes_pep'].append(bayes_pep)
                # results_dict[PRIDE_ID]['protein_dict'][prot][seq]['psm_q_value'].append(psm_q_value)
                # results_dict[PRIDE_ID]['protein_dict'][prot][seq]['modifications'].append(mod)
                # results_dict[PRIDE_ID]['protein_dict'][prot][seq]['charge'].append(charge)
        # merge identifications from each dataset into "all"
        for level in ["protein_groups", "proteins", "peptides", "spectra"]:
            results_dict["all"][level]["all"] |= results_dict[PRIDE_ID][level]["all"]
        for prot in results_dict[PRIDE_ID]["protein_dict"].keys():
            if prot not in results_dict["all"]["protein_dict"].keys():
                results_dict["all"]["protein_dict"][prot] = {"datasets": set()}
            results_dict["all"]["protein_dict"][prot]["datasets"].add(PRIDE_ID)
            for seq in results_dict[PRIDE_ID]["protein_dict"][prot].keys():
                start_stop = results_dict[PRIDE_ID]["protein_dict"][prot][seq][
                    "start_stop"
                ]
                if seq not in results_dict["all"]["protein_dict"][prot].keys():
                    results_dict["all"]["protein_dict"][prot][seq] = {
                        "spec_title": [],
                        "bayes_pep": [],
                        "modifications": [],
                        "charge": [],
                        "psm_q_value": [],
                        "start_stop": start_stop,
                    }
                for k, v in results_dict[PRIDE_ID]["protein_dict"][prot][seq].items():
                    if k == "start_stop":
                        continue
                    results_dict["all"]["protein_dict"][prot][seq][k].extend(v)
    # Calculate q-values: peptides first, then proteins
    for PRIDE_ID in results_dict.keys():
        # generate input dict for q_value calculation function
        seq_q_value_dict = {}
        for prot in results_dict[PRIDE_ID]["protein_dict"].keys():
            for seq_mod in results_dict[PRIDE_ID]["protein_dict"][prot].keys():
                if seq_mod == "datasets":
                    continue
                seq_length = len(seq_mod.split("#")[0])
                min_bayes_pep = min(
                    results_dict[PRIDE_ID]["protein_dict"][prot][seq_mod]["bayes_pep"]
                )
                if seq_length not in seq_q_value_dict.keys():
                    seq_q_value_dict[seq_length] = {}
                if "decoy_" in prot:
                    is_decoy = True
                else:
                    is_decoy = False
                seq_q_value_dict[seq_length][seq_mod] = {
                    "Bayes PEP": min_bayes_pep,
                    "Is decoy": is_decoy,
                }
            # for seq in results_dict[PRIDE_ID]['protein_dict'][prot].keys():
            #     if seq == 'datasets':
            #         continue
            #     seq_length = len(seq)
            #     min_bayes_pep = min(
            #         results_dict[PRIDE_ID]['protein_dict'][prot][seq]['bayes_pep']
            #     )
            #     if seq_length not in seq_q_value_dict.keys():
            #         seq_q_value_dict[seq_length] = {}
            #     if 'decoy_' in prot:
            #         is_decoy = True
            #     else:
            #         is_decoy = False
            #     seq_q_value_dict[seq_length][seq] = {
            #         'Bayes PEP': min_bayes_pep,
            #         'Is decoy': is_decoy,
            #     }
        print("calculating q-values on peptide level")
        seq_calc_q_value_dict = calculate_q_value_by_group(
            seq_q_value_dict, sliding=False
        )
        # read results from peptide q_value calc; at the same time
        # generate input dict for proteins for the q_value calculation function
        prot_q_value_dict = {"seq_level": {}, "psm_level": {}}
        for prot in results_dict[PRIDE_ID]["protein_dict"].keys():
            contaminants = False
            prot_id = prot.split(" ")[0]
            if "HVO" not in prot_id:
                contaminants = True
            if "decoy_" in prot:
                is_decoy = True
            else:
                is_decoy = False
            for seq in results_dict[PRIDE_ID]["protein_dict"][prot].keys():
                if seq == "datasets":
                    continue
                # seq_length = len(seq)
                seq_length = len(seq.split("#")[0])
                seq_q_value = seq_calc_q_value_dict[seq_length][seq]["combined PEP"]
                results_dict[PRIDE_ID]["protein_dict"][prot][seq][
                    "seq_q_value"
                ] = seq_q_value
                if seq_q_value <= SEQ_Q_VALUE_THRESHOLD:
                    if PRIDE_ID == "all":
                        if "Hex" in seq:
                            if (
                                seq_length
                                not in fdr_dict["glycopeptides_seq_level"].keys()
                            ):
                                fdr_dict["glycopeptides_seq_level"][seq_length] = {}
                            fdr_dict["glycopeptides_seq_level"][seq_length][seq] = (
                                seq_q_value,
                                is_decoy,
                            )
                        else:
                            if seq_length not in fdr_dict["peptides_seq_level"].keys():
                                fdr_dict["peptides_seq_level"][seq_length] = {}
                            fdr_dict["peptides_seq_level"][seq_length][seq] = (
                                seq_q_value,
                                is_decoy,
                            )
                    counts = len(
                        set(
                            results_dict[PRIDE_ID]["protein_dict"][prot][seq][
                                "spec_title"
                            ]
                        )
                    )
                    if is_decoy is False and contaminants is False:
                        results_dict[PRIDE_ID]["peptides"]["safe"].add(seq)
                        if counts >= num_specs:
                            results_dict[PRIDE_ID]["peptides"]["safe_num_specs"].add(
                                seq
                            )
                            if PRIDE_ID == "all":
                                if "Hex" in seq:
                                    if (
                                        seq_length
                                        not in fdr_dict[
                                            "glycopeptides_seq_level_2specs"
                                        ].keys()
                                    ):
                                        fdr_dict["glycopeptides_seq_level_2specs"][
                                            seq_length
                                        ] = {}
                                    fdr_dict["glycopeptides_seq_level_2specs"][
                                        seq_length
                                    ][seq] = (seq_q_value, is_decoy)
                                else:
                                    if (
                                        seq_length
                                        not in fdr_dict[
                                            "peptides_seq_level_2specs"
                                        ].keys()
                                    ):
                                        fdr_dict["peptides_seq_level_2specs"][
                                            seq_length
                                        ] = {}
                                    fdr_dict["peptides_seq_level_2specs"][seq_length][
                                        seq
                                    ] = (seq_q_value, is_decoy)
                    min_bayes_pep = min(
                        results_dict[PRIDE_ID]["protein_dict"][prot][seq]["bayes_pep"]
                    )
                    if min_bayes_pep == 0.0:
                        min_bayes_pep = np.nextafter(0, 1)
                    log_seq_bayes = math.log10(min_bayes_pep)
                    if prot not in prot_q_value_dict["seq_level"].keys():
                        prot_q_value_dict["seq_level"][prot] = {
                            "Bayes PEP": log_seq_bayes,
                            "Is decoy": is_decoy,
                        }
                    else:
                        prot_q_value_dict["seq_level"][prot][
                            "Bayes PEP"
                        ] += log_seq_bayes
                    for bayes_pep in results_dict[PRIDE_ID]["protein_dict"][prot][seq][
                        "bayes_pep"
                    ]:
                        if bayes_pep == 0.0:
                            bayes_pep = np.nextafter(0, 1)
                        log_psm_bayes = math.log10(bayes_pep)
                        if prot not in prot_q_value_dict["psm_level"].keys():
                            prot_q_value_dict["psm_level"][prot] = {
                                "Bayes PEP": log_psm_bayes,
                                "Is decoy": is_decoy,
                            }
                        else:
                            prot_q_value_dict["psm_level"][prot][
                                "Bayes PEP"
                            ] += log_psm_bayes
        print("calculating q-values on protein level")
        prot_calc_q_value_dict = calculate_q_value_by_group(
            prot_q_value_dict, sliding=False, picked_fdr=True
        )
        # read results from protein q_value calc
        for prot in results_dict[PRIDE_ID]["protein_dict"].keys():
            contaminants = False
            prot_id = prot.split(" ")[0]
            if "HVO" not in prot_id:
                contaminants = True
            if "decoy_" in prot:
                is_decoy = True
            else:
                is_decoy = False
            for level in ["psm_level", "seq_level"]:
                if prot in prot_calc_q_value_dict[level].keys():
                    prot_q_value = prot_calc_q_value_dict[level][prot]["combined PEP"]
                    prot_bayes_pep = prot_calc_q_value_dict[level][prot]["Bayes PEP"]
                else:
                    prot_q_value = 1
                    prot_bayes_pep = 1
                # count number of spectra for each prot (for seq FDR > 1%)
                # collect samples for simple protein inference model
                counts = 0
                samples = set()
                for seq in results_dict[PRIDE_ID]["protein_dict"][prot].keys():
                    if seq in [
                        "datasets",
                        "prot_q_value_seq",
                        "prot_q_value_psm",
                        "samples",
                    ]:
                        continue
                    if (
                        results_dict[PRIDE_ID]["protein_dict"][prot][seq]["seq_q_value"]
                        > 0.01
                    ):
                        continue
                    psm_set = set(
                        results_dict[PRIDE_ID]["protein_dict"][prot][seq]["spec_title"]
                    )
                    counts += len(psm_set)
                    for psm in psm_set:
                        ms_filename = ".".join(psm.split(".")[:-3])
                        samples.add(ms_filename2sample.get(ms_filename, ms_filename))
                if PRIDE_ID == "all":
                    if level == "seq_level":
                        fdr_dict["proteins_seq_level"][prot] = (
                            prot_bayes_pep,
                            is_decoy,
                        )
                        if counts >= num_specs:
                            fdr_dict["proteins_seq_level_2specs"][prot] = (
                                prot_bayes_pep,
                                is_decoy,
                            )
                    else:
                        fdr_dict["proteins_psm_level"][prot] = (
                            prot_bayes_pep,
                            is_decoy,
                        )
                if prot_q_value <= 0.01 and is_decoy is False and contaminants is False:
                    if level == "seq_level":
                        if len(prot.split("<|>")) > 1:
                            results_dict[PRIDE_ID]["protein_groups"]["safe_seq"].add(
                                prot
                            )
                            if counts >= num_specs:
                                results_dict[PRIDE_ID]["protein_groups"][
                                    "safe_seq_num_spec"
                                ].add(prot)
                                if prot_q_value <= PROT_Q_VALUE_THRESHOLD:
                                    results_dict[PRIDE_ID]["protein_groups"][
                                        "safe_seq_num_spec_0005"
                                    ].add(prot)
                        else:
                            results_dict[PRIDE_ID]["proteins"]["safe_seq"].add(prot)
                            if counts >= num_specs:
                                results_dict[PRIDE_ID]["proteins"][
                                    "safe_seq_num_spec"
                                ].add(prot)
                                if prot_q_value <= PROT_Q_VALUE_THRESHOLD:
                                    results_dict[PRIDE_ID]["proteins"][
                                        "safe_seq_num_spec_0005"
                                    ].add(prot)
                    elif counts >= num_specs:
                        if len(prot.split("<|>")) > 1:
                            results_dict[PRIDE_ID]["protein_groups"]["safe_psm"].add(
                                prot
                            )
                        else:
                            results_dict[PRIDE_ID]["proteins"]["safe_psm"].add(prot)
                if level == "seq_level":
                    results_dict[PRIDE_ID]["protein_dict"][prot][
                        "prot_q_value_seq"
                    ] = prot_q_value
                else:
                    results_dict[PRIDE_ID]["protein_dict"][prot][
                        "prot_q_value_psm"
                    ] = prot_q_value
                results_dict[PRIDE_ID]["protein_dict"][prot]["samples"] = samples
        print(
            "Number of confident protein identifications for {0}: {1}".format(
                PRIDE_ID,
                len(results_dict[PRIDE_ID]["proteins"]["safe_seq_num_spec_0005"]),
            )
        )
    # save results in a pkl
    pickle.dump(results_dict, open(pkl_name, "wb"))
    print("pickled results: ", pkl_name)
    pickle.dump(fdr_dict, open(fdr_pkl_name, "wb"))
    print("pickled fdr_dict: ", fdr_pkl_name)
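# Shape of the data exchanged with calculate_q_value_by_group() above,
# inferred from how this script builds and reads the dicts (the function
# itself is defined elsewhere): each input entry carries a 'Bayes PEP' and
# an 'Is decoy' flag, grouped by peptide length (or by protein for the
# picked-FDR variant), and the returned dict adds a 'combined PEP' per
# entry, e.g. (placeholder values)
# seq_q_value_dict[9]['ELVISLIVK#'] == {'Bayes PEP': 1e-5, 'Is decoy': False}
# seq_calc_q_value_dict[9]['ELVISLIVK#']['combined PEP']  # -> q-value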
def main(fasta_database, class_version):
    '''
    Example script to demonstrate speed and memory efficiency of the new
    upeptide_mapper. Specify fasta_database and class_version as input.

    usage:
        ./complete_proteome_match.py <fasta_database> <class_version>

    Class versions
        * UPeptideMapper_v2
        * UPeptideMapper_v3
        * UPeptideMapper_v4
    '''
    input_params = {
        'database': sys.argv[1],
    }
    uc = ursgal.UController(params=input_params)
    print('Parsing fasta and digesting sequences')
    peptides = set()
    digest_start = time.time()
    for fastaID, sequence in ursgal.ucore.parse_fasta(
            open(input_params['database'], 'r')):
        tryptic_peptides = ursgal.ucore.digest(
            sequence,
            ('KR', 'C'),
            # no_missed_cleavages=True,
        )
        for p in tryptic_peptides:
            if 6 <= len(p) <= 40:
                peptides.add(p)
    print('Parsing fasta and digesting sequences took {0:1.2f} seconds'.format(
        time.time() - digest_start))
    if sys.platform == 'win32':
        print(
            '[ WARNING ] pyahocorasick can not be installed via pip on Windows at the moment\n'
            '[ WARNING ] Falling back to UPeptideMapper_v2'
        )
        class_version = 'UPeptideMapper_v2'
    upapa_class = uc.unodes['upeptide_mapper_1_0_0'][
        'class'].import_engine_as_python_function(class_version)
    print('Buffering fasta and mapping {0} peptides'.format(len(peptides)))
    map_start = time.time()
    if class_version == 'UPeptideMapper_v2':
        peptide_mapper = upapa_class(word_len=6)
        fasta_lookup_name = peptide_mapper.build_lookup_from_file(
            input_params['database'],
            force=False,
        )
        args = [list(peptides), fasta_lookup_name]
    elif class_version == 'UPeptideMapper_v3':
        peptide_mapper = upapa_class(input_params['database'])
        fasta_lookup_name = peptide_mapper.fasta_name
        args = [list(peptides), fasta_lookup_name]
    elif class_version == 'UPeptideMapper_v4':
        peptide_mapper = upapa_class(input_params['database'])
        args = [list(peptides)]
    p2p_mappings = peptide_mapper.map_peptides(*args)
    print('Buffering fasta and mapping {0} peptides took {1:1.2f} seconds'.format(
        len(peptides), time.time() - map_start))
    if len(p2p_mappings.keys()) == len(peptides):
        print('All peptides have been mapped!')
    else:
        print('WARNING: Not all peptides have been mapped')
def main(folder):
    '''
    Executes a search with different versions of MS-GF+ on an example file
    from the data from Bruderer et al.

    usage:
        ./msgf_plus_version_comparison_qexactive.py <folder containing B_D140314_SGSDSsample1_R01_MSG_T0.mzML.gz>

    Creates a Venn diagram with the peptides obtained by the different
    versions.
    '''
    required_example_file = 'B_D140314_SGSDSsample1_R01_MSG_T0.mzML.gz'
    full_path = os.path.join(folder, required_example_file)
    if os.path.exists(full_path) is False:
        print('''
            Your specified folder does not contain the required example file:
            {0}
            The RAW data from peptideatlas.org (PASS00589, password: WF6554orn)
            will be downloaded.
            Please convert to mzML after the download has finished and run this
            script again.
            '''.format(required_example_file))
        ftp_get_params = {
            'ftp_url': 'ftp.peptideatlas.org',
            'ftp_login': '******',
            'ftp_password': '******',
            'ftp_include_ext': [required_example_file.replace('.mzML', '.raw')],
            'ftp_output_folder': folder,
        }
        uc = ursgal.UController(params=ftp_get_params)
        uc.fetch_file(engine='get_ftp_files_1_0_0')
        sys.exit(1)
    engine_list = [
        'msgfplus_v9979',
        'msgfplus_v2016_09_16',
    ]
    params = {
        'database': os.path.join(
            os.pardir, 'example_data', 'hs_201303_qs_sip_target_decoy.fasta'),
        'modifications': ['C,fix,any,Carbamidomethyl'],
        'csv_filter_rules': [
            ['PEP', 'lte', 0.01],
            ['Is decoy', 'equals', 'false'],
        ],
        'http_url': 'http://www.uni-muenster.de/Biologie.IBBP.AGFufezan/misc/hs_201303_qs_sip_target_decoy.fasta',
        'http_output_folder': os.path.join(os.pardir, 'example_data'),
        'machine_offset_in_ppm': -5e-6,
        'remove_temporary_files': False,
    }
    uc = ursgal.UController(profile='QExactive+', params=params)
    if os.path.exists(params['database']) is False:
        uc.fetch_file(engine='get_http_files_1_0_0')
    mzML_file = os.path.join(folder, required_example_file)
    filtered_files_list = []
    for engine in engine_list:
        unified_result_file = uc.search(
            input_file=mzML_file,
            engine=engine,
            force=False,
        )
        validated_file = uc.validate(
            input_file=unified_result_file,
            engine='percolator_2_08',
        )
        filtered_file = uc.execute_misc_engine(
            input_file=validated_file,
            engine='filter_csv_1_0_0',
        )
        filtered_files_list.append(filtered_file)
    uc.visualize(
        input_files=filtered_files_list,
        engine='venndiagram',
    )
    return
def main():
    """
    Example script to compare UPeptideMapper v3 vs v4 results.

    usage:
        ./validate_upeptide_mapper_v3_vs_v4.py
    """
    input_params = {
        "database": os.path.join(
            os.pardir,
            "example_data",
            "Creinhardtii_281_v5_5_CP_MT_with_contaminants_target_decoy.fasta",
        ),
        "http_url": "https://www.sas.upenn.edu/~sschulze/Creinhardtii_281_v5_5_CP_MT_with_contaminants_target_decoy.fasta",
        "http_output_folder": os.path.join(
            os.pardir,
            "example_data",
        ),
    }
    uc = ursgal.UController(params=input_params)
    if os.path.exists(input_params["database"]) is False:
        uc.fetch_file(engine="get_http_files_1_0_0")
    print("Parsing fasta and digesting sequences")
    peptides = set()
    max_number_peptides = 1000000000
    digest_start = time.time()
    for fastaID, sequence in ursgal.ucore.parse_fasta(
            open(input_params["database"], "r")):
        tryptic_peptides = ursgal.ucore.digest(
            sequence, ("KR", "C"), no_missed_cleavages=True)
        for p in tryptic_peptides:
            if 6 <= len(p) <= 40:
                if len(peptides) > max_number_peptides:
                    break
                peptides.add(p)
    print("Parsing fasta and digesting sequences took {0:1.2f} seconds".format(
        time.time() - digest_start))
    # print(peptides)
    upapa_class = uc.unodes["upeptide_mapper_1_0_0"][
        "class"].import_engine_as_python_function("UPeptideMapper_v3")
    print("Buffering fasta and mapping {0} peptides with v3".format(len(peptides)))
    peptide_mapper = upapa_class(input_params["database"])
    fasta_lookup_name = peptide_mapper.fasta_name
    args = [list(peptides), fasta_lookup_name]
    start_time = time.time()
    v3_p2p_mappings = peptide_mapper.map_peptides(*args)
    print("UPeptideMapper v3 mapper took {0}s".format(time.time() - start_time))
    print("Done")
    v3_p2p_mappings = copy.deepcopy(v3_p2p_mappings)
    upapa_class = uc.unodes["upeptide_mapper_1_0_0"][
        "class"].import_engine_as_python_function("UPeptideMapper_v4")
    print("Buffering fasta and mapping {0} peptides with v4".format(len(peptides)))
    peptide_mapper = upapa_class(input_params["database"])
    args = [
        list(peptides),
    ]
    start_time = time.time()
    v4_p2p_mappings = peptide_mapper.map_peptides(*args)
    print("UPeptideMapper v4 mapper took {0}s".format(time.time() - start_time))
    print("Done")
    assert len(v3_p2p_mappings.keys()) == len(v4_p2p_mappings.keys())
    assert list(sorted(v3_p2p_mappings.keys())) == list(
        sorted(v4_p2p_mappings.keys()))
    compare_keys = [
        "start",
        "end",
        "pre",
        "post",
        "id",
    ]
    num_peps = len(v3_p2p_mappings.keys())
    for ppos, peptide in enumerate(list(sorted(v3_p2p_mappings.keys()))):
        v3_maps = sorted(
            [(d["id"], d["start"], d) for d in v3_p2p_mappings[peptide]])
        v4_maps = sorted(
            [(d["id"], d["start"], d) for d in v4_p2p_mappings[peptide]])
        print("Comparing peptide #{0}/{1}".format(ppos, num_peps), end="\r")
        assert len(v3_maps) == len(v4_maps)
        for pos, (v3_id, v3_start, v3_map_dict) in enumerate(v3_maps):
            v4_id, v4_start, v4_map_dict = v4_maps[pos]
            for key in compare_keys:
                assert v3_map_dict[key] == v4_map_dict[key]
def main(folder):
    '''
    usage:
        ./human_br_complete_workflow.py <folder_with_human_br_files>

    This script produces the data for figure 3.
    '''
    # Initialize the UController:
    uc = ursgal.UController(
        params={
            'enzyme': 'trypsin',
            'decoy_generation_mode': 'reverse_protein',
        }
    )
    # MS spectra, downloaded from http://proteomecentral.proteomexchange.org
    # via the dataset accession PXD000263 and converted to mzML
    mass_spec_files = [
        '120813OTc1_NQL-AU-0314-LFQ-LCM-SG-01_013.mzML',
        '120813OTc1_NQL-AU-0314-LFQ-LCM-SG-02_025.mzML',
        '120813OTc1_NQL-AU-0314-LFQ-LCM-SG-03_033.mzML',
        '120813OTc1_NQL-AU-0314-LFQ-LCM-SG-04_048.mzML',
    ]
    for mass_spec_file in mass_spec_files:
        if os.path.exists(os.path.join(folder, mass_spec_file)) is False:
            print('Please download RAW files to folder {0} and convert to mzML:'.format(
                folder))
            pprint.pprint(mass_spec_files)
            sys.exit(1)
    # mods from Wen et al. (2015):
    modifications = [
        'C,fix,any,Carbamidomethyl',  # Carbamidomethyl (C) was set as fixed modification
        'M,opt,any,Oxidation',        # Oxidation (M) as well as
        'N,opt,any,Deamidated',       # Deamidated (NQ) were set as optional modification
        'Q,opt,any,Deamidated',       # Deamidated (NQ) were set as optional modification
    ]
    # The target peptide database which will be searched (UniProt Human
    # reference proteome from July 2013)
    target_database = 'uniprot_human_UP000005640_created_until_20130707.fasta'
    # Let's turn it into a target decoy database by reversing proteins:
    target_decoy_database = uc.execute_misc_engine(
        input_file=target_database,
        engine='generate_target_decoy_1_0_0'
    )
    # OMSSA parameters from Wen et al. (2015):
    omssa_params = {
        # (used by default)                      # -w
        'he': '1000',                            # -he 1000
        'zcc': '1',                              # -zcc 1
        'frag_mass_tolerance': '0.6',            # -to 0.6
        'frag_mass_tolerance_unit': 'da',        # -to 0.6
        'precursor_mass_tolerance_minus': '10',  # -te 10
        'precursor_mass_tolerance_plus': '10',   # -te 10
        'precursor_mass_tolerance_unit': 'ppm',  # -teppm
        'score_a_ions': False,                   # -i 1,4
        'score_b_ions': True,                    # -i 1,4
        'score_c_ions': False,                   # -i 1,4
        'score_x_ions': False,                   # -i 1,4
        'score_y_ions': True,                    # -i 1,4
        'score_z_ions': False,                   # -i 1,4
        'enzyme': 'trypsin_p',                   # -e 10
        'maximum_missed_cleavages': '1',         # -v 1
        'precursor_max_charge': '8',             # -zh 8
        'precursor_min_charge': '1',             # -zl 1
        'tez': '1',                              # -tez 1
        'precursor_isotope_range': '0,1',        # -ti 1
        'num_match_spec': '1',                   # -hc 1
        'database': target_decoy_database,
        'modifications': modifications,
    }
    # MS-GF+ parameters from Wen et al. (2015):
    msgf_params = {
        # precursor ion mass tolerance was set to 10 ppm
        'precursor_mass_tolerance_unit': 'ppm',
        'precursor_mass_tolerance_minus': '10',
        'precursor_mass_tolerance_plus': '10',
        # the max number of optional modifications per peptide were set as 3
        # (used by default)
        # number of allowed isotope errors was set as 1
        'enzyme': 'trypsin',  # the enzyme was set as trypsin
        # (used by default)
        # fully enzymatic peptides were specified, i.e. no non-enzymatic termini
        'frag_method': '1',   # the fragmentation method selected in the search was CID
        'max_pep_length': '45',  # the maximum peptide length to consider was set as 45
        # the minimum precursor charge to consider if charges are not
        # specified in the spectrum file was set as 1
        'precursor_min_charge': '1',
        # the maximum precursor charge to consider was set as 8
        'precursor_max_charge': '8',
        # (used by default)
        # the parameter 'addFeatures' was set as 1 (required for Percolator)
        # all of the other parameters were set as default
        # the instrument selected was High-res
        'database': target_decoy_database,
        'modifications': modifications,
    }
    # X!Tandem parameters from Wen et al. (2015):
    xtandem_params = {
        # precursor ion mass tolerance was set to 10 ppm
        'precursor_mass_tolerance_unit': 'ppm',
        'precursor_mass_tolerance_minus': '10',
        'precursor_mass_tolerance_plus': '10',
        # the fragment ion mass tolerance was set to 0.6 Da
        'frag_mass_tolerance': '0.6',
        'frag_mass_tolerance_unit': 'da',
        # parent monoisotopic mass isotope error was set as 'yes'
        'precursor_isotope_range': '0,1',
        'precursor_max_charge': '8',  # maximum parent charge of spectrum was set as 8
        'enzyme': 'trypsin',          # the enzyme was set as trypsin ([RK]|[X])
        # the maximum missed cleavage sites were set as 1
        'maximum_missed_cleavages': '1',
        # (used by default)
        # no model refinement was employed.
        'database': target_decoy_database,
        'modifications': modifications,
    }
    search_engine_settings = [
        # not used in Wen et al., so we use the same settings as X!Tandem
        ('msamanda_1_0_0_5243', xtandem_params, 'LTQ XL high res'),
        # not used in Wen et al., so we use the same settings as X!Tandem
        ('myrimatch_2_1_138', xtandem_params, 'LTQ XL high res'),
        # the instrument selected was High-res
        ('msgfplus_v9979', msgf_params, 'LTQ XL high res'),
        ('xtandem_jackhammer', xtandem_params, None),
        ('omssa_2_1_9', omssa_params, None),
    ]
    merged_validated_files_3_engines = []
    merged_validated_files_5_engines = []
    for engine, wen_params, instrument in search_engine_settings:
        # Initializing the uPLANIT UController class with
        # our specified modifications and mass spectrometer
        uc = ursgal.UController(params=wen_params)
        if instrument is not None:
            uc.set_profile(instrument)
        unified_results = []
        percolator_validated_results = []
        for mzML_file in mass_spec_files:
            unified_search_results = uc.search(
                input_file=mzML_file,
                engine=engine,
            )
            unified_results.append(unified_search_results)
            validated_csv = uc.validate(
                input_file=unified_search_results,
                engine='percolator_2_08',
            )
            percolator_validated_results.append(validated_csv)
        merged_validated_csv = uc.execute_misc_engine(
            input_file=percolator_validated_results,
            engine='merge_csvs_1_0_0'
        )
        merged_unvalidated_csv = uc.execute_misc_engine(
            input_file=unified_results,
            engine='merge_csvs_1_0_0',
        )
        if engine in ['omssa_2_1_9', 'xtandem_jackhammer', 'msgfplus_v9979']:
            merged_validated_files_3_engines.append(merged_validated_csv)
        merged_validated_files_5_engines.append(merged_validated_csv)
    uc.params['prefix'] = '5-engines-summary'
    uc.combine_search_results(
        input_files=merged_validated_files_5_engines,
        engine='combine_FDR_0_1',
    )
    uc.params['prefix'] = '3-engines-summary'
    uc.combine_search_results(
        input_files=merged_validated_files_3_engines,
        engine='combine_FDR_0_1',
    )
def main(folder):
    """
    Executes a search with 5 versions of X!Tandem on an example file from
    the data from Bruderer et al. 2015.

    usage:
        ./xtandem_version_comparison.py <folder containing B_D140314_SGSDSsample1_R01_MSG_T0.mzML>

    This is a simple example file to show the straightforward comparison of
    different program versions of X!Tandem, similar to the example script
    'xtandem_version_comparison', but analyzing high resolution data, which
    is better handled by versions newer than Jackhammer. One gains
    approximately 10 percent more peptides with newer versions of X!Tandem.

    Creates a Venn diagram with the peptides obtained by the different
    versions.
    """
    required_example_file = "B_D140314_SGSDSsample1_R01_MSG_T0.mzML"
    if os.path.exists(os.path.join(folder, required_example_file)) is False:
        print("""
            Your specified folder does not contain the required example file:
            {0}
            The RAW data from peptideatlas.org (PASS00589, password: WF6554orn)
            will be downloaded.
            Please convert to mzML after the download has finished and run this
            script again.
            """.format(required_example_file))
        ftp_get_params = {
            "ftp_url": "ftp.peptideatlas.org",
            "ftp_login": "******",
            "ftp_password": "******",
            "ftp_include_ext": [required_example_file.replace(".mzML", ".raw")],
            "ftp_output_folder": folder,
        }
        uc = ursgal.UController(params=ftp_get_params)
        uc.fetch_file(engine="get_ftp_files_1_0_0")
        sys.exit(1)
    engine_list = [
        "xtandem_cyclone",
        "xtandem_jackhammer",
        "xtandem_sledgehammer",
        "xtandem_piledriver",
        "xtandem_vengeance",
    ]
    params = {
        "database": os.path.join(
            os.pardir, "example_data", "hs_201303_qs_sip_target_decoy.fasta"),
        "modifications": ["C,fix,any,Carbamidomethyl"],
        "csv_filter_rules": [
            ["PEP", "lte", 0.01],
            ["Is decoy", "equals", "false"],
        ],
        "http_url": "http://www.uni-muenster.de/Biologie.IBBP.AGFufezan/misc/hs_201303_qs_sip_target_decoy.fasta",
        "http_output_folder": os.path.join(os.pardir, "example_data"),
        "machine_offset_in_ppm": -5e-6,
    }
    uc = ursgal.UController(profile="QExactive+", params=params)
    if os.path.exists(params["database"]) is False:
        uc.fetch_file(engine="get_http_files_1_0_0")
    mzML_file = os.path.join(folder, required_example_file)
    filtered_files_list = []
    for engine in engine_list:
        unified_result_file = uc.search(
            input_file=mzML_file,
            engine=engine,
            force=False,
        )
        validated_file = uc.validate(
            input_file=unified_result_file,
            engine="percolator_2_08",
        )
        filtered_file = uc.execute_misc_engine(
            input_file=validated_file,
            engine="filter_csv_1_0_0",
        )
        filtered_files_list.append(filtered_file)
    uc.visualize(
        input_files=filtered_files_list,
        engine="venndiagram_1_1_0",
    )
    return
def search(validation_engine):
    '''
    Executes a grouped search on four example files from the data from
    Barth et al.

    usage:
        ./grouped_search_example.py

    Searches for peptides including the following potential modifications:
    oxidation of M, deamidation of N/Q, methylation of E/K/R, N-terminal
    acetylation, phosphorylation of S/T.

    After the search, each type of modification is validated separately.
    '''
    # Initializing the ursgal UController class with
    # our specified modifications and mass spectrometer
    uc = ursgal.UController(
        profile=mass_spectrometer,  # 'LTQ XL low res' profile!
        params=params
    )
    # complete workflow:
    # every spectrum file is searched with every search engine,
    # results are separated into groups and validated separately,
    # validated results are merged and filtered for targets and PEP <= 0.01.
    # In the end, all filtered results from all spectrum files are merged
    # for validation_engine in validation_engines:
    result_files = []
    for n, spec_file in enumerate(spec_files):
        validated_results = []
        for search_engine in search_engines:
            unified_search_results = uc.search(
                input_file=spec_file,
                engine=search_engine,
            )
            # Calculate PEP for every group separately,
            # therefore the csv needs to be split first
            group_list = sorted(groups.keys())
            for p, group in enumerate(group_list):
                if group == '0':
                    uc.params['csv_filter_rules'] = [
                        ['Modifications', 'contains_not', '{0}'.format(groups['1'])],
                        ['Modifications', 'contains_not', '{0}'.format(groups['2'])],
                        ['Modifications', 'contains_not', '{0}'.format(groups['3'])],
                        ['Modifications', 'contains_not', '{0}'.format(groups['4'])],
                        ['Modifications', 'contains_not', '{0}'.format(groups['5'])],
                    ]
                else:
                    uc.params['csv_filter_rules'] = [
                        ['Modifications', 'contains', '{0}'.format(groups[group])]
                    ]
                    for other_group in group_list:
                        if other_group == '0' or other_group == group:
                            continue
                        uc.params['csv_filter_rules'].append(
                            ['Modifications', 'contains_not', '{0}'.format(
                                groups[other_group])]
                        )
                uc.params['prefix'] = 'grouped-{0}'.format(group)
                filtered_results = uc.filter_csv(
                    input_file=unified_search_results,
                )
                uc.params['prefix'] = ''
                validated_search_results = uc.validate(
                    input_file=filtered_results,
                    engine=validation_engine,
                )
                validated_results.append(validated_search_results)
        uc.params['prefix'] = 'file{0}'.format(n)
        validated_results_from_all_engines = uc.merge_csvs(
            input_files=sorted(validated_results),
        )
        uc.params['prefix'] = ''
        uc.params['csv_filter_rules'] = [
            ['Is decoy', 'equals', 'false'],
            ['PEP', 'lte', 0.01],
        ]
        filtered_validated_results = uc.filter_csv(
            input_file=validated_results_from_all_engines)
        result_files.append(filtered_validated_results)
    results_all_files = uc.merge_csvs(
        input_files=sorted(result_files),
    )
    return results_all_files
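# Illustration of the rule lists built above, assuming placeholder group
# definitions such as groups == {'0': '', '1': 'Oxidation',
# '2': 'Deamidated', ...}: for group '1' the loop produces
# [
#     ['Modifications', 'contains', 'Oxidation'],
#     ['Modifications', 'contains_not', 'Deamidated'],
#     ...
# ]
# so every group keeps only PSMs that carry exactly its own type of
# modification, while group '0' keeps the unmodified PSMs.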
def main(input_file=None, arcpp_pep_file=None):
    """
    Post-process glycopeptide identifications in order to ensure that
    glycopeptide identifications are substantiated by at least two PSMs
    and two replicates. Furthermore, split results into N-glycans,
    non-canonical N-glycans and O-glycans.

    Usage:
        python <script_name.py> <path_to_glycopeptide_PSMs> <path_to_ArcPP_peptides_file>
    """
    params = {
        "psm_defining_colnames": [
            "Spectrum Title",
            "Sequence",
            "Modifications",
            "Charge",
            "Is decoy",
        ],
    }
    uc = ursgal.UController(params=params)
    arcpp_glycopeps = {}
    with open(arcpp_pep_file, "r") as arcpp_in:
        arcpp_csv = csv.DictReader(arcpp_in)
        for line_dict in arcpp_csv:
            glycopep, glycan_type = line_to_pep_unimod_glyc(line_dict)
            if glycopep not in arcpp_glycopeps.keys():
                arcpp_glycopeps[glycopep] = []
            arcpp_glycopeps[glycopep].append(line_dict)
    replicate_lookup = {}
    for pride in PRIDE_ids:
        replicate_lookup[pride] = {}
        file_description = os.path.join(
            "file_descriptions", "{0}_file_descriptions.csv".format(pride)
        )
        with open(file_description, "r") as descr_in:
            descr_csv = csv.DictReader(descr_in)
            for line_dict in descr_csv:
                strain = line_dict["Strain"]
                file_name = line_dict["Raw file name"].split(".")[0]
                replicate = line_dict["Replicate"]
                replicate_lookup[pride][file_name] = {
                    "strain": strain,
                    "rep": replicate,
                }
    true_n_glycopeps = ddict(dict)
    non_standard_n_glycopeps = ddict(dict)
    o_glycopeps = ddict(dict)
    all_strain = set()
    with open(input_file, "r") as glyco_in:
        glyco_csv = csv.DictReader(glyco_in)
        fieldnames = glyco_csv.fieldnames
        for line_dict in glyco_csv:
            protein = line_dict["Protein ID"]
            if protein.startswith("sp|"):
                continue
            # peptide = line_dict['Sequence']
            spec_id = line_dict["Spectrum Title"]
            file_name = line_dict["Spectrum Title"].split(".")[0]
            dataset = line_dict["Dataset"]
            strain = replicate_lookup[dataset][file_name]["strain"]
            rep = replicate_lookup[dataset][file_name]["rep"]
            glycopep, glycan_type = line_to_pep_unimod_glyc(line_dict)
            if strain not in true_n_glycopeps.keys():
                true_n_glycopeps[strain] = {}
                all_strain.add(strain)
            if "n_glycan" in glycan_type:
                if glycopep not in true_n_glycopeps[strain].keys():
                    true_n_glycopeps[strain][glycopep] = {
                        "frag_ions": set(),
                        "specs": set(),
                        "reps": set(),
                        "line_dicts": [],
                    }
                true_n_glycopeps[strain][glycopep]["specs"].add(spec_id)
                true_n_glycopeps[strain][glycopep]["reps"].add(
                    "#".join([dataset, rep]))
                true_n_glycopeps[strain][glycopep]["line_dicts"].append(line_dict)
                true_n_glycopeps[strain][glycopep]["frag_ions"].add(
                    line_dict["MS2 Glycopep Frag Ions Present"]
                )
            elif "o_glycan" in glycan_type and len(glycan_type) == 1:
                if strain not in o_glycopeps.keys():
                    o_glycopeps[strain] = {}
                    all_strain.add(strain)
                if glycopep not in o_glycopeps[strain].keys():
                    o_glycopeps[strain][glycopep] = {
                        "frag_ions": set(),
                        "specs": set(),
                        "reps": set(),
                        "line_dicts": [],
                    }
                o_glycopeps[strain][glycopep]["specs"].add(spec_id)
                o_glycopeps[strain][glycopep]["reps"].add("#".join([dataset, rep]))
                o_glycopeps[strain][glycopep]["line_dicts"].append(line_dict)
                o_glycopeps[strain][glycopep]["frag_ions"].add(
                    line_dict["MS2 Glycopep Frag Ions Present"]
                )
            elif "true_non_standard_n_glycan" in glycan_type:
                if strain not in non_standard_n_glycopeps.keys():
                    non_standard_n_glycopeps[strain] = {}
                    all_strain.add(strain)
                if glycopep not in non_standard_n_glycopeps[strain].keys():
                    non_standard_n_glycopeps[strain][glycopep] = {
                        "frag_ions": set(),
                        "specs": set(),
                        "reps": set(),
                        "line_dicts": [],
                    }
                non_standard_n_glycopeps[strain][glycopep]["specs"].add(spec_id)
                non_standard_n_glycopeps[strain][glycopep]["reps"].add(
                    "#".join([dataset, rep])
                )
                non_standard_n_glycopeps[strain][glycopep]["line_dicts"].append(
                    line_dict
                )
                non_standard_n_glycopeps[strain][glycopep]["frag_ions"].add(
                    line_dict["MS2 Glycopep Frag Ions Present"]
                )
            else:
                print(glycan_type)
    count_true_n_glycopeps = set()
    count_true_n_glycopeps_arcpp = set()
    count_o_glycopeps = set()
    count_o_glycopeps_arcpp = set()
    count_non_standard_glycopeps = set()
    count_non_standard_glycopeps_arcpp = set()
    output_line_dicts_n = []
    output_line_dicts_non_standard_n = []
    output_line_dicts_o = []
    for strain in all_strain:
        print(strain)
        for glycopep in true_n_glycopeps[strain].keys():
            if "True" not in true_n_glycopeps[strain][glycopep]["frag_ions"]:
                continue
            if len(true_n_glycopeps[strain][glycopep]["specs"]) < 2:
                continue
            if len(true_n_glycopeps[strain][glycopep]["reps"]) < 2:
                continue
            count_true_n_glycopeps.add(glycopep)
            if glycopep not in arcpp_glycopeps.keys():
                continue
            count_true_n_glycopeps_arcpp.add(glycopep)
            output_line_dicts_n.extend(
                true_n_glycopeps[strain][glycopep]["line_dicts"])
        for glycopep in o_glycopeps[strain].keys():
            if "True" not in o_glycopeps[strain][glycopep]["frag_ions"]:
                continue
            if len(o_glycopeps[strain][glycopep]["specs"]) < 2:
                continue
            if len(o_glycopeps[strain][glycopep]["reps"]) < 2:
                continue
            count_o_glycopeps.add(glycopep)
            if glycopep not in arcpp_glycopeps.keys():
                continue
            count_o_glycopeps_arcpp.add(glycopep)
            output_line_dicts_o.extend(o_glycopeps[strain][glycopep]["line_dicts"])
        for glycopep in non_standard_n_glycopeps[strain].keys():
            if "True" not in non_standard_n_glycopeps[strain][glycopep]["frag_ions"]:
                continue
            if len(non_standard_n_glycopeps[strain][glycopep]["specs"]) < 2:
                continue
            if len(non_standard_n_glycopeps[strain][glycopep]["reps"]) < 2:
                continue
            count_non_standard_glycopeps.add(glycopep)
            if glycopep not in arcpp_glycopeps.keys():
                continue
            count_non_standard_glycopeps_arcpp.add(glycopep)
            output_line_dicts_non_standard_n.extend(
                non_standard_n_glycopeps[strain][glycopep]["line_dicts"]
            )
    print(
        """
        True N-glyco: {0}
        True N-glyco ArcPP: {1}
        O-glyco: {2}
        O-glyco ArcPP: {3}
        Non-standard N-glyco: {4}
        Non-standard N-glyco ArcPP: {5}
        """.format(
            len(count_true_n_glycopeps),
            len(count_true_n_glycopeps_arcpp),
            len(count_o_glycopeps),
            len(count_o_glycopeps_arcpp),
            len(count_non_standard_glycopeps),
            len(count_non_standard_glycopeps_arcpp),
        )
    )
    csv_kwargs = {}
    if sys.platform == "win32":
        csv_kwargs["lineterminator"] = "\n"
    else:
        csv_kwargs["lineterminator"] = "\r\n"
    csv_out_name = "ArcPP_N_glyco_filtered_peptides_2rep.csv"
    with open(csv_out_name, "w") as csv_out:
        csv_writer = csv.DictWriter(csv_out, fieldnames=fieldnames, **csv_kwargs)
        csv_writer.writeheader()
        for out_dict in output_line_dicts_n:
            csv_writer.writerow(out_dict)
    csv_out_name = "ArcPP_only_O_glyco_filtered_peptides_2rep.csv"
    with open(csv_out_name, "w") as csv_out:
        csv_writer = csv.DictWriter(csv_out, fieldnames=fieldnames, **csv_kwargs)
        csv_writer.writeheader()
        for out_dict in output_line_dicts_o:
            csv_writer.writerow(out_dict)
    csv_out_name = "ArcPP_only_non_canonical_n_glyco_filtered_peptides_2rep.csv"
    with open(csv_out_name, "w") as csv_out:
        csv_writer = csv.DictWriter(csv_out, fieldnames=fieldnames, **csv_kwargs)
        csv_writer.writeheader()
        for out_dict in output_line_dicts_non_standard_n:
            csv_writer.writerow(out_dict)
def setUp(self):
    self.upapa_5 = umama.UPeptideMapper(word_len=5)
    self.upapa_5.build_lookup(fasta_name='Test.fasta', fasta_stream=TEST_FASTA)
    self.uc = ursgal.UController(verbose=False)
def main(): """ Executes a search with OMSSA, XTandem and MS-GF+ on the BSA1.mzML input_file usage: ./simple_example_search.py Note: Myrimatch does not work with this file. To use MSAmanda on unix platforms, please install mono (http://www.mono-project.com/download) """ uc = ursgal.UController( profile="LTQ XL low res", params={ "database": os.path.join(os.pardir, "example_data", "BSA.fasta"), "modifications": [ "M,opt,any,Oxidation", # Met oxidation "C,fix,any,Carbamidomethyl", # Carbamidomethylation "*,opt,Prot-N-term,Acetyl", # N-Acteylation ], # 'peptide_mapper_class_version' : 'UPeptideMapper_v2', }, ) if sys.maxsize > 2 ** 32: xtandem = "xtandem_vengeance" else: xtandem = "xtandem_sledgehammer" engine_list = [ "omssa", xtandem, "msgfplus_v2016_09_16", ] mzML_file = os.path.join( os.pardir, "example_data", "BSA_simple_example_search", "BSA1.mzML" ) if os.path.exists(mzML_file) is False: uc.params[ "http_url" ] = "http://sourceforge.net/p/open-ms/code/HEAD/tree/OpenMS/share/OpenMS/examples/BSA/BSA1.mzML?format=raw" uc.params["http_output_folder"] = os.path.dirname(mzML_file) uc.fetch_file( engine="get_http_files_1_0_0", ) try: shutil.move("{0}?format=raw".format(mzML_file), mzML_file) except: shutil.move("{0}format=raw".format(mzML_file), mzML_file) unified_file_list = [] for engine in engine_list: unified_search_result_file = uc.search( input_file=mzML_file, engine=engine, force=False ) unified_file_list.append(unified_search_result_file) uc.visualize( input_files=unified_file_list, engine="venndiagram_1_1_0", ) return
def main(folder=None, database=None, enzyme=None):
    """
    Example workflow to perform an open modification search with three
    independent search engines across all mzML files of a given folder
    and to statistically post-process and combine the results of all
    searches.

    Usage:
        ./open_modification_search_incl_combined_pep.py <mzML_folder> <database> <enzyme>
    """
    # For this particular dataset, two enzymes were used,
    # namely gluc and trypsin.
    mzml_files = []
    for mzml in glob.glob(os.path.join(folder, "*.mzML")):
        mzml_files.append(mzml)

    mass_spectrometer = "QExactive+"
    validation_engine = "percolator_3_4_0"
    search_engines = ["msfragger_2_3", "pipi_1_4_6", "moda_v1_61"]

    params = {
        "modifications": ["C,fix,any,Carbamidomethyl"],
        "csv_filter_rules": [
            ["Is decoy", "equals", "false"],
            ["PEP", "lte", 0.01],
        ],
        "frag_mass_tolerance_unit": "ppm",
        "frag_mass_tolerance": 20,
        "precursor_mass_tolerance_unit": "ppm",
        "precursor_mass_tolerance_plus": 5,
        "precursor_mass_tolerance_minus": 5,
        "moda_high_res": False,
        "max_mod_size": 4000,
        "min_mod_size": -200,
        "precursor_true_units": "ppm",
        "precursor_true_tolerance": 5,
        "percolator_post_processing": "mix-max",
        "psm_defining_colnames": [
            "Spectrum Title",
            "Sequence",
            "Modifications",
            "Charge",
            "Is decoy",
            "Mass Difference",
        ],
        "database": database,
        "enzyme": enzyme,
    }

    uc = ursgal.UController(
        profile=mass_spectrometer,
        params=params,
    )

    # This will hold the input to the combined PEP engine
    combined_pep_input = defaultdict(list)
    # This dictionary will help organize which results to merge
    all_merged_results = defaultdict(list)

    for search_engine in search_engines:
        # The modification size for MSFragger is configured through
        # the precursor mass tolerance
        if search_engine == "msfragger_2_3":
            uc.params.update({
                "precursor_mass_tolerance_unit": "da",
                "precursor_mass_tolerance_plus": 4000,
                "precursor_mass_tolerance_minus": 200,
            })
        for n, spec_file in enumerate(mzml_files):
            # 1. Convert to MGF
            mgf_file = uc.convert(
                input_file=spec_file,
                engine="mzml2mgf_2_0_0",
            )
            # 2. Do the actual search
            raw_search_results = uc.search_mgf(
                input_file=mgf_file,
                engine=search_engine,
            )
            # Reset the precursor mass tolerance, in case it was
            # previously changed for MSFragger
            uc.params.update({
                "precursor_mass_tolerance_unit": "ppm",
                "precursor_mass_tolerance_plus": 5,
                "precursor_mass_tolerance_minus": 5,
            })
            # 3. Convert the search results to csv
            csv_search_results = uc.convert(
                input_file=raw_search_results,
                engine=None,
                guess_engine=True,
            )
            # 4. Map peptides to proteins
            mapped_csv_search_results = uc.execute_misc_engine(
                input_file=csv_search_results,
                engine="upeptide_mapper_1_0_0",
            )
            # 5. Convert the csv to the unified ursgal csv format
            unified_search_results = uc.execute_misc_engine(
                input_file=mapped_csv_search_results,
                engine="unify_csv_1_0_0",
                merge_duplicates=False,
            )
            # 6. Validate the results
            validated_csv = uc.validate(
                input_file=unified_search_results,
                engine=validation_engine,
            )
            # Save the validated input for combined PEP.
            # Eventually, each sample will have three files,
            # corresponding to the three search engines.
            combined_pep_input["sample_{0}".format(n)].append(validated_csv)
            filtered_validated_results = uc.execute_misc_engine(
                input_file=validated_csv,
                engine="filter_csv_1_0_0",
                merge_duplicates=False,
            )
            all_merged_results["percolator_only"].append(
                filtered_validated_results
            )

    # Combined PEP
    uc.params.update({
        "csv_filter_rules": [
            ["Is decoy", "equals", "false"],
            ["combined PEP", "lte", 0.01],
        ],
        "psm_defining_colnames": [
            "Spectrum Title",
            "Sequence",
            "Modifications",
            "Charge",
            "Is decoy",
        ],
    })
    for sample in combined_pep_input.keys():
        combine_results = uc.execute_misc_engine(
            input_file=combined_pep_input[sample],
            engine="combine_pep_1_0_0",
        )
        filtered_validated_results = uc.execute_misc_engine(
            input_file=combine_results,
            engine="filter_csv_1_0_0",
        )
        all_merged_results["combined_pep"].append(filtered_validated_results)

    # Separately merge the results from the two types of validation
    # techniques. "Mass Difference" is added back to the columns defining
    # a PSM, to avoid merging different mass differences.
    uc.params.update({
        "psm_defining_colnames": [
            "Spectrum Title",
            "Sequence",
            "Modifications",
            "Charge",
            "Is decoy",
            "Mass Difference",
        ],
    })
    for validation_type in all_merged_results.keys():
        if validation_type == "percolator_only":
            uc.params["psm_colnames_to_merge_multiple_values"] = {
                "PEP": "min_value",
            }
        else:
            uc.params["psm_colnames_to_merge_multiple_values"] = {
                "combined PEP": "min_value",
                "Bayes PEP": "min_value",
            }
        # The prefix helps to recognize the files easily
        uc.params["prefix"] = "All_{0}".format(validation_type)
        merged_results_one_rep = uc.execute_misc_engine(
            input_file=all_merged_results[validation_type],
            engine="merge_csvs_1_0_0",
            merge_duplicates=True,
        )
        uc.params["prefix"] = ""
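Conceptually, the merge step above collapses rows that share the PSM-defining columns and keeps the best (minimum) value for the configured score columns. A minimal standalone sketch of that reduction (a hypothetical helper, not the ursgal merge engine itself):

def merge_psm_rows(rows, psm_cols, min_cols):
    '''Collapse csv rows (dicts) sharing the PSM-defining columns,
    keeping the minimum value for score columns such as "combined PEP".
    Sketch of the idea behind psm_colnames_to_merge_multiple_values.'''
    merged = {}
    for row in rows:
        key = tuple(row[c] for c in psm_cols)
        if key not in merged:
            merged[key] = dict(row)
            continue
        for col in min_cols:
            merged[key][col] = min(
                float(merged[key][col]), float(row[col])
            )
    return list(merged.values())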
def main(folder=None, enzyme=None, target_decoy_database=None):
    """
    Workflow for the analysis of a dataset with one run per sample.

    Usage:
        python <script_name.py> <folder_with_mzML> <enzyme> <path_to_database>
    """
    # Define the folder with the mzML files as sys.argv[1]
    mzML_files = []
    for mzml in glob.glob(os.path.join(folder, "*.mzML")):
        mzML_files.append(os.path.basename(mzml))

    offset_files = []
    for sample in offsets.keys():
        for spec_file in offsets[sample].keys():
            offset_files.append(spec_file)
    for mzml in mzML_files:
        if mzml not in offset_files:
            print("mzML file in folder but NOT in offset dict: {}".format(mzml))
            exit()
    for sample in offset_files:
        if sample not in mzML_files:
            print("Sample in offset dict but mzML file NOT in folder: {}".format(sample))
            exit()

    mass_spectrometer = "QExactive+"
    search_engines = [
        "xtandem_vengeance",
        "msfragger_2_3",
        "msgfplus_v2019_07_03",
    ]
    validation_engine = "percolator_3_4_0"

    params = {
        "database": target_decoy_database,
        "enzyme": enzyme,
        "precursor_mass_tolerance_minus": 10,
        "precursor_mass_tolerance_plus": 10,
        "frag_mass_tolerance": 10,
        "frag_mass_tolerance_unit": "ppm",
        "rounded_mass_decimals": 2,
        "-xmx": "32g",
        "peptide_mapper_class_version": "UPeptideMapper_v4",
        "use_pyqms_for_mz_calculation": True,
        "percolator_post_processing": "mix-max",
        "psm_defining_colnames": [
            "Spectrum Title",
            "Sequence",
            "Modifications",
            "Charge",
            "Is decoy",
        ],
        "max_missed_cleavages": 2,
    }

    # Glycans are defined as variable modifications.
    # Hex and Hex(1)HexA(1) (=1427) are existing Unimod modifications.
    Hvo_Glyco = [
        "",
        "N,opt,any,Hex",
        "N,opt,any,1427",
        "N,opt,any,Hex(1)HexA(2),C18H26O17",
        "N,opt,any,Hex(1)HexA(3),C24H34O23",
        "N,opt,any,Hex(1)HexA(2)MeHexA(1)Hex(1),C31H46O28",
        "N,opt,any,Hex(1)HexA(2)MeHexA(1),C25H36O23",
        "N,opt,any,SO3Hex(1),C6H10O8S1",
        "N,opt,any,SO3Hex(1)Hex(1),C12H20O13S1",
        "N,opt,any,SO3Hex(1)Hex(2),C18H30O18S1",
        "N,opt,any,SO3Hex(1)Hex(2)dHex(1),C24H40O22S1",
    ]

    uc = ursgal.UController(profile=mass_spectrometer, params=params)

    combined_pep_result_files = []
    for sample in sorted(offsets.keys(), reverse=True):
        validated_result_files = []
        for search_engine in search_engines:
            engine_results_validated = []
            for n, mod in enumerate(Hvo_Glyco):
                results_one_mod = []
                for spec_file in sorted(offsets[sample].keys()):
                    basename = spec_file
                    dirname = os.path.join(folder)
                    offset = offsets[sample][basename]
                    spec_file_path = os.path.join(dirname, basename)
                    if offset == "skip":
                        continue
                    uc.params["machine_offset_in_ppm"] = offset
                    mgf_file = uc.convert(
                        input_file=spec_file_path,
                        engine="mzml2mgf_2_0_0",
                    )
                    if n == 0:
                        uc.params["modifications"] = [
                            "C,fix,any,Carbamidomethyl",
                            "M,opt,any,Oxidation",
                            "*,fix,N-term,iTRAQ4plex",
                            "K,opt,any,iTRAQ4plex",
                            "Y,opt,any,iTRAQ4plex",
                        ]
                    else:
                        uc.params["modifications"] = [
                            "C,fix,any,Carbamidomethyl",
                            "M,opt,any,Oxidation",
                            "*,fix,N-term,iTRAQ4plex",
                            "K,opt,any,iTRAQ4plex",
                            "Y,opt,any,iTRAQ4plex",
                            "S,opt,any,Hex(2)",
                            "T,opt,any,Hex(2)",
                        ]
                        uc.params["modifications"].append(mod)
                        uc.params["prefix"] = mod.split(",")[3]
                    search_result = uc.search_mgf(
                        input_file=mgf_file,
                        engine=search_engine,
                    )
                    uc.params["prefix"] = ""
                    converted_result = uc.convert(
                        input_file=search_result,
                        guess_engine=True,
                    )
                    mapped_results = uc.execute_misc_engine(
                        input_file=converted_result,
                        engine="upeptide_mapper",
                    )
                    unified_search_results = uc.execute_misc_engine(
                        input_file=mapped_results,
                        engine="unify_csv",
                    )
                    results_one_mod.append(unified_search_results)
                uc.params["prefix"] = sample
                merged_1engine_1mod_1sample = uc.execute_misc_engine(
                    input_file=results_one_mod,
                    engine="merge_csvs",
                    # merge_duplicates=True,
                )
                uc.params["prefix"] = ""
                # engine_results_unvalidated.append(merged_1engine_1mod_1sample)
                validated_csv = uc.validate(
                    input_file=merged_1engine_1mod_1sample,
                    engine=validation_engine,
                )
                engine_results_validated.append(validated_csv)
            merged_1engine_all_mods_validated = uc.execute_misc_engine(
                input_file=engine_results_validated,
                engine="merge_csvs",
                merge_duplicates=False,
            )
            validated_result_files.append(merged_1engine_all_mods_validated)

        uc.params["prefix"] = sample
        combined_pep_validated = uc.combine_search_results(
            input_files=validated_result_files,
            engine="combine_pep_1_0_0",
        )
        uc.params["prefix"] = ""
        uc.params["csv_filter_rules"] = [
            # ["Is decoy", "equals", "false"],
            ["combined PEP", "lte", 0.01],
            ["Conflicting uparam", "contains_not", "enzyme"],
        ]
        filtered_validated_results = uc.execute_misc_engine(
            input_file=combined_pep_validated,
            engine="filter_csv",
        )
        combined_pep_result_files.append(filtered_validated_results)
        # uc.params['peptide_forest_initial_engine'] = 'msfragger_2_3'
        # uc.params['peptide_forest_file_params'] = {}
        # uc.params['prefix'] = 'peptide_forest_' + sample
        # peptide_forest_validated = uc.validate(
        #     input_file=unvalidated_result_files,
        #     engine='peptide_forest',
        # )
        # uc.params['csv_filter_rules'] = [
        #     ['Is decoy', 'equals', 'false'],
        #     ['q-value_RF-reg', 'lte', 0.01],
        #     ['Conflicting uparam', 'contains_not', 'enzyme'],
        # ]
        # filtered_peptide_forest = uc.execute_misc_engine(
        #     input_file=peptide_forest_validated,
        #     engine='filter_csv',
        # )
        # peptide_forest_result_files.append(filtered_peptide_forest)

    uc.params["prefix"] = ""
    results_all_combined_pep = uc.execute_misc_engine(
        input_file=combined_pep_result_files,
        engine="merge_csvs",
        merge_duplicates=True,
    )
    uc.params.update({
        "validation_score_field": "combined PEP",
        "bigger_scores_better": False,
        "num_compared_psms": 10,
        "accept_conflicting_psms": False,
        "threshold_is_log10": True,
        "score_diff_threshold": 1,
        "psm_defining_colnames": [
            "Spectrum Title",
            "Sequence",
        ],
    })
    sanitized_combined_results = uc.execute_misc_engine(
        input_file=results_all_combined_pep,
        engine="sanitize_csv",
    )
    uc.params["prefix"] = "Glyco_everywhere"
    uc.params["csv_filter_rules"] = [
        ["Modifications", "contains", "Hex"],
        # ['Sequence', 'contains_glycosite', 'N[^P][ST]'],
    ]
    Glyco_filtered = uc.execute_misc_engine(
        input_file=sanitized_combined_results,
        engine="filter_csv",
    )
    uc.params["prefix"] = "Glyco_glycosite"
    uc.params["csv_filter_rules"] = [
        ["Modifications", "contains", "Hex"],
        ["Sequence", "contains_glycosite", "N[^P][ST]"],
    ]
    Glyco_filtered = uc.execute_misc_engine(
        input_file=sanitized_combined_results,
        engine="filter_csv",
    )
    uc.params["prefix"] = ""
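The `offsets` lookup used above is defined elsewhere in the script; the loops assume the shape sketched below, with per-sample mappings from mzML basenames to machine offsets in ppm (the sample and file names here are hypothetical placeholders):

# Assumed (hypothetical) shape of the offsets lookup: per sample, a mapping
# of mzML basenames to machine offsets in ppm, or "skip" to exclude a run.
offsets = {
    "sample_A": {
        "run_01.mzML": -1.5,
        "run_02.mzML": "skip",
    },
}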
def analyze(collector):
    '''
    Simple analysis script for the cascade search, counting the number of
    identified peptides (combination of peptide sequence and modifications)
    and PSMs (additionally including the spectrum ID)
    '''
    mod_list = ['Oxidation', 'Deamidated', 'Methyl', 'Acetyl', 'Phospho']
    fieldnames = [
        'approach',
        'count_type',
        'validation_engine',
        'unmodified',
        'multimodified',
    ] + mod_list + ['total']
    csv_writer = csv.DictWriter(open('cascade_results.csv', 'w'), fieldnames)
    csv_writer.writeheader()
    uc = ursgal.UController()
    uc.params['validation_score_field'] = 'PEP'
    uc.params['bigger_scores_better'] = False

    # Count the number of identified peptides and PSMs for the different
    # modifications. Spectra with multiple PSMs are sanitized, i.e. only
    # the PSM with the best PEP score is counted, and only if the best hit
    # has a PEP that is at least two orders of magnitude smaller than the
    # others.
    for validation_engine, result_file in collector.items():
        counter_dict = {'psm': ddict(set), 'pep': ddict(set)}
        grouped_psms = uc._group_psms(
            result_file,
            validation_score_field='PEP',
            bigger_scores_better=False,
        )
        for spec_title, grouped_psm_list in grouped_psms.items():
            best_score, best_line_dict = grouped_psm_list[0]
            if len(grouped_psm_list) > 1:
                second_best_score, second_best_line_dict = grouped_psm_list[1]
                best_peptide_and_mod = (
                    best_line_dict['Sequence'] + best_line_dict['Modifications']
                )
                second_best_peptide_and_mod = (
                    second_best_line_dict['Sequence']
                    + second_best_line_dict['Modifications']
                )
                if best_peptide_and_mod == second_best_peptide_and_mod:
                    line_dict = best_line_dict
                elif best_line_dict['Sequence'] == second_best_line_dict['Sequence']:
                    if best_score == second_best_score:
                        line_dict = best_line_dict
                    else:
                        if (-1 * math.log10(best_score)) - (
                            -1 * math.log10(second_best_score)
                        ) >= 2:
                            line_dict = best_line_dict
                        else:
                            continue
                else:
                    if (-1 * math.log10(best_score)) - (
                        -1 * math.log10(second_best_score)
                    ) >= 2:
                        line_dict = best_line_dict
                    else:
                        continue
            else:
                line_dict = best_line_dict

            count = 0
            for mod in mod_list:
                if mod in line_dict['Modifications']:
                    count += 1
            key_2_add = ''
            if count == 0:
                key_2_add = 'unmodified'
            elif count >= 2:
                key_2_add = 'multimodified'
            elif count == 1:
                for mod in mod_list:
                    if mod in line_dict['Modifications']:
                        key_2_add = mod
                        break
            # for peptide identification comparison
            counter_dict['pep'][key_2_add].add(
                line_dict['Sequence'] + line_dict['Modifications']
            )
            # for PSM comparison
            counter_dict['psm'][key_2_add].add(
                line_dict['Spectrum Title']
                + line_dict['Sequence']
                + line_dict['Modifications']
            )
        for counter_key, count_dict in counter_dict.items():
            dict_2_write = {
                'approach': 'cascade',
                'count_type': counter_key,
                'validation_engine': validation_engine,
            }
            total_number = 0
            for key, obj_set in count_dict.items():
                dict_2_write[key] = len(obj_set)
                total_number += len(obj_set)
            dict_2_write['total'] = total_number
            csv_writer.writerow(dict_2_write)
    return
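For reference, a hypothetical invocation of the function above; `collector` maps validation engine names to merged, filtered result csv files produced by the cascade workflow (both the engine name and the file name here are placeholders):

# Hypothetical usage; keys and paths are placeholders, not script outputs.
analyze({'percolator_2_08': 'cascade_results_merged_filtered.csv'})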
# encoding: utf-8
'''
Test the unify_csv function for the myrimatch engine
'''
import ursgal
import csv
import pickle
import os

modifications = [
    'C,fix,any,Carbamidomethyl',  # Carbamidomethylation
]
R = ursgal.UController(params={'modifications': modifications})
R.map_mods()

scan_rt_lookup = pickle.load(
    open(os.path.join('tests', 'data', '_test_ursgal_lookup.pkl'), 'rb')
)

unify_csv_main = R.unodes['unify_csv_1_0_0'][
    'class'].import_engine_as_python_function()

input_csv = os.path.join(
    'tests', 'data', 'myrimatch_2_1_138', 'test_BSA1_myrimatch_2_1_138.csv'
)
output_csv = os.path.join(
    'tests', 'data', 'myrimatch_2_1_138', 'test_BSA1_myrimatch_2_1_138_unified.csv'
)
unify_csv_main(
    input_file=input_csv,
    output_file=output_csv,
    scan_rt_lookup=scan_rt_lookup,
)
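As a follow-up, a minimal sanity check could confirm that the unified csv was written and parses; this is a sketch and not part of the original test:

# Sketch: confirm that the unified csv exists and contains rows.
with open(output_csv) as unified_in:
    rows = list(csv.DictReader(unified_in))
assert len(rows) > 0, 'unified csv should contain at least one PSM row'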
def search(validation_engine):
    '''
    Executes a cascade search on four example files from the
    data from Barth et al.

    usage:
        ./cascade_search_example.py

    Searches for peptides using a cascade search approach similar to
    Kertesz-Farkas et al., in which spectra are first searched for
    unmodified peptides, followed by consecutive searches for the
    following modifications: oxidation of M, deamidation of N/Q,
    methylation of E/K/R, N-terminal acetylation, phosphorylation of S/T.

    After each step, spectra with a PSM below 1 % PEP are removed.
    '''
    # Initializing the ursgal UController class with
    # our specified modifications and mass spectrometer
    uc = ursgal.UController(
        profile=mass_spectrometer,  # 'LTQ XL low res' profile!
        params=params
    )

    # Complete workflow for every level of the cascade:
    # every spectrum file is searched with every search engine,
    # results are validated separately,
    # validated results are merged and filtered for targets and PEP <= 0.01.
    def workflow(
        spec_file,
        prefix=None,
        validation_engine=None,
        filter_before_validation=False,
        force=False,
    ):
        validated_results = []
        # Convert mzML to MGF outside the loop,
        # so this step is not repeated in the loop
        mgf_spec_file = uc.convert(
            input_file=spec_file,
            engine='mzml2mgf_1_0_0'
        )
        for search_engine in search_engines:
            uc.params['prefix'] = prefix
            unified_search_results = uc.search(
                input_file=mgf_spec_file,
                engine=search_engine,
                force=force,
            )
            uc.params['prefix'] = ''
            if filter_before_validation:
                uc.params['csv_filter_rules'] = [[
                    'Modifications',
                    'contains',
                    '{0}'.format(cascade[level][1].split(',')[3])
                ]]
                filtered_search_results = uc.execute_misc_engine(
                    input_file=unified_search_results,
                    engine='filter_csv_1_0_0'
                )
            else:
                filtered_search_results = unified_search_results
            validated_search_results = uc.validate(
                input_file=filtered_search_results,
                engine=validation_engine,
                force=force,
            )
            validated_results.append(validated_search_results)

        validated_results_from_all_engines = uc.execute_misc_engine(
            input_file=sorted(validated_results),
            engine='merge_csvs_1_0_0',
            force=force,
        )
        uc.params['csv_filter_rules'] = [
            ['Is decoy', 'equals', 'false'],
            ['PEP', 'lte', 0.01],
        ]
        filtered_validated_results = uc.execute_misc_engine(
            input_file=validated_results_from_all_engines,
            engine='filter_csv_1_0_0'
        )
        return filtered_validated_results

    result_files = []
    for spec_file in spec_files:
        spectra_with_PSM = set()
        for level in sorted(cascade.keys()):
            uc.params['modifications'] = cascade[level]
            if level == '0':
                results = workflow(
                    spec_file,
                    validation_engine=validation_engine,
                    prefix='cascade-lvl-{0}'.format(level)
                )
            else:
                uc.params['scan_exclusion_list'] = list(spectra_with_PSM)
                results = workflow(
                    spec_file,
                    validation_engine=validation_engine,
                    filter_before_validation=True,
                    force=True,
                    prefix='cascade-lvl-{0}'.format(level)
                )
            result_files.append(results)
            # Spectrum IDs of PSMs are written into an exclusion list for
            # the next level of the cascade search; these spectra will be
            # excluded during the mzml2mgf conversion.
            with open(results) as in_file:
                csv_input = csv.DictReader(in_file)
                for line_dict in csv_input:
                    spectra_with_PSM.add(line_dict['Spectrum ID'])
            print(
                'Number of spectra that will be removed for the next '
                'cascade level: {0}'.format(len(spectra_with_PSM))
            )
    results_all_files = uc.execute_misc_engine(
        input_file=sorted(result_files),
        engine='merge_csvs_1_0_0',
    )
    return results_all_files
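The `cascade` lookup is defined elsewhere in the script; the `filter_before_validation` branch above relies on the level-specific variable modification sitting at index 1 of each level's modification list. A hypothetical sketch of that shape (the exact modification strings are placeholders following the docstring's list):

# Hypothetical shape of the cascade lookup: level -> modification list,
# with the level-specific variable modification at index 1.
cascade = {
    '0': ['C,fix,any,Carbamidomethyl'],
    '1': ['C,fix,any,Carbamidomethyl', 'M,opt,any,Oxidation'],
    '2': ['C,fix,any,Carbamidomethyl', 'N,opt,any,Deamidated'],
}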
#!/usr/bin/env python3.4
# encoding: utf-8
'''
Test the unify_csv function for the omssa engine
'''
import ursgal
import csv
import pickle
import os

R = ursgal.UController(
    profile='LTQ XL low res',
    params={
        'database': os.path.join('tests', 'data', 'BSA.fasta'),
    },
    force=False,
)

scan_rt_lookup = pickle.load(
    open(os.path.join('tests', 'data', '_test_ursgal_lookup.pkl'), 'rb')
)

unify_csv_main = R.unodes['unify_csv_1_0_0'][
    'class'].import_engine_as_python_function()

input_csv = os.path.join(
    'tests', 'data', 'omssa_2_1_9', 'test_BSA1_omssa_2_1_9.csv'
)
output_csv = os.path.join(
    'tests', 'data', 'omssa_2_1_9', 'test_BSA1_omssa_2_1_9_unified.csv'
)
unify_csv_main(
    input_file=input_csv,
    output_file=output_csv,
    scan_rt_lookup=scan_rt_lookup,
)