Example #1
 def setUp(self):
     self.uc = ursgal.UController()
     self.uc.params['translations'] = {}
     self.uc.UNODE_UPARAMS.update({
         'test_ions': {
             'style': 'test_style_1',
             'ukey': 'test_ions',
             'ukey_translated': '__test_00000_ions',
             'default_value': 'Yes',
             'default_value_translated': True,
             'uvalue_style_translation': {
                 'Yes': True,
                 'No': False
             },
             'triggers_rerun': True
         },
         'score_test_ions': {
             'style': 'test_style_1',
             'ukey': 'score_test_ions',
             'ukey_translated': '__test_00000_ions',
             'default_value': True,
             'default_value_translated': 'Please yes translate',
             'uvalue_style_translation': {
                 True: 'Please yes translate',
                 False: 'No please leave me alone'
             },
             'triggers_rerun': True
         },
         'list_of_things': {
             'style': 'test_style_1',
             'ukey': 'list_of_things',
             'ukey_translated': 'list_of_things',
             'default_value': [True, True, True],
             'default_value_translated': [True, True, True],
             'uvalue_style_translation': {},
             'triggers_rerun': True
         }
     })
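The uvalue_style_translation dicts above map user-facing values to engine-style values. A minimal sketch of a test method exercising that mapping with plain dict lookups (not the full ursgal translation machinery):

 def test_style_translation_lookup(self):
     uparam = self.uc.UNODE_UPARAMS['test_ions']
     # 'Yes' is the user-facing default; its engine-style value is True
     translated = uparam['uvalue_style_translation'].get('Yes')
     assert translated is True
     assert translated == uparam['default_value_translated']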
Example #2
    def run(self):
        '''
        Download all resources from our webpage to ursgal/resources.

        '''
        working_directory = os.path.dirname(os.path.realpath(__file__))
        if os.path.exists(os.path.join(working_directory, 'ursgal')) is False:
            print('Could not find ursgal directory')
            sys.exit(1)
        import ursgal
        uc = ursgal.UController()
        downloaded_zips = uc.download_resources(resources=None)
        if len(downloaded_zips) == 0:
            print(
                '[ INFO ] No engines were downloaded, all should be available')
        else:
            print('[ INFO ] Downloaded and installed {0} engine(s)'.format(
                len(downloaded_zips)))
            for engine, zip_file in downloaded_zips:
                print(
                    '[ INFO ] Engine: {0} has been installed from {1}'.format(
                        engine, zip_file))
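If only specific engines are required, download_resources can presumably be pointed at an explicit subset instead of None; the accepted values for the resources argument are an assumption in this sketch:

        # hypothetical: restrict the download to one engine
        downloaded_zips = uc.download_resources(
            resources=['xtandem_vengeance']
        )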
Example #3
def main():
    '''
    Simple example script showing how to generate a target decoy database.

    Note:
        By default, a 'shuffled peptide preserving cleavage sites' database is
        generated. For this script, a 'reverse protein' database is generated
        instead.

    usage:

        ./target_decoy_generation_example.py

    '''
    params = {
        'enzyme': 'trypsin',
        'decoy_generation_mode': 'reverse_protein',
    }

    fasta_database_list = [
        os.path.join(
            os.pardir,
            'example_data',
            'BSA.fasta'
        )
    ]

    uc = ursgal.UController(
        params=params
    )

    new_target_decoy_db_name = uc.execute_misc_engine(
        input_file=fasta_database_list,
        engine='generate_target_decoy_1_0_0',
        output_file_name='my_BSA_target_decoy.fasta',
    )
    print('Generated target decoy database: {0}'.format(
        new_target_decoy_db_name))
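Continuing inside main(), the returned database path can be fed straight back into the controller for a search, using uc.search() as shown in Example #12; the mzML path here is hypothetical:

    uc.params['database'] = new_target_decoy_db_name
    unified_results = uc.search(
        input_file=os.path.join(os.pardir, 'example_data', 'BSA1.mzML'),
        engine='xtandem_vengeance',
    )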
Example #4
def main():
    """
    Example for plotting a simple Venn diagram with single ursgal csv files.

    usage:
        ./simple_venn_example.py


    """
    uc = ursgal.UController(
        profile="LTQ XL low res",
        params={
            "visualization_label_positions": {
                "0": "omssa",
                "1": "xtandem"
            }
        },
    )

    file_list = [
        os.path.join(os.pardir, "tests", "data", "omssa_2_1_9",
                     "test_BSA1_omssa_2_1_9.csv"),
        os.path.join(
            os.pardir,
            "tests",
            "data",
            "xtandem_sledgehammer",
            "test_BSA1_xtandem_sledgehammer.csv",
        ),
    ]

    uc.visualize(
        input_files=file_list,
        engine="venndiagram_1_1_0",
        force=True,
    )
    return
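The keys of visualization_label_positions appear to be input file list indices as strings, matching the order of file_list. A hypothetical extension for a third csv (inside main(), before calling uc.visualize()):

    uc.params["visualization_label_positions"]["2"] = "msgfplus"
    # file_list would then need a matching third csv at index 2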
Example #5
 def setUp(self):
     self.uc = ursgal.UController()
     self.uc.params["translations"] = {}
     self.uc.UNODE_UPARAMS.update(
         {
             "test_ions": {
                 "style": "test_style_1",
                 "ukey": "test_ions",
                 "ukey_translated": "__test_00000_ions",
                 "default_value": "Yes",
                 "default_value_translated": True,
                 "uvalue_style_translation": {"Yes": True, "No": False},
                 "triggers_rerun": True,
             },
             "score_test_ions": {
                 "style": "test_style_1",
                 "ukey": "score_test_ions",
                 "ukey_translated": "__test_00000_ions",
                 "default_value": True,
                 "default_value_translated": "Please yes translate",
                 "uvalue_style_translation": {
                     True: "Please yes translate",
                     False: "No please leave me alone",
                 },
                 "triggers_rerun": True,
             },
             "list_of_things": {
                 "style": "test_style_1",
                 "ukey": "list_of_things",
                 "ukey_translated": "list_of_things",
                 "default_value": [True, True, True],
                 "default_value_translated": [True, True, True],
                 "uvalue_style_translation": {},
                 "triggers_rerun": True,
             },
         }
     )
Example #6
def main():
    '''
    Prepare all resources as zip files for the online repository.

    '''
    uc = ursgal.UController()
    zip_files_list, update_kb_list = uc.prepare_resources(
        root_zip_target_folder='/tmp'
    )
    print()
    print('<<<Summary>>>')
    if len(zip_files_list) == 0:
        print('[ INFO ] All files are correctly stored in the online repository')
    else:
        for zip_file, md5 in zip_files_list:
            print(
                '[ INFO ] File: {0} was created with md5: {1}'.format(
                    zip_file,
                    md5
                )
            )
            print()
    print()
    if len(update_kb_list) == 0:
        print('[ INFO ] No kb information has to be updated')
    else:
        for engine, message in update_kb_list:
            print(
                '[ INFO ] Please update kb for {0}'.format(
                    engine,
                )
            )
            print(message)
            print()

    return
Example #7
def main():
    '''
    Example script for filtering validated results for a PEP <= 0.01 and
    removing all decoys.

    usage:
        ./filter_csv_validation_example.py


    Will produce a file containing only target sequences with a posterior
    error probability of at most 1 percent.
    '''
    params = {
        'csv_filter_rules': [['PEP', 'lte', 0.01],
                             ['Is decoy', 'equals', 'false']]
    }

    csv_file_to_filter = os.path.join(
        os.pardir, 'example_data', 'misc',
        'filter_csv_validation_example_omssa_2_1_9_unified_percolator_2_08_validated.csv'
    )
    uc = ursgal.UController(params=params)

    filtered_csv = uc.filter_csv(input_file=csv_file_to_filter)
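Continuing inside main(), the csv_filter_rules operators used across these examples include 'lte', 'equals', 'contains' and 'contains_not'; the rules in one list are applied together, so a stricter pass could look like this sketch:

    uc.params['csv_filter_rules'] = [
        ['PEP', 'lte', 0.01],
        ['Is decoy', 'equals', 'false'],
        ['Conflicting uparam', 'contains_not', 'enzyme'],
    ]
    refiltered_csv = uc.filter_csv(input_file=csv_file_to_filter)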
Example #8
def main(class_version):
    '''

    Example script to demonstrate speed and memory efficiency of the new
    upeptide_mapper.

    All tryptic peptides (n=1,094,395; 6 <= len(peptide) <= 40) are mapped to
    the Chlamydomonas reinhardtii (38,876 entries) target-decoy database.

    usage:
        ./complete_chlamydomonas_proteome_match.py <class_version>

    Class versions
        * UPeptideMapper_v2
        * UPeptideMapper_v3
        * UPeptideMapper_v4

    '''

    input_params = {
        'database': os.path.join(
            os.pardir, 'example_data',
            'Creinhardtii_281_v5_5_CP_MT_with_contaminants_target_decoy.fasta'
        ),
        'http_url': 'https://www.sas.upenn.edu/~sschulze/Creinhardtii_281_v5_5_CP_MT_with_contaminants_target_decoy.fasta',
        'http_output_folder': os.path.join(
            os.pardir,
            'example_data',
        ),
    }

    uc = ursgal.UController(params=input_params)

    if os.path.exists(input_params['database']) is False:
        uc.fetch_file(engine='get_http_files_1_0_0')
    print('Parsing fasta and digesting sequences')
    peptides = set()
    digest_start = time.time()
    for fastaID, sequence in ursgal.ucore.parse_fasta(
            open(input_params['database'], 'r')):
        tryptic_peptides = ursgal.ucore.digest(sequence, ('KR', 'C'),
                                               no_missed_cleavages=True)
        for p in tryptic_peptides:
            if 6 <= len(p) <= 40:
                peptides.add(p)
    print('Parsing fasta and digesting sequences took {0:1.2f} seconds'.format(
        time.time() - digest_start))
    if sys.platform == 'win32':
        print(
            '[ WARNING ] pyahocorasick can not be installed via pip on Windows at the moment\n'
            '[ WARNING ] Falling back to UPeptideMapper_v2')
        class_version = 'UPeptideMapper_v2'

    upapa_class = uc.unodes['upeptide_mapper_1_0_0'][
        'class'].import_engine_as_python_function(class_version)

    print('Buffering fasta and mapping {0} peptides'.format(len(peptides)))
    map_start = time.time()

    if class_version == 'UPeptideMapper_v2':
        peptide_mapper = upapa_class(word_len=6)
        fasta_lookup_name = peptide_mapper.build_lookup_from_file(
            input_params['database'],
            force=False,
        )
        args = [list(peptides), fasta_lookup_name]
    elif class_version == 'UPeptideMapper_v3':
        peptide_mapper = upapa_class(input_params['database'])
        fasta_lookup_name = peptide_mapper.fasta_name
        args = [list(peptides), fasta_lookup_name]
    elif class_version == 'UPeptideMapper_v4':
        peptide_mapper = upapa_class(input_params['database'])
        args = [list(peptides)]

    p2p_mappings = peptide_mapper.map_peptides(*args)
    print('Buffering fasta and mapping {0} peptides took {1:1.2f} seconds'.format(
        len(peptides), time.time() - map_start))
    if len(p2p_mappings.keys()) == len(peptides):
        print('All peptides have been mapped!')
    else:
        print('WARNING: Not all peptides have been mapped')
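Continuing inside main(), the per-peptide value structure of p2p_mappings is engine-internal, so a quick sanity check can simply print a few entries without assuming it:

    for peptide in sorted(p2p_mappings.keys())[:3]:
        print(peptide, p2p_mappings[peptide])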
Example #9
def main(folder=None, enzyme=None, target_decoy_database=None):
    '''
    Workflow comparing semi-enzymatic and fully enzymatic identifications of a
    dataset with multiple runs per sample.

    Usage:
        python <script_name.py> <folder_with_mzML> <enzyme> <path_to_database>
    '''
    # define folder with mzML_files as sys.argv[1]
    mzML_files = []
    offset_files = []
    for sample in offsets.keys():
        for mzml in glob.glob(os.path.join(folder, sample, '*.mzML')):
            mzML_files.append(mzml)
        for offset_file in offsets[sample].keys():
            offset_files.append(offset_file)
    for mzml in mzML_files:
        if os.path.basename(mzml) not in offset_files:
            print(
                'mzML file in folder but NOT in offset dict: {}'.format(mzml))
            exit()

    mass_spectrometer = 'QExactive+'
    search_engines = [
        'xtandem_vengeance',
        'msfragger_20190222',
        'msgfplus_v2019_04_18',
    ]

    validation_engine = 'percolator_3_4_0'

    params = {
        'database': target_decoy_database,
        'enzyme': enzyme,
        'csv_filter_rules': [
            ['Is decoy', 'equals', 'false'],
            ['PEP', 'lte', 0.01],
            ['Conflicting uparam', 'contains_not', 'enzyme'],
        ],
        'precursor_mass_tolerance_minus': 8,
        'precursor_mass_tolerance_plus': 8,
        'frag_mass_tolerance': 0.4,
        'frag_mass_tolerance_unit': 'da',
        'rounded_mass_decimals': 2,
        '-xmx': '32g',
        'peptide_mapper_class_version': 'UPeptideMapper_v4',
        'use_pyqms_for_mz_calculation': True,
        'semi_enzyme': True,
        'precursor_min_charge': 1,
        'precursor_max_charge': 5,
        'percolator_post_processing': 'mix-max',
        'psm_defining_colnames': [
            'Spectrum Title',
            'Sequence',
            'Modifications',
            'Charge',
            'Is decoy',
        ],
    }

    uc = ursgal.UController(profile=mass_spectrometer, params=params)

    all_result_files = []
    semi_result_files = []
    full_result_files = []
    for n, sample in enumerate(offsets.keys()):
        all_validated_result_files = []
        semi_validated_result_files = []
        full_validated_result_files = []
        combined_pep_result_files = []
        for search_engine in search_engines:
            results = []
            for spec_file in offsets[sample].keys():
                offset = offsets[sample][spec_file]
                if offset == 'skip':
                    continue
                uc.params['machine_offset_in_ppm'] = offset
                dirname = folder
                mzml_file = os.path.join(dirname, spec_file)
                mgf_file = uc.convert(
                    input_file=mzml_file,
                    engine='mzml2mgf_2_0_0',
                )
                uc.params['modifications'] = [
                    'C,fix,any,Carbamidomethyl',
                    'M,opt,any,Oxidation',
                    '*,opt,Prot-N-term,Acetyl',
                ]

                search_result = uc.search_mgf(
                    input_file=mgf_file,
                    engine=search_engine,
                )
                uc.params['prefix'] = ''

                converted_result = uc.convert(
                    input_file=search_result,
                    guess_engine=True,
                )

                mapped_results = uc.execute_misc_engine(
                    input_file=converted_result,
                    engine='upeptide_mapper',
                )

                unified_search_results = uc.execute_misc_engine(
                    input_file=mapped_results,
                    engine='unify_csv',
                    # force = True,
                )
                results.append(unified_search_results)

            results_one_engine = uc.execute_misc_engine(
                input_file=results,
                engine='merge_csvs',
                # merge_duplicates=True,
                # force=True,
            )

            all_validated_csv = uc.validate(
                input_file=results_one_engine,
                engine=validation_engine,
            )
            all_validated_result_files.append(all_validated_csv)

            uc.params.update({
                'csv_filter_rules': [
                    ['Enzyme Specificity', 'contains_not', 'full'],
                ],
                'prefix': 'Semi',
            })
            semi_filtered_csv = uc.execute_misc_engine(
                input_file=results_one_engine,
                engine='filter_csv',
            )
            semi_validated_csv = uc.validate(
                input_file=semi_filtered_csv,
                engine=validation_engine,
            )
            semi_validated_result_files.append(semi_validated_csv)

            uc.params.update({
                'csv_filter_rules': [
                    ['Enzyme Specificity', 'contains', 'full'],
                ],
                'prefix': 'Full',
            })
            full_filtered_csv = uc.execute_misc_engine(
                input_file=results_one_engine,
                engine='filter_csv',
            )
            full_validated_csv = uc.validate(
                input_file=full_filtered_csv,
                engine=validation_engine,
            )
            full_validated_result_files.append(full_validated_csv)
            uc.params.update({
                'csv_filter_rules': [
                    ['Is decoy', 'equals', 'false'],
                    ['PEP', 'lte', 0.01],
                    ['Conflicting uparam', 'contains_not', 'enzyme'],
                ],
                'prefix': '',
            })

        all_combined_results = uc.combine_search_results(
            input_files=all_validated_result_files,
            engine='combine_pep_1_0_0',
        )
        semi_combined_results = uc.combine_search_results(
            input_files=semi_validated_result_files,
            engine='combine_pep_1_0_0',
        )
        full_combined_results = uc.combine_search_results(
            input_files=full_validated_result_files,
            engine='combine_pep_1_0_0',
        )

        uc.params['csv_filter_rules'] = [
            # ['Is decoy', 'equals', 'false'],
            ['combined PEP', 'lte', 0.01],
            ['Conflicting uparam', 'contains_not', 'enzyme'],
        ]
        all_filtered_combined_results = uc.execute_misc_engine(
            input_file=all_combined_results,
            engine='filter_csv',
        )
        all_result_files.append(all_filtered_combined_results)

        semi_filtered_combined_results = uc.execute_misc_engine(
            input_file=semi_combined_results,
            engine='filter_csv',
        )
        semi_result_files.append(semi_filtered_combined_results)

        full_filtered_combined_results = uc.execute_misc_engine(
            input_file=full_combined_results,
            engine='filter_csv',
        )
        full_result_files.append(full_filtered_combined_results)

    for result_files in [all_result_files, semi_result_files, full_result_files]:
        uc.params.update({
            'psm_defining_colnames': [
                'Spectrum Title',
                'Sequence',
                'Modifications',
                'Charge',
                'Is decoy',
            ],
        })
        all_files = uc.execute_misc_engine(
            input_file=result_files,
            engine='merge_csvs',
            merge_duplicates=True,
        )

        uc.params.update({
            'validation_score_field': 'combined PEP',
            'bigger_scores_better': False,
            'num_compared_psms': 10,
            'accept_conflicting_psms': False,
            'threshold_is_log10': True,
            'score_diff_threshold': 1,
            'psm_defining_colnames': [
                'Spectrum Title',
                'Sequence',
            ],
        })

        sanitized_combined_results = uc.execute_misc_engine(
            input_file=all_files,
            engine='sanitize_csv',
        )
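A minimal sketch of the command-line entry point this workflow implies, mirroring the usage line documented in Example #11 (assumes the usual sys import at module level):

if __name__ == '__main__':
    main(
        folder=sys.argv[1],
        enzyme=sys.argv[2],
        target_decoy_database=sys.argv[3],
    )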
Example #10
def unify_csv(file, engine):
    uc = ursgal.UController(params=params, profile="QExactive+", verbose=False)
    uc.scan_rt_lookup_path = "tests/data/_ursgal_test_lookup.pkl"
    uc.map_mods()
    unify_csv_main = uc.unodes["unify_csv_1_0_0"][
        "class"].import_engine_as_python_function()

    output_csv = os.path.join("tests", "data",
                              os.path.splitext(file)[0] + "_unified.csv")
    input_csv = os.path.join("tests", "data", file)
    scan_rt_lookup = pickle.load(
        open(os.path.join("tests", "data", "_ursgal_test_pickle.pkl"), "rb"))
    unify_csv_main(
        input_file=input_csv,
        output_file=output_csv,
        scan_rt_lookup=scan_rt_lookup,
        params={
            "translations": {
                "decoy_tag": "decoy_",
                "enzyme": "KR;C;P",
                "semi_enzyme": False,
                "database": os.path.join("tests", "data", "P0ADZ4.fasta"),
                "protein_delimiter": "<|>",
                "psm_merge_delimiter": ";",
                "keep_asp_pro_broken_peps": True,
                "precursor_mass_tolerance_minus": 5,
                "precursor_mass_tolerance_plus": 5,
                "precursor_isotope_range": "0,1",
                "max_missed_cleavages": 2,
                "rounded_mass_decimals": 3,
                "use_pyqms_for_mz_calculation": False,
                "aa_exception_dict": {
                    "J": {
                        "original_aa": ["L", "I"],
                    },
                    "O": {
                        "original_aa": ["K"],
                        "unimod_name": "Methylpyrroline",
                    },
                },
            },
            "label":
            "",
            "mods":
            uc.params["mods"],
            "prefix":
            "",
            "psm_defining_colnames": [
                "Spectrum Title",
                "Sequence",
                "Modifications",
                "Mass Difference",
                "Charge",
                "Is decoy",
            ],
        },
        search_engine=engine,
    )
    reader_produced = [line for line in csv.DictReader(open(output_csv))]
    reader_expected = [
        line for line in csv.DictReader(open(output_csv + "_expected.csv"))
    ]
    for pos, line in enumerate(reader_produced):
        print("#{pos:0>5} Produced: {mod}".format(pos=pos,
                                                  mod=line["Modifications"]))
        print("#{pos:0>5} Expected: {mod}".format(
            pos=pos, mod=reader_expected[pos]["Modifications"]))
        assert line["Modifications"] == reader_expected[pos]["Modifications"]
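A hypothetical invocation of the test helper above, reusing a csv and engine name that appear in the other examples (the file and its *_expected.csv counterpart must exist under tests/data):

unify_csv("test_BSA1_omssa_2_1_9.csv", "omssa_2_1_9")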
Example #11
def main(folder=None, enzyme=None, target_decoy_database=None):
    '''
    Workflow for the analysis of a dataset with multiple runs per sample.
    Usage:
        python <script_name.py> <folder_with_mzML> <enzyme> <path_to_database>
    '''
    # define folder with mzML_files as sys.argv[1]
    mzML_files = []
    offset_files = []
    for sample in offsets.keys():
        for mzml in glob.glob(os.path.join(folder, sample, '*.mzML')):
            mzML_files.append(mzml)
        for offset_file in offsets[sample].keys():
            offset_files.append(offset_file)
    for mzml in mzML_files:
        if os.path.basename(mzml) not in offset_files:
            print(
                'mzML file in folder but NOT in offset dict: {0}'.format(mzml))
            exit()

    mass_spectrometer = 'QExactive+'
    search_engines = [
        'xtandem_vengeance',
        'msfragger_20190222',
        'msgfplus_v2019_04_18',
    ]

    validation_engine = 'percolator_3_4_0'

    params = {
        'database': target_decoy_database,
        'enzyme': enzyme,
        'precursor_mass_tolerance_minus': 10,
        'precursor_mass_tolerance_plus': 10,
        'frag_mass_tolerance': 10,
        'frag_mass_tolerance_unit': 'ppm',
        'rounded_mass_decimals': 2,
        '-xmx': '32g',
        'peptide_mapper_class_version': 'UPeptideMapper_v4',
        'use_pyqms_for_mz_calculation': True,
        'percolator_post_processing': 'mix-max',
        'psm_defining_colnames': [
            'Spectrum Title',
            'Sequence',
            'Modifications',
            'Charge',
            'Is decoy',
        ],
        'max_missed_cleavages': 3,
    }

    uc = ursgal.UController(profile=mass_spectrometer, params=params)

    all_result_files = []
    for n, sample in enumerate(offsets.keys()):
        validated_result_files = []
        combined_pep_result_files = []
        for search_engine in search_engines:
            results = []
            for spec_file in offsets[sample].keys():
                basename = spec_file
                dirname = os.path.join(folder)
                offset = offsets[sample][basename]
                spec_file_path = os.path.join(dirname, basename)
                if offset == 'skip':
                    continue
                uc.params['machine_offset_in_ppm'] = offset
                mgf_file = uc.convert(
                    input_file=spec_file_path,
                    engine='mzml2mgf_2_0_0',
                )

                uc.params['modifications'] = [
                    'C,fix,any,Carbamidomethyl',
                    'M,opt,any,Oxidation',
                    '*,opt,Prot-N-term,Acetyl',
                ]
                search_result = uc.search_mgf(
                    input_file=mgf_file,
                    engine=search_engine,
                )

                converted_result = uc.convert(
                    input_file=search_result,
                    guess_engine=True,
                )

                mapped_results = uc.execute_misc_engine(
                    input_file=converted_result,
                    engine='upeptide_mapper',
                )

                unified_search_results = uc.execute_misc_engine(
                    input_file=mapped_results, engine='unify_csv')

                results.append(unified_search_results)

                # validated_single_csv = uc.validate(
                #     input_file  = unified_search_results,
                #     engine      = validation_engine,
                # )
                #
                # uc.params['csv_filter_rules'] = [
                #     # ['Is decoy', 'equals', 'false'],
                #     ['combined PEP','lte', 0.01],
                #     ['Conflicting uparam', 'contains_not', 'enzyme'],
                # ]
                # filtered_combined_results = uc.execute_misc_engine(
                #     input_file = validated_single_csv,
                #     engine='filter_csv',
                # )

            uc.params['prefix'] = sample
            results_one_engine = uc.execute_misc_engine(
                input_file=results,
                engine='merge_csvs',
                # merge_duplicates=True,
            )
            uc.params['prefix'] = ''

            validated_csv = uc.validate(
                input_file=results_one_engine,
                engine=validation_engine,
            )
            # filtered_combined_results = uc.execute_misc_engine(
            #         input_file = validated_csv,
            #         engine='filter_csv',
            #     )

            validated_result_files.append(validated_csv)

        combined_results = uc.combine_search_results(
            input_files=validated_result_files,
            engine='combine_pep_1_0_0',
        )

        uc.params['csv_filter_rules'] = [
            ['combined PEP', 'lte', 0.01],
            ['Conflicting uparam', 'contains_not', 'enzyme'],
        ]
        filtered_combined_results = uc.execute_misc_engine(
            input_file=combined_results,
            engine='filter_csv',
        )
        all_result_files.append(filtered_combined_results)

    results_all_files = uc.execute_misc_engine(
        input_file=all_result_files,
        engine='merge_csvs',
        merge_duplicates=True,
    )

    uc.params.update({
        'validation_score_field': 'combined PEP',
        'bigger_scores_better': False,
        'num_compared_psms': 10,
        'accept_conflicting_psms': False,
        'threshold_is_log10': True,
        'score_diff_threshold': 1,
        'psm_defining_colnames': [
            'Spectrum Title',
            'Sequence',
        ],
    })

    sanitized_combined_results = uc.execute_misc_engine(
        input_file=results_all_files,
        engine='sanitize_csv',
    )
Example #12
def main():
    '''
    Example script to do a simple machine ppm offset parameter sweep.
    The m/z values in the example mgf file are changed stepwise, and in the
    final output the total number of peptides is counted.

    usage:
        ./bsa_ppm_offset_test.py

    Note:
        As expected, if the offset becomes too big, no peptides can be found
        anymore.

    '''
    ppm_offsets = [
        (-10, '-10_ppm_offset'),
        (-9, '-9_ppm_offset'),
        (-8, '-8_ppm_offset'),
        (-7, '-7_ppm_offset'),
        (-6, '-6_ppm_offset'),
        (-5, '-5_ppm_offset'),
        (-4, '-4_ppm_offset'),
        (-3, '-3_ppm_offset'),
        (-2, '-2_ppm_offset'),
        (-1, '-1_ppm_offset'),
        (None, '0_ppm_offset'),
        (1, '1_ppm_offset'),
        (2, '2_ppm_offset'),
        (3, '3_ppm_offset'),
        (4, '4_ppm_offset'),
        (5, '5_ppm_offset'),
        (6, '6_ppm_offset'),
        (7, '7_ppm_offset'),
        (8, '8_ppm_offset'),
        (9, '9_ppm_offset'),
        (10, '10_ppm_offset'),
    ]
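    # Equivalent programmatic construction of the list above; None replaces 0
    # so that no machine offset is applied for the centre point:
    ppm_offsets = [
        (o if o != 0 else None, '{0}_ppm_offset'.format(o))
        for o in range(-10, 11)
    ]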

    engine_list = ['xtandem_vengeance']

    R = ursgal.UController(
        profile='LTQ XL low res',
        params={
            'database': os.path.join(os.pardir, 'example_data', 'BSA.fasta'),
            'modifications': [
                'M,opt,any,Oxidation',  # Met oxidation
                'C,fix,any,Carbamidomethyl',  # Carbamidomethylation
                '*,opt,Prot-N-term,Acetyl'  # N-Acetylation
            ],
        })

    mzML_file = os.path.join(os.pardir, 'example_data',
                             'BSA_machine_ppm_offset_example', 'BSA1.mzML')
    if os.path.exists(mzML_file) is False:
        R.params['http_url'] = (
            'http://sourceforge.net/p/open-ms/code/HEAD/tree/OpenMS/share/OpenMS/examples/BSA/BSA1.mzML?format=raw'
        )
        R.params['http_output_folder'] = os.path.dirname(mzML_file)
        R.fetch_file(engine='get_http_files_1_0_0')
        try:
            shutil.move('{0}?format=raw'.format(mzML_file), mzML_file)
        except Exception:
            shutil.move('{0}format=raw'.format(mzML_file), mzML_file)

    for engine in engine_list:
        for (ppm_offset, prefix) in ppm_offsets:

            R.params['machine_offset_in_ppm'] = ppm_offset
            R.params['prefix'] = prefix

            unified_search_result_file = R.search(
                input_file=mzML_file,
                engine=engine,
                force=False,
            )

    collector = ddict(set)
    for csv_path in glob.glob('{0}/*/*unified.csv'.format(
            os.path.dirname(mzML_file))):
        for line_dict in csv.DictReader(open(csv_path, 'r')):
            collector[csv_path].add(line_dict['Sequence'])
    for csv_path, peptide_set in sorted(collector.items()):
        file_name = os.path.basename(csv_path)
        offset = file_name.split('_')[0]
        print('Search with {0: >3} ppm offset found {1: >2} peptides'.format(
            offset, len(peptide_set)))

    return
Example #13
#!/usr/bin/env python3.4
# encoding: utf-8
'''

Test the unify_csv function for msgfplus engine

'''
import ursgal
import csv
import pickle
import os


R = ursgal.UController()

scan_rt_lookup = pickle.load(
    open(
        os.path.join('tests', 'data', '_test_ursgal_lookup.pkl'),
        'rb'
    )
)

unify_csv_main = R.unodes['unify_csv_1_0_0']['class'].import_engine_as_python_function()
input_csv = os.path.join(
    'tests',
    'data',
    'novor_1_1beta',
Example #14
def main(dirpath, skip_old=False, num_specs=1):
    uc = ursgal.UController()
    uc.params.update(
        {
            "bigger_scores_better": False,
            "num_compared_psms": 10,
            "accept_conflicting_psms": False,
            "threshold_is_log10": True,
            "score_diff_threshold": 1,
            "psm_defining_colnames": [
                "Spectrum Title",
                "Sequence",
            ],
        }
    )
    pkl_name = os.path.join(dirpath, "datasets_result.pkl")
    fdr_pkl_name = os.path.join(dirpath, "fdr_result.pkl")
    old_exists = False
    if os.path.exists(pkl_name) and skip_old is True:
        # load results from previous analysis
        # will only add datasets that are not part of it already
        print(">>>>>>>> loading pkl <<<<<<<<<<<")
        results_dict = pickle.load(open(pkl_name, "rb"))
        fdr_dict = pickle.load(open(fdr_pkl_name, "rb"))
        old_exists = True
    else:
        # collect proteins and peptides from result csv,
        # store in dict with all important data
        results_dict = {
            "all": {
                "num_spectra": 0,
                "instrument": set(),
                "lab": set(),
                # protein_groups, proteins and peptides are dicts that contain sets for each level of confidence
                "protein_groups": {
                    "all": set(),
                    "safe_psm": set(),
                    "safe_seq": set(),
                    "safe_seq_num_spec": set(),
                    "safe_seq_num_spec_0005": set(),
                },
                "proteins": {
                    "all": set(),
                    "safe_psm": set(),
                    "safe_seq": set(),
                    "safe_seq_num_spec": set(),
                    "safe_seq_num_spec_0005": set(),
                },
                "peptides": {"all": set(), "safe": set(), "safe_num_specs": set()},
                "spectra": {"all": set()},
                # protein_dict in contrast is a nested dict with protein/protein_group --> peptide sequence --> spectral information
                # (containing lists of 'spec_title', 'bayes_pep', modifications', 'charge', 'psm_q_value', 'start_stop')
                "protein_dict": {},
            }
        }
        fdr_dict = {
            "peptides_seq_level": {},
            "peptides_psm_level": {},
            "peptides_seq_level_2specs": {},
            "glycopeptides_psm_level": {},
            "glycopeptides_seq_level": {},
            "glycopeptides_seq_level_2specs": {},
            "proteins_seq_level": {},
            "proteins_psm_level": {},
            "proteins_seq_level_2specs": {},
        }

    result_file_list = []
    for PRIDE_ID in datasets.keys():
        if skip_old is True and old_exists is True and PRIDE_ID in results_dict:
            continue
        print("reading:", PRIDE_ID)
        instrument = datasets[PRIDE_ID]["instrument"]
        results_dict["all"]["instrument"].add(instrument)
        lab = datasets[PRIDE_ID]["lab"]
        results_dict["all"]["lab"].add(lab)
        results_dict["all"]["num_spectra"] += datasets[PRIDE_ID]["num_spectra"]
        if PRIDE_ID not in results_dict.keys():
            results_dict[PRIDE_ID] = {
                "num_spectra": datasets[PRIDE_ID]["num_spectra"],
                "instrument": instrument,
                "lab": lab,
                "protein_groups": {
                    "all": set(),
                    "safe_psm": set(),
                    "safe_seq": set(),
                    "safe_seq_num_spec": set(),
                    "safe_seq_num_spec_0005": set(),
                },
                "proteins": {
                    "all": set(),
                    "safe_psm": set(),
                    "safe_seq": set(),
                    "safe_seq_num_spec": set(),
                    "safe_seq_num_spec_0005": set(),
                },
                "peptides": {"all": set(), "safe": set(), "safe_num_specs": set()},
                "spectra": {"all": set()},
                "protein_dict": {},
            }

        # collect proteins, peptides and corresponding spectrum_titles
        if datasets[PRIDE_ID]["folders"] != "":
            PRIDE_folder = os.path.join(PRIDE_ID, datasets[PRIDE_ID]["folders"])
        else:
            PRIDE_folder = PRIDE_ID
        merged_file = os.path.join(
            PRIDE_folder,
            datasets[PRIDE_ID]["result_file"],
        )
        result_file_list.append(merged_file)
        protein_ids = set()
        protein_groups = set()
        with open(merged_file, "r") as in_file:
            result_csv = csv.DictReader(in_file)
            for line_dict in result_csv:
                seq = line_dict["Sequence"]  # + line_dict['Modifications']
                mod = line_dict["Modifications"]
                mods = []
                # In contrast to the original ArcPP analysis, modifications are taken into account
                # except for optional modifications that depend on the sample preparation.
                # In the following, commented out sections indicate the use of "seq"
                # that has now been changed to "seq_mod".
                for m in line_dict["Modifications"].split(";"):
                    if "iTRAQ4plex" in m or "Label:" in m or "Oxidation" in m:
                        continue
                    mods.append(m)
                charge = line_dict["Charge"]
                # seq_mod = '{0}#{1}'.format(seq, mod)
                seq_mod = "{0}#{1}".format(seq, ";".join(mods))
                seq_length = len(seq)
                spec_title = line_dict["Spectrum Title"]
                sample = spec_title.split(".")[0]
                is_decoy = line_dict["Is decoy"]
                prot = line_dict["Protein ID"]
                start = line_dict["Sequence Start"]
                stop = line_dict["Sequence Stop"]
                pre = line_dict["Sequence Pre AA"]
                post = line_dict["Sequence Post AA"]
                psm_q_value = float(line_dict["combined PEP"])
                bayes_pep = float(line_dict["Bayes PEP"])
                if psm_q_value <= 0.01:
                    if seq_length not in fdr_dict["peptides_psm_level"].keys():
                        fdr_dict["peptides_psm_level"][seq_length] = {}
                    if seq not in fdr_dict["peptides_psm_level"][seq_length].keys():
                        fdr_dict["peptides_psm_level"][seq_length][seq] = (
                            psm_q_value,
                            is_decoy,
                        )
                    elif (
                        psm_q_value < fdr_dict["peptides_psm_level"][seq_length][seq][0]
                    ):
                        fdr_dict["peptides_psm_level"][seq_length][seq] = (
                            psm_q_value,
                            is_decoy,
                        )
                    if (
                        "Hex" in mod
                        and seq_length not in fdr_dict["glycopeptides_psm_level"].keys()
                    ):
                        fdr_dict["glycopeptides_psm_level"][seq_length] = {}
                    if (
                        "Hex" in mod
                        and seq_mod
                        not in fdr_dict["glycopeptides_psm_level"][seq_length].keys()
                    ):
                        fdr_dict["glycopeptides_psm_level"][seq_length][seq_mod] = (
                            psm_q_value,
                            is_decoy,
                        )
                    elif (
                        "Hex" in mod
                        and psm_q_value
                        < fdr_dict["glycopeptides_psm_level"][seq_length][seq_mod][0]
                    ):
                        fdr_dict["glycopeptides_psm_level"][seq_length][seq_mod] = (
                            psm_q_value,
                            is_decoy,
                        )
                else:
                    print(
                        "Results should be filtered by combined PEP <= 1% (but should contain targets and decoys)"
                    )
                    sys.exit(1)

                # differentiate between protein groups and proteins
                # and remove contaminants
                if len(prot.split("<|>")) > 1:
                    contaminants = True
                    for p in prot.split("<|>"):
                        prot_id = p.split(" ")[0]
                        if "HVO" not in prot_id:
                            continue
                        else:
                            contaminants = False
                    # contaminants = False
                    if contaminants is False and is_decoy == "false":
                        results_dict[PRIDE_ID]["protein_groups"]["all"].add(
                            line_dict["Protein ID"]
                        )
                        results_dict[PRIDE_ID]["peptides"]["all"].add(seq_mod)
                        # results_dict[PRIDE_ID]['peptides']['all'].add(seq)
                        results_dict[PRIDE_ID]["spectra"]["all"].add(spec_title)
                else:
                    contaminants = False
                    prot_id = prot.split(" ")[0]
                    if "HVO" not in prot_id:
                        contaminants = True
                    if contaminants is False and is_decoy == "false":
                        results_dict[PRIDE_ID]["proteins"]["all"].add(
                            line_dict["Protein ID"]
                        )
                        results_dict[PRIDE_ID]["peptides"]["all"].add(seq_mod)
                        # results_dict[PRIDE_ID]['peptides']['all'].add(seq)
                        results_dict[PRIDE_ID]["spectra"]["all"].add(spec_title)

                # add info to protein_dict
                if prot not in results_dict[PRIDE_ID]["protein_dict"].keys():
                    results_dict[PRIDE_ID]["protein_dict"][prot] = {}
                # if seq not in results_dict[PRIDE_ID]['protein_dict'][prot].keys():
                #     results_dict[PRIDE_ID]['protein_dict'][prot][seq] = {
                if seq_mod not in results_dict[PRIDE_ID]["protein_dict"][prot].keys():
                    results_dict[PRIDE_ID]["protein_dict"][prot][seq_mod] = {
                        "spec_title": [],
                        "bayes_pep": [],
                        "modifications": [],
                        "charge": [],
                        "psm_q_value": [],
                        "start_stop": (start, stop, pre, post),
                    }
                results_dict[PRIDE_ID]["protein_dict"][prot][seq_mod][
                    "spec_title"
                ].append(spec_title)
                results_dict[PRIDE_ID]["protein_dict"][prot][seq_mod][
                    "bayes_pep"
                ].append(bayes_pep)
                results_dict[PRIDE_ID]["protein_dict"][prot][seq_mod][
                    "psm_q_value"
                ].append(psm_q_value)
                results_dict[PRIDE_ID]["protein_dict"][prot][seq_mod][
                    "modifications"
                ].append(mod)
                results_dict[PRIDE_ID]["protein_dict"][prot][seq_mod]["charge"].append(
                    charge
                )
                # results_dict[PRIDE_ID]['protein_dict'][prot][seq]['spec_title'].append(spec_title)
                # results_dict[PRIDE_ID]['protein_dict'][prot][seq]['bayes_pep'].append(bayes_pep)
                # results_dict[PRIDE_ID]['protein_dict'][prot][seq]['psm_q_value'].append(psm_q_value)
                # results_dict[PRIDE_ID]['protein_dict'][prot][seq]['modifications'].append(mod)
                # results_dict[PRIDE_ID]['protein_dict'][prot][seq]['charge'].append(charge)

        # merge identifications from each dataset into "all"
        for level in ["protein_groups", "proteins", "peptides", "spectra"]:
            results_dict["all"][level]["all"] |= results_dict[PRIDE_ID][level]["all"]
        for prot in results_dict[PRIDE_ID]["protein_dict"].keys():
            if prot not in results_dict["all"]["protein_dict"].keys():
                results_dict["all"]["protein_dict"][prot] = {"datasets": set()}
            results_dict["all"]["protein_dict"][prot]["datasets"].add(PRIDE_ID)
            for seq in results_dict[PRIDE_ID]["protein_dict"][prot].keys():
                start_stop = results_dict[PRIDE_ID]["protein_dict"][prot][seq][
                    "start_stop"
                ]
                if seq not in results_dict["all"]["protein_dict"][prot].keys():
                    results_dict["all"]["protein_dict"][prot][seq] = {
                        "spec_title": [],
                        "bayes_pep": [],
                        "modifications": [],
                        "charge": [],
                        "psm_q_value": [],
                        "start_stop": start_stop,
                    }
                for k, v in results_dict[PRIDE_ID]["protein_dict"][prot][seq].items():
                    if k == "start_stop":
                        continue
                    results_dict["all"]["protein_dict"][prot][seq][k].extend(v)

    # Calculate q-values
    # peptides first, then proteins
    for PRIDE_ID in results_dict.keys():
        # generate input dict for q_value calculation function
        seq_q_value_dict = {}
        for prot in results_dict[PRIDE_ID]["protein_dict"].keys():
            for seq_mod in results_dict[PRIDE_ID]["protein_dict"][prot].keys():
                if seq_mod == "datasets":
                    continue
                seq_length = len(seq_mod.split("#")[0])
                min_bayes_pep = min(
                    results_dict[PRIDE_ID]["protein_dict"][prot][seq_mod]["bayes_pep"]
                )
                if seq_length not in seq_q_value_dict.keys():
                    seq_q_value_dict[seq_length] = {}
                if "decoy_" in prot:
                    is_decoy = True
                else:
                    is_decoy = False
                seq_q_value_dict[seq_length][seq_mod] = {
                    "Bayes PEP": min_bayes_pep,
                    "Is decoy": is_decoy,
                }
            # for seq in results_dict[PRIDE_ID]['protein_dict'][prot].keys():
            #     if seq == 'datasets':
            #         continue
            #     seq_length = len(seq)
            #     min_bayes_pep = min(
            #         results_dict[PRIDE_ID]['protein_dict'][prot][seq]['bayes_pep']
            #     )
            #     if seq_length not in seq_q_value_dict.keys():
            #         seq_q_value_dict[seq_length] = {}
            #     if 'decoy_' in prot:
            #         is_decoy = True
            #     else:
            #         is_decoy = False
            #     seq_q_value_dict[seq_length][seq] = {
            #         'Bayes PEP' : min_bayes_pep,
            #         'Is decoy' : is_decoy,
            #     }

        print("calculating q-values on peptide level")
        seq_calc_q_value_dict = calculate_q_value_by_group(
            seq_q_value_dict, sliding=False
        )

        # read results from peptide q_value calc, at the same time
        # generate input dict for proteins for q_value calculation function
        prot_q_value_dict = {"seq_level": {}, "psm_level": {}}
        for prot in results_dict[PRIDE_ID]["protein_dict"].keys():
            contaminants = False
            prot_id = prot.split(" ")[0]
            if "HVO" not in prot_id:
                contaminants = True
            if "decoy_" in prot:
                is_decoy = True
            else:
                is_decoy = False
            for seq in results_dict[PRIDE_ID]["protein_dict"][prot].keys():
                if seq == "datasets":
                    continue
                # seq_length = len(seq)
                seq_length = len(seq.split("#")[0])
                seq_q_value = seq_calc_q_value_dict[seq_length][seq]["combined PEP"]
                results_dict[PRIDE_ID]["protein_dict"][prot][seq][
                    "seq_q_value"
                ] = seq_q_value

                if seq_q_value <= SEQ_Q_VALUE_THRESHOLD:
                    if PRIDE_ID == "all":
                        if "Hex" in seq:
                            if (
                                seq_length
                                not in fdr_dict["glycopeptides_seq_level"].keys()
                            ):
                                fdr_dict["glycopeptides_seq_level"][seq_length] = {}
                            fdr_dict["glycopeptides_seq_level"][seq_length][seq] = (
                                seq_q_value,
                                is_decoy,
                            )
                        else:
                            if seq_length not in fdr_dict["peptides_seq_level"].keys():
                                fdr_dict["peptides_seq_level"][seq_length] = {}
                            fdr_dict["peptides_seq_level"][seq_length][seq] = (
                                seq_q_value,
                                is_decoy,
                            )
                    counts = len(
                        set(
                            results_dict[PRIDE_ID]["protein_dict"][prot][seq][
                                "spec_title"
                            ]
                        )
                    )
                    if is_decoy is False and contaminants is False:
                        results_dict[PRIDE_ID]["peptides"]["safe"].add(seq)
                        if counts >= num_specs:
                            results_dict[PRIDE_ID]["peptides"]["safe_num_specs"].add(seq)
                            if PRIDE_ID == "all":
                                if "Hex" in seq:
                                    if (
                                        seq_length
                                        not in fdr_dict[
                                            "glycopeptides_seq_level_2specs"
                                        ].keys()
                                    ):
                                        fdr_dict["glycopeptides_seq_level_2specs"][
                                            seq_length
                                        ] = {}
                                    fdr_dict["glycopeptides_seq_level_2specs"][
                                        seq_length
                                    ][seq] = (seq_q_value, is_decoy)
                                else:
                                    if (
                                        seq_length
                                        not in fdr_dict[
                                            "peptides_seq_level_2specs"
                                        ].keys()
                                    ):
                                        fdr_dict["peptides_seq_level_2specs"][
                                            seq_length
                                        ] = {}
                                    fdr_dict["peptides_seq_level_2specs"][seq_length][
                                        seq
                                    ] = (seq_q_value, is_decoy)
                    min_bayes_pep = min(
                        results_dict[PRIDE_ID]["protein_dict"][prot][seq]["bayes_pep"]
                    )
                    if min_bayes_pep == 0.0:
                        min_bayes_pep = np.nextafter(0, 1)
                    log_seq_bayes = math.log10(min_bayes_pep)
                    if prot not in prot_q_value_dict["seq_level"].keys():
                        prot_q_value_dict["seq_level"][prot] = {
                            "Bayes PEP": log_seq_bayes,
                            "Is decoy": is_decoy,
                        }
                    else:
                        prot_q_value_dict["seq_level"][prot][
                            "Bayes PEP"
                        ] += log_seq_bayes

                for bayes_pep in results_dict[PRIDE_ID]["protein_dict"][prot][seq][
                    "bayes_pep"
                ]:
                    if bayes_pep == 0.0:
                        bayes_pep = np.nextafter(0, 1)
                    log_psm_bayes = math.log10(bayes_pep)
                    if prot not in prot_q_value_dict["psm_level"].keys():
                        prot_q_value_dict["psm_level"][prot] = {
                            "Bayes PEP": log_psm_bayes,
                            "Is decoy": is_decoy,
                        }
                    else:
                        prot_q_value_dict["psm_level"][prot][
                            "Bayes PEP"
                        ] += log_psm_bayes

        print("calculating q-values on protein level")
        prot_calc_q_value_dict = calculate_q_value_by_group(
            prot_q_value_dict, sliding=False, picked_fdr=True
        )

        # read results from protein q_value calc
        for prot in results_dict[PRIDE_ID]["protein_dict"].keys():
            contaminants = False
            prot_id = prot.split(" ")[0]
            if "HVO" not in prot_id:
                contaminants = True
            if "decoy_" in prot:
                is_decoy = True
            else:
                is_decoy = False
            for level in ["psm_level", "seq_level"]:
                if prot in prot_calc_q_value_dict[level].keys():
                    prot_q_value = prot_calc_q_value_dict[level][prot]["combined PEP"]
                    prot_bayes_pep = prot_calc_q_value_dict[level][prot]["Bayes PEP"]
                else:
                    prot_q_value = 1
                    prot_bayes_pep = 1
                # count number of spectra for each prot (for seq FDR > 1%)
                # collect samples for simple protein inference model
                counts = 0
                samples = set()
                for seq in results_dict[PRIDE_ID]["protein_dict"][prot].keys():
                    if seq in [
                        "datasets",
                        "prot_q_value_seq",
                        "prot_q_value_psm",
                        "samples",
                    ]:
                        continue
                    if (
                        results_dict[PRIDE_ID]["protein_dict"][prot][seq]["seq_q_value"]
                        > 0.01
                    ):
                        continue
                    psm_set = set(
                        results_dict[PRIDE_ID]["protein_dict"][prot][seq]["spec_title"]
                    )
                    counts += len(psm_set)
                    for psm in psm_set:
                        ms_filename = ".".join(psm.split(".")[:-3])
                        samples.add(ms_filename2sample.get(ms_filename, ms_filename))

                if PRIDE_ID == "all":
                    if level == "seq_level":
                        fdr_dict["proteins_seq_level"][prot] = (prot_bayes_pep, is_decoy)
                        if counts >= num_specs:
                            fdr_dict["proteins_seq_level_2specs"][prot] = (
                                prot_bayes_pep,
                                is_decoy,
                            )
                    else:
                        fdr_dict["proteins_psm_level"][prot] = (prot_bayes_pep, is_decoy)
                if prot_q_value <= 0.01 and is_decoy is False and contaminants is False:
                    if level == "seq_level":
                        if len(prot.split("<|>")) > 1:
                            results_dict[PRIDE_ID]["protein_groups"]["safe_seq"].add(
                                prot
                            )
                            if counts >= num_specs:
                                results_dict[PRIDE_ID]["protein_groups"][
                                    "safe_seq_num_spec"
                                ].add(prot)
                                if prot_q_value <= PROT_Q_VALUE_THRESHOLD:
                                    results_dict[PRIDE_ID]["protein_groups"][
                                        "safe_seq_num_spec_0005"
                                    ].add(prot)
                        else:
                            results_dict[PRIDE_ID]["proteins"]["safe_seq"].add(prot)
                            if counts >= num_specs:
                                results_dict[PRIDE_ID]["proteins"][
                                    "safe_seq_num_spec"
                                ].add(prot)
                                if prot_q_value <= PROT_Q_VALUE_THRESHOLD:
                                    results_dict[PRIDE_ID]["proteins"][
                                        "safe_seq_num_spec_0005"
                                    ].add(prot)
                    elif counts >= num_specs:
                        if len(prot.split("<|>")) > 1:
                            results_dict[PRIDE_ID]["protein_groups"]["safe_psm"].add(
                                prot
                            )
                        else:
                            results_dict[PRIDE_ID]["proteins"]["safe_psm"].add(prot)
                if level == "seq_level":
                    results_dict[PRIDE_ID]["protein_dict"][prot][
                        "prot_q_value_seq"
                    ] = prot_q_value
                else:
                    results_dict[PRIDE_ID]["protein_dict"][prot][
                        "prot_q_value_psm"
                    ] = prot_q_value
                results_dict[PRIDE_ID]["protein_dict"][prot]["samples"] = samples
        print(
            "Number of confident protein identifications for {0}: {1}".format(
                PRIDE_ID,
                len(results_dict[PRIDE_ID]["proteins"]["safe_seq_num_spec_0005"]),
            )
        )

    # save results in a pkl
    pickle.dump(results_dict, open(pkl_name, "wb"))
    print("pickled results: ", pkl_name)

    pickle.dump(fdr_dict, open(fdr_pkl_name, "wb"))
    print("pickled fdr_dict: ", fdr_pkl_name)
Example #15
def main(fasta_database, class_version):
    '''

    Example script to demonstrate speed and memory efficiency of the new 
    upeptide_mapper.
    
    Specify fasta_database and class_version as input.

    usage:
        ./complete_proteome_match.py <fasta_database> <class_version>
    
    Class versions
        * UPeptideMapper_v2
        * UPeptideMapper_v3
        * UPeptideMapper_v4

    '''

    input_params = {
        'database': sys.argv[1],
    }

    uc = ursgal.UController(params=input_params)

    print('Parsing fasta and digesting sequences')
    peptides = set()
    digest_start = time.time()
    for fastaID, sequence in ursgal.ucore.parseFasta(
            open(input_params['database'], 'r')):
        tryptic_peptides = ursgal.ucore.digest(
            sequence,
            ('KR', 'C'),
            # no_missed_cleavages = True
        )
        for p in tryptic_peptides:
            if 6 <= len(p) <= 40:
                peptides.add(p)
    print('Parsing fasta and digesting sequences took {0:1.2f} seconds'.format(
        time.time() - digest_start))

    if sys.platform == 'win32':
        print(
            '[ WARNING ] pyahocorasick cannot be installed via pip on Windows at the moment\n'
            '[ WARNING ] Falling back to UPeptideMapper_v2')
        class_version = 'UPeptideMapper_v2'

    upapa_class = uc.unodes['upeptide_mapper_1_0_0'][
        'class'].import_engine_as_python_function(class_version)

    print('Buffering fasta and mapping {0} peptides'.format(len(peptides)))
    map_start = time.time()

    if class_version == 'UPeptideMapper_v2':
        peptide_mapper = upapa_class(word_len=6)
        fasta_lookup_name = peptide_mapper.build_lookup_from_file(
            input_params['database'],
            force=False,
        )
        args = [list(peptides), fasta_lookup_name]
    elif class_version == 'UPeptideMapper_v3':
        peptide_mapper = upapa_class(input_params['database'])
        fasta_lookup_name = peptide_mapper.fasta_name
        args = [list(peptides), fasta_lookup_name]
    elif class_version == 'UPeptideMapper_v4':
        peptide_mapper = upapa_class(input_params['database'])
        args = [list(peptides)]
    p2p_mappings = peptide_mapper.map_peptides(*args)
    print('Buffering fasta and mapping {0} peptides took {1:1.2f} seconds'.
          format(len(peptides),
                 time.time() - map_start))
    if len(p2p_mappings.keys()) == len(peptides):
        print('All peptides have been mapped!')
    else:
        print('WARNING: Not all peptides have been mapped')
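When the final check reports unmapped peptides, a plain set difference shows which ones failed; a short sketch with toy stand-ins for the peptides set and the p2p_mappings dict built above:

peptides = {'ELVISK', 'LIVESK', 'PEPTIDEK'}        # toy stand-in
p2p_mappings = {'ELVISK': [], 'LIVESK': []}        # toy stand-in for the mapper output
unmapped_peptides = peptides - set(p2p_mappings.keys())
for peptide in sorted(unmapped_peptides):
    print('Unmapped peptide: {0}'.format(peptide))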
def main(folder):
    '''
    Executes a search with different versions of MS-GF+ on an example file from
    the data of Bruderer et al.

    usage:
        ./msgf_plus_version_comparison_qexactive.py <folder containing B_D140314_SGSDSsample1_R01_MSG_T0.mzML.gz>


    Creates a Venn diagram with the peptides obtained by the different versions.


    '''

    required_example_file = 'B_D140314_SGSDSsample1_R01_MSG_T0.mzML.gz'
    full_path = os.path.join(folder, required_example_file)

    if os.path.exists(full_path) is False:

        print('''
            Your specified folder does not contain the required example file:
            {0}
            The RAW data from peptideatlas.org (PASS00589, password: WF6554orn)
            will be downloaded.
            Please convert to mzML after the download has finished and run this
            script again.
            '''.format(required_example_file))

        ftp_get_params = {
            'ftp_url': 'ftp.peptideatlas.org',
            'ftp_login': '******',
            'ftp_password': '******',
            'ftp_include_ext':
            [required_example_file.replace('.mzML', '.raw')],
            'ftp_output_folder': folder,
        }
        uc = ursgal.UController(params=ftp_get_params)
        uc.fetch_file(engine='get_ftp_files_1_0_0')
        sys.exit(1)

    engine_list = [
        'msgfplus_v9979',
        'msgfplus_v2016_09_16',
    ]

    params = {
        'database':
        os.path.join(os.pardir, 'example_data',
                     'hs_201303_qs_sip_target_decoy.fasta'),
        'modifications': ['C,fix,any,Carbamidomethyl'],
        'csv_filter_rules': [['PEP', 'lte', 0.01],
                             ['Is decoy', 'equals', 'false']],
        'http_url':
        'http://www.uni-muenster.de/Biologie.IBBP.AGFufezan/misc/hs_201303_qs_sip_target_decoy.fasta',
        'http_output_folder':
        os.path.join(os.pardir, 'example_data'),
        'machine_offset_in_ppm':
        -5e-6,
        'remove_temporary_files':
        False
    }

    uc = ursgal.UController(profile='QExactive+', params=params)

    if os.path.exists(params['database']) is False:
        uc.fetch_file(engine='get_http_files_1_0_0')

    mzML_file = os.path.join(folder, required_example_file)

    filtered_files_list = []
    for engine in engine_list:

        unified_result_file = uc.search(
            input_file=mzML_file,
            engine=engine,
            force=False,
        )

        validated_file = uc.validate(
            input_file=unified_result_file,
            engine='percolator_2_08',
        )

        filtered_file = uc.execute_misc_engine(
            input_file=validated_file,
            engine='filter_csv_1_0_0',
        )

        filtered_files_list.append(filtered_file)

    uc.visualize(
        input_files=filtered_files_list,
        engine='venndiagram',
    )
    return
def main():
    """

    Example script to compare UPeptideMapper v3 vs v4 results.


    usage:
        ./validate_upeptide_mapper_v3_vs_v4.py

    """

    input_params = {
        "database":
        os.path.join(
            os.pardir,
            "example_data",
            "Creinhardtii_281_v5_5_CP_MT_with_contaminants_target_decoy.fasta",
        ),
        "http_url":
        "https://www.sas.upenn.edu/~sschulze/Creinhardtii_281_v5_5_CP_MT_with_contaminants_target_decoy.fasta",
        "http_output_folder":
        os.path.join(
            os.pardir,
            "example_data",
        ),
    }

    uc = ursgal.UController(params=input_params)

    if os.path.exists(input_params["database"]) is False:
        uc.fetch_file(engine="get_http_files_1_0_0")
    print("Parsing fasta and digesting sequences")
    peptides = set()
    max_number_peptides = 1000000000
    digest_start = time.time()
    for fastaID, sequence in ursgal.ucore.parse_fasta(
            open(input_params["database"], "r")):
        tryptic_peptides = ursgal.ucore.digest(sequence, ("KR", "C"),
                                               no_missed_cleavages=True)
        for p in tryptic_peptides:
            if 6 <= len(p) <= 40:
                if len(peptides) > max_number_peptides:
                    break
                peptides.add(p)
    print("Parsing fasta and digesting sequences took {0:1.2f} seconds".format(
        time.time() - digest_start))

    # print(peptides)
    upapa_class = uc.unodes["upeptide_mapper_1_0_0"][
        "class"].import_engine_as_python_function("UPeptideMapper_v3")
    print("Buffering fasta and mapping {0} peptides with v3".format(
        len(peptides)))
    peptide_mapper = upapa_class(input_params["database"])
    fasta_lookup_name = peptide_mapper.fasta_name
    args = [list(peptides), fasta_lookup_name]
    start_time = time.time()
    v3_p2p_mappings = peptide_mapper.map_peptides(*args)
    print("UPeptideMapper v3 mapper took {0}s".format(time.time() -
                                                      start_time))
    print("Done")
    v3_p2p_mappings = copy.deepcopy(v3_p2p_mappings)

    upapa_class = uc.unodes["upeptide_mapper_1_0_0"][
        "class"].import_engine_as_python_function("UPeptideMapper_v4")
    print("Buffering fasta and mapping {0} peptides with v4".format(
        len(peptides)))
    peptide_mapper = upapa_class(input_params["database"])
    args = [
        list(peptides),
    ]
    start_time = time.time()
    v4_p2p_mappings = peptide_mapper.map_peptides(*args)
    print("UPeptideMapper v4 mapper took {0}s".format(time.time() -
                                                      start_time))
    print("Done")

    assert len(v3_p2p_mappings.keys()) == len(v4_p2p_mappings.keys())
    assert list(sorted(v3_p2p_mappings.keys())) == list(
        sorted(v4_p2p_mappings.keys()))
    compare_keys = [
        "start",
        "end",
        "pre",
        "post",
        "id",
    ]
    num_peps = len(v3_p2p_mappings.keys())
    for ppos, peptide in enumerate(list(sorted(v3_p2p_mappings.keys()))):
        v3_maps = sorted([(d["id"], d["start"], d)
                          for d in v3_p2p_mappings[peptide]])
        v4_maps = sorted([(d["id"], d["start"], d)
                          for d in v4_p2p_mappings[peptide]])
        print("Comparing peptide #{0}/{1}".format(ppos, num_peps), end="\r")
        assert len(v3_maps) == len(v4_maps)

        for pos, (v3_id, v3_start, v3_map_dict) in enumerate(v3_maps):
            v4_id, v4_start, v4_map_dict = v4_maps[pos]
            for key in compare_keys:
                assert v3_map_dict[key] == v4_map_dict[key]
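The assertions above rely only on the keys listed in compare_keys, so each mapping returned by map_peptides() is a dict carrying at least those keys. A hypothetical entry (all values made up for illustration):

example_mapping = {
    "id": "sp|P02769|ALBU_BOVIN",  # protein accession the peptide maps to
    "start": 25,                   # first residue of the peptide in the protein
    "end": 34,                     # last residue of the peptide in the protein
    "pre": "K",                    # residue preceding the peptide
    "post": "L",                   # residue following the peptide
}
for key in ["start", "end", "pre", "post", "id"]:
    assert key in example_mapping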
def main(folder):
    '''

    usage:

        ./human_br_complete_workflow.py <folder_with_human_br_files>

    This scripts produces the data for figure 3.

    '''

    # Initialize the UController:
    uc = ursgal.UController(params={
        'enzyme': 'trypsin',
        'decoy_generation_mode': 'reverse_protein',
    })

    # MS Spectra, downloaded from http://proteomecentral.proteomexchange.org
    # via the dataset accession PXD000263 and converted to mzML

    mass_spec_files = [
        '120813OTc1_NQL-AU-0314-LFQ-LCM-SG-01_013.mzML',
        '120813OTc1_NQL-AU-0314-LFQ-LCM-SG-02_025.mzML',
        '120813OTc1_NQL-AU-0314-LFQ-LCM-SG-03_033.mzML',
        '120813OTc1_NQL-AU-0314-LFQ-LCM-SG-04_048.mzML',
    ]

    for mass_spec_file in mass_spec_files:
        if os.path.exists(os.path.join(folder, mass_spec_file)) is False:
            print(
                'Please download RAW files to folder {} and convert to mzML:'.
                format(folder))
            pprint.pprint(mass_spec_files)
            sys.exit(1)

    # mods from Wen et al. (2015):
    modifications = [
        # Carbamidomethyl  (C) was set as fixed modification
        'C,fix,any,Carbamidomethyl',
        'M,opt,any,Oxidation',  # Oxidation (M) as well as
        # Deamidated (NQ) were set as optional modification
        'N,opt,any,Deamidated',
        # Deamidated (NQ) were set as optional modification
        'Q,opt,any,Deamidated',
    ]

    # The target peptide database which will be searched (UniProt Human
    # reference proteome from July 2013)
    target_database = 'uniprot_human_UP000005640_created_until_20130707.fasta'
    # Let's turn it into a target decoy database by reversing peptides:
    target_decoy_database = uc.execute_misc_engine(
        input_file=target_database, engine='generate_target_decoy_1_0_0')

    # OMSSA parameters from Wen et al. (2015):
    omssa_params = {
        # (used by default) # -w
        'he': '1000',  # -he 1000
        'zcc': '1',  # -zcc 1
        'frag_mass_tolerance': '0.6',  # -to 0.6
        'frag_mass_tolerance_unit': 'da',  # -to 0.6
        'precursor_mass_tolerance_minus': '10',  # -te 10
        'precursor_mass_tolerance_plus': '10',  # -te 10
        'precursor_mass_tolerance_unit': 'ppm',  # -teppm
        'score_a_ions': False,  # -i 1,4
        'score_b_ions': True,  # -i 1,4
        'score_c_ions': False,  # -i 1,4
        'score_x_ions': False,  # -i 1,4
        'score_y_ions': True,  # -i 1,4
        'score_z_ions': False,  # -i 1,4
        'enzyme': 'trypsin_p',  # -e 10
        'maximum_missed_cleavages': '1',  # -v 1
        'precursor_max_charge': '8',  # -zh 8
        'precursor_min_charge': '1',  # -zl 1
        'tez': '1',  # -tez 1
        'precursor_isotope_range': '0,1',  # -ti 1
        'num_match_spec': '1',  # -hc 1
        'database': target_decoy_database,
        'modifications': modifications,
    }

    # MS-GF+ parameters from Wen et al. (2015):
    msgf_params = {
        # precursor ion mass tolerance was set to 10 ppm
        'precursor_mass_tolerance_unit': 'ppm',
        # precursor ion mass tolerance was set to 10 ppm
        'precursor_mass_tolerance_minus': '10',
        # precursor ion mass tolerance was set to 10 ppm
        'precursor_mass_tolerance_plus': '10',
        # the max number of optional modifications per peptide were set as 3
        # (used by default) # number of allowed isotope errors was set as 1
        'enzyme': 'trypsin',  # the enzyme was set as trypsin
        # (used by default) # fully enzymatic peptides were specified, i.e. no non-enzymatic termini
        'frag_method':
        '1',  # the fragmentation method selected in the search was CID
        'max_pep_length':
        '45',  # the maximum peptide length to consider was set as 45
        # the minimum precursor charge to consider if charges are not specified
        # in the spectrum file was set as 1
        'precursor_min_charge': '1',
        # the maximum precursor charge to consider was set as 8
        'precursor_max_charge': '8',
        # (used by default) # the parameter 'addFeatures' was set as 1 (required for Percolator)
        # all of the other parameters were set as default
        # the instrument selected
        # was High-res
        'database': target_decoy_database,
        'modifications': modifications,
    }

    # X!Tandem parameters from Wen et al. (2015):
    xtandem_params = {
        # precursor ion mass tolerance was set to 10 ppm
        'precursor_mass_tolerance_unit': 'ppm',
        # precursor ion mass tolerance was set to 10 ppm
        'precursor_mass_tolerance_minus': '10',
        # precursor ion mass tolerance was set to 10 ppm
        'precursor_mass_tolerance_plus': '10',
        # the fragment ion mass tolerance was set to 0.6 Da
        'frag_mass_tolerance': '0.6',
        # the fragment ion mass tolerance was set to 0.6 Da
        'frag_mass_tolerance_unit': 'da',
        # parent monoisotopic mass isotope error was set as 'yes'
        'precursor_isotope_range': '0,1',
        'precursor_max_charge':
        '8',  # maximum parent charge of spectrum was set as 8
        'enzyme': 'trypsin',  # the enzyme was set as trypsin ([RK]|[X])
        # the maximum missed cleavage sites were set as 1
        'maximum_missed_cleavages': '1',
        # (used by default) # no model refinement was employed.
        'database': target_decoy_database,
        'modifications': modifications,
    }

    search_engine_settings = [
        # not used in Wen et al., so we use the same settings as xtandem
        ('msamanda_1_0_0_5243', xtandem_params, 'LTQ XL high res'),
        # not used in Wen et al., so we use the same settings as xtandem
        ('myrimatch_2_1_138', xtandem_params, 'LTQ XL high res'),
        # the instrument selected was High-res
        ('msgfplus_v9979', msgf_params, 'LTQ XL high res'),
        ('xtandem_jackhammer', xtandem_params, None),
        ('omssa_2_1_9', omssa_params, None),
    ]

    merged_validated_files_3_engines = []
    merged_validated_files_5_engines = []

    for engine, wen_params, instrument in search_engine_settings:

        # Initializing the uPLANIT UController class with
        # our specified modifications and mass spectrometer
        uc = ursgal.UController(params=wen_params)

        if instrument is not None:
            uc.set_profile(instrument)

        unified_results = []
        percolator_validated_results = []

        for mzML_file in mass_spec_files:
            unified_search_results = uc.search(
                input_file=mzML_file,
                engine=engine,
            )
            unified_results.append(unified_search_results)
            validated_csv = uc.validate(
                input_file=unified_search_results,
                engine='percolator_2_08',
            )
            percolator_validated_results.append(validated_csv)

        merged_validated_csv = uc.execute_misc_engine(
            input_file=percolator_validated_results, engine='merge_csvs_1_0_0')
        merged_unvalidated_csv = uc.execute_misc_engine(
            input_file=unified_results,
            engine='merge_csvs_1_0_0',
        )

        if engine in ["omssa_2_1_9", "xtandem_jackhammer", "msgfplus_v9979"]:
            merged_validated_files_3_engines.append(merged_validated_csv)
        merged_validated_files_5_engines.append(merged_validated_csv)

    uc.params['prefix'] = '5-engines-summary'
    uc.combine_search_results(
        input_files=merged_validated_files_5_engines,
        engine='combine_FDR_0_1',
    )

    uc.params['prefix'] = '3-engines-summary'
    uc.combine_search_results(
        input_files=merged_validated_files_3_engines,
        engine='combine_FDR_0_1',
    )
def main(folder):
    """
    Executes a search with 5 versions of X!Tandem on an example file from the
    data from Bruderer et al. 2015.

    usage:
        ./xtandem_version_comparison.py <folder containing B_D140314_SGSDSsample1_R01_MSG_T0.mzML>


    This is a simple example file to show the straightforward comparison of
    different program versions of X!Tandem, similar to the example script
    'xtandem_version_comparison', but analyzing high resolution data
    which can be better handled by versions newer than Jackhammer. One gains
    approximately 10 percent more peptides with newer versions of X!Tandem.

    Creates a Venn diagram with the peptides obtained by the different versions.


    """

    required_example_file = "B_D140314_SGSDSsample1_R01_MSG_T0.mzML"

    if os.path.exists(os.path.join(folder, required_example_file)) is False:
        print("""
            Your specified folder does not contain the required example file:
            {0}
            The RAW data from peptideatlas.org (PASS00589, password: WF6554orn) 
            will be downloaded. 
            Please convert to mzML after the download has finished and run this
            script again.
            """.format(required_example_file))

        ftp_get_params = {
            "ftp_url": "ftp.peptideatlas.org",
            "ftp_login": "******",
            "ftp_password": "******",
            "ftp_include_ext":
            [required_example_file.replace(".mzML", ".raw")],
            "ftp_output_folder": folder,
        }
        uc = ursgal.UController(params=ftp_get_params)
        uc.fetch_file(engine="get_ftp_files_1_0_0")
        sys.exit(1)

    engine_list = [
        "xtandem_cyclone",
        "xtandem_jackhammer",
        "xtandem_sledgehammer",
        "xtandem_piledriver",
        "xtandem_vengeance",
    ]

    params = {
        "database":
        os.path.join(os.pardir, "example_data",
                     "hs_201303_qs_sip_target_decoy.fasta"),
        "modifications": ["C,fix,any,Carbamidomethyl"],
        "csv_filter_rules": [["PEP", "lte", 0.01],
                             ["Is decoy", "equals", "false"]],
        "http_url":
        "http://www.uni-muenster.de/Biologie.IBBP.AGFufezan/misc/hs_201303_qs_sip_target_decoy.fasta",
        "http_output_folder":
        os.path.join(os.pardir, "example_data"),
        "machine_offset_in_ppm":
        -5e-6,
    }

    uc = ursgal.UController(profile="QExactive+", params=params)

    if os.path.exists(params["database"]) is False:
        uc.fetch_file(engine="get_http_files_1_0_0")

    mzML_file = os.path.join(folder, required_example_file)

    filtered_files_list = []
    for engine in engine_list:

        unified_result_file = uc.search(
            input_file=mzML_file,
            engine=engine,
            force=False,
        )

        validated_file = uc.validate(
            input_file=unified_result_file,
            engine="percolator_2_08",
        )

        filtered_file = uc.execute_misc_engine(
            input_file=validated_file,
            engine="filter_csv_1_0_0",
        )

        filtered_files_list.append(filtered_file)

    uc.visualize(
        input_files=filtered_files_list,
        engine="venndiagram_1_1_0",
    )
    return
def search(validation_engine):
    '''
    Executes a grouped search on four example files from the
    data of Barth et al.

    usage:
        ./grouped_search_example.py

    Searches for peptides including the following potential modifications: 
    oxidation of M,
    deamidation of N/Q,
    methylation of E/K/R,
    N-terminal acetylation,
    phosphorylation of S/T.

    After the search, each type of modification is validated separately.
    '''
    # Initializing the ursgal UController class with
    # our specified modifications and mass spectrometer
    uc = ursgal.UController(
        profile=mass_spectrometer,  # 'LTQ XL low res' profile!
        params=params)

    # complete workflow:
    # every spectrum file is searched with every search engine,
    # results are separated into groups and validated separately,
    # validated results are merged and filtered for targets and PEP <= 0.01.
    # In the end, all filtered results from all spectrum files are merged
    # for validation_engine in validation_engines:
    result_files = []
    for n, spec_file in enumerate(spec_files):
        validated_results = []
        for search_engine in search_engines:
            unified_search_results = uc.search(
                input_file=spec_file,
                engine=search_engine,
            )

            # Calculate PEP for every group separately; therefore the csv needs to be split first
            group_list = sorted(groups.keys())
            for p, group in enumerate(group_list):
                if group == '0':
                    uc.params['csv_filter_rules'] = [
                        [
                            'Modifications', 'contains_not',
                            '{0}'.format(groups['1'])
                        ],
                        [
                            'Modifications', 'contains_not',
                            '{0}'.format(groups['2'])
                        ],
                        [
                            'Modifications', 'contains_not',
                            '{0}'.format(groups['3'])
                        ],
                        [
                            'Modifications', 'contains_not',
                            '{0}'.format(groups['4'])
                        ],
                        [
                            'Modifications', 'contains_not',
                            '{0}'.format(groups['5'])
                        ],
                    ]
                else:
                    uc.params['csv_filter_rules'] = [[
                        'Modifications', 'contains',
                        '{0}'.format(groups[group])
                    ]]
                    for other_group in group_list:
                        if other_group == '0' or other_group == group:
                            continue
                        uc.params['csv_filter_rules'].append([
                            'Modifications', 'contains_not', '{0}'.format(
                                groups[other_group])
                        ], )
                uc.params['prefix'] = 'grouped-{0}'.format(group)
                filtered_results = uc.filter_csv(
                    input_file=unified_search_results, )
                uc.params['prefix'] = ''
                validated_search_results = uc.validate(
                    input_file=filtered_results,
                    engine=validation_engine,
                )
                validated_results.append(validated_search_results)

        uc.params['prefix'] = 'file{0}'.format(n)
        validated_results_from_all_engines = uc.merge_csvs(
            input_files=sorted(validated_results), )
        uc.params['prefix'] = ''
        uc.params['csv_filter_rules'] = [
            ['Is decoy', 'equals', 'false'],
            ['PEP', 'lte', 0.01],
        ]
        filtered_validated_results = uc.filter_csv(
            input_file=validated_results_from_all_engines)
        result_files.append(filtered_validated_results)

    results_all_files = uc.merge_csvs(input_files=sorted(result_files), )
    return results_all_files
Example #21
def main(input_file=None, arcpp_pep_file=None):
    """
    Post-process glycopeptide identifications in order to ensure
    that glycopeptide identifications are substantiated by at least
    two PSMs and two replicates.
    Furthermore, split results into N-glycans, non-canonical N-glycans and O-glycans.

    Usage:
        python <script_name.py> <path_to_glycopeptide_PSMs> <path_to_ArcPP_peptides_file>
    """
    params = {
        "psm_defining_colnames": [
            "Spectrum Title",
            "Sequence",
            "Modifications",
            "Charge",
            "Is decoy",
        ],
    }
    uc = ursgal.UController(params=params)

    arcpp_glycopeps = {}
    with open(arcpp_pep_file, "r") as arcpp_in:
        arcpp_csv = csv.DictReader(arcpp_in)
        for line_dict in arcpp_csv:
            glycopep, glycan_type = line_to_pep_unimod_glyc(line_dict)
            if glycopep not in arcpp_glycopeps.keys():
                arcpp_glycopeps[glycopep] = []
            arcpp_glycopeps[glycopep].append(line_dict)

    replicate_lookup = {}
    for pride in PRIDE_ids:
        replicate_lookup[pride] = {}
        file_description = os.path.join(
            "file_descriptions", "{0}_file_descriptions.csv".format(pride)
        )
        with open(file_description, "r") as descr_in:
            descr_csv = csv.DictReader(descr_in)
            for line_dict in descr_csv:
                strain = line_dict["Strain"]
                file_name = line_dict["Raw file name"].split(".")[0]
                replicate = line_dict["Replicate"]
                replicate_lookup[pride][file_name] = {
                    "strain": strain,
                    "rep": replicate,
                }

    true_n_glycopeps = ddict(dict)
    non_standard_n_glycopeps = ddict(dict)
    o_glycopeps = ddict(dict)
    all_strain = set()
    with open(input_file, "r") as glyco_in:
        glyco_csv = csv.DictReader(glyco_in)
        fieldnames = glyco_csv.fieldnames
        for line_dict in glyco_csv:
            protein = line_dict["Protein ID"]
            if protein.startswith("sp|"):
                continue
            # peptide = line_dict['Sequence']
            spec_id = line_dict["Spectrum Title"]
            file_name = line_dict["Spectrum Title"].split(".")[0]
            dataset = line_dict["Dataset"]
            strain = replicate_lookup[dataset][file_name]["strain"]
            rep = replicate_lookup[dataset][file_name]["rep"]
            glycopep, glycan_type = line_to_pep_unimod_glyc(line_dict)
            if strain not in true_n_glycopeps.keys():
                true_n_glycopeps[strain] = {}
                all_strain.add(strain)
            if "n_glycan" in glycan_type:
                if glycopep not in true_n_glycopeps[strain].keys():
                    true_n_glycopeps[strain][glycopep] = {
                        "frag_ions": set(),
                        "specs": set(),
                        "reps": set(),
                        "line_dicts": [],
                    }
                true_n_glycopeps[strain][glycopep]["specs"].add(spec_id)
                true_n_glycopeps[strain][glycopep]["reps"].add("#".join([dataset, rep]))
                true_n_glycopeps[strain][glycopep]["line_dicts"].append(line_dict)
                true_n_glycopeps[strain][glycopep]["frag_ions"].add(
                    line_dict["MS2 Glycopep Frag Ions Present"]
                )
            elif "o_glycan" in glycan_type and len(glycan_type) == 1:
                if strain not in o_glycopeps.keys():
                    o_glycopeps[strain] = {}
                    all_strain.add(strain)
                if glycopep not in o_glycopeps[strain].keys():
                    o_glycopeps[strain][glycopep] = {
                        "frag_ions": set(),
                        "specs": set(),
                        "reps": set(),
                        "line_dicts": [],
                    }
                o_glycopeps[strain][glycopep]["specs"].add(spec_id)
                o_glycopeps[strain][glycopep]["reps"].add("#".join([dataset, rep]))
                o_glycopeps[strain][glycopep]["line_dicts"].append(line_dict)
                o_glycopeps[strain][glycopep]["frag_ions"].add(
                    line_dict["MS2 Glycopep Frag Ions Present"]
                )
            elif "true_non_standard_n_glycan" in glycan_type:
                if strain not in non_standard_n_glycopeps.keys():
                    non_standard_n_glycopeps[strain] = {}
                    all_strain.add(strain)
                if glycopep not in non_standard_n_glycopeps[strain].keys():
                    non_standard_n_glycopeps[strain][glycopep] = {
                        "frag_ions": set(),
                        "specs": set(),
                        "reps": set(),
                        "line_dicts": [],
                    }
                non_standard_n_glycopeps[strain][glycopep]["specs"].add(spec_id)
                non_standard_n_glycopeps[strain][glycopep]["reps"].add(
                    "#".join([dataset, rep])
                )
                non_standard_n_glycopeps[strain][glycopep]["line_dicts"].append(
                    line_dict
                )
                non_standard_n_glycopeps[strain][glycopep]["frag_ions"].add(
                    line_dict["MS2 Glycopep Frag Ions Present"]
                )
            else:
                print(glycan_type)

    count_true_n_glycopeps = set()
    count_true_n_glycopeps_arcpp = set()
    count_o_glycopeps = set()
    count_o_glycopeps_arcpp = set()
    count_non_standard_glycopeps = set()
    count_non_standard_glycopeps_arcpp = set()
    output_line_dicts_n = []
    output_line_dicts_non_standard_n = []
    output_line_dicts_o = []
    for strain in all_strain:
        print(strain)
        for glycopep in true_n_glycopeps[strain].keys():
            if "True" not in true_n_glycopeps[strain][glycopep]["frag_ions"]:
                continue
            if len(true_n_glycopeps[strain][glycopep]["specs"]) < 2:
                continue
            if len(true_n_glycopeps[strain][glycopep]["reps"]) < 2:
                continue
            count_true_n_glycopeps.add(glycopep)
            if glycopep not in arcpp_glycopeps.keys():
                continue
            count_true_n_glycopeps_arcpp.add(glycopep)
            output_line_dicts_n.extend(true_n_glycopeps[strain][glycopep]["line_dicts"])

        for glycopep in o_glycopeps[strain].keys():
            if "True" not in o_glycopeps[strain][glycopep]["frag_ions"]:
                continue
            if len(o_glycopeps[strain][glycopep]["specs"]) < 2:
                continue
            if len(o_glycopeps[strain][glycopep]["reps"]) < 2:
                continue
            count_o_glycopeps.add(glycopep)
            if glycopep not in arcpp_glycopeps.keys():
                continue
            count_o_glycopeps_arcpp.add(glycopep)
            output_line_dicts_o.extend(o_glycopeps[strain][glycopep]["line_dicts"])

        for glycopep in non_standard_n_glycopeps[strain].keys():
            if "True" not in non_standard_n_glycopeps[strain][glycopep]["frag_ions"]:
                continue
            if len(non_standard_n_glycopeps[strain][glycopep]["specs"]) < 2:
                continue
            if len(non_standard_n_glycopeps[strain][glycopep]["reps"]) < 2:
                continue
            count_non_standard_glycopeps.add(glycopep)
            if glycopep not in arcpp_glycopeps.keys():
                continue
            count_non_standard_glycopeps_arcpp.add(glycopep)
            output_line_dicts_non_standard_n.extend(
                non_standard_n_glycopeps[strain][glycopep]["line_dicts"]
            )

    print(
        """
        True N-glyco: {0}
        True N-glyco ArcPP: {1}

        O-glyco: {2}
        O-glyco ArcPP: {3}

        Non-standard N-glyco: {4}
        Non-standard N-glyco ArcPP: {5}
    """.format(
            len(count_true_n_glycopeps),
            len(count_true_n_glycopeps_arcpp),
            len(count_o_glycopeps),
            len(count_o_glycopeps_arcpp),
            len(count_non_standard_glycopeps),
            len(count_non_standard_glycopeps_arcpp),
        )
    )

    csv_kwargs = {}
    if sys.platform == "win32":
        csv_kwargs["lineterminator"] = "\n"
    else:
        csv_kwargs["lineterminator"] = "\r\n"
    csv_out_name = "ArcPP_N_glyco_filtered_peptides_2rep.csv"
    with open(csv_out_name, "w") as csv_out:
        csv_writer = csv.DictWriter(csv_out, fieldnames=fieldnames, **csv_kwargs)
        csv_writer.writeheader()
        for out_dict in output_line_dicts_n:
            csv_writer.writerow(out_dict)

    csv_out_name = "ArcPP_only_O_glyco_filtered_peptides_2rep.csv"
    with open(csv_out_name, "w") as csv_out:
        csv_writer = csv.DictWriter(csv_out, fieldnames=fieldnames, **csv_kwargs)
        csv_writer.writeheader()
        for out_dict in output_line_dicts_o:
            csv_writer.writerow(out_dict)

    csv_out_name = "ArcPP_only_non_canonical_n_glyco_filtered_peptides_2rep.csv"
    with open(csv_out_name, "w") as csv_out:
        csv_writer = csv.DictWriter(csv_out, fieldnames=fieldnames, **csv_kwargs)
        csv_writer.writeheader()
        for out_dict in output_line_dicts_non_standard_n:
            csv_writer.writerow(out_dict)
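The three filter blocks above repeat the same acceptance test. A stand-alone sketch of that test (the entry layout mirrors the per-glycopeptide dicts built above; toy values):

def glycopep_is_confident(entry, min_specs=2, min_reps=2):
    # require a diagnostic glycopeptide fragment ion,
    # at least two PSMs and at least two replicates
    if "True" not in entry["frag_ions"]:
        return False
    if len(entry["specs"]) < min_specs:
        return False
    if len(entry["reps"]) < min_reps:
        return False
    return True

toy_entry = {
    "frag_ions": {"True"},
    "specs": {"spec_1", "spec_2"},
    "reps": {"PXD000001#1", "PXD000001#2"},
}
print(glycopep_is_confident(toy_entry))  # True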
Example #22
 def setUp(self):
     self.upapa_5 = umama.UPeptideMapper(word_len=5)
     self.upapa_5.build_lookup(fasta_name='Test.fasta',
                               fasta_stream=TEST_FASTA)
     self.uc = ursgal.UController(verbose=False)
Example #23
def main():
    """
    Executes a search with OMSSA, XTandem and MS-GF+ on the BSA1.mzML
    input_file

    usage:
        ./simple_example_search.py

    Note:
        Myrimatch does not work with this file.
        To use MSAmanda on unix platforms, please install mono
        (http://www.mono-project.com/download)

    """
    uc = ursgal.UController(
        profile="LTQ XL low res",
        params={
            "database": os.path.join(os.pardir, "example_data", "BSA.fasta"),
            "modifications": [
                "M,opt,any,Oxidation",  # Met oxidation
                "C,fix,any,Carbamidomethyl",  # Carbamidomethylation
                "*,opt,Prot-N-term,Acetyl",  # N-Acteylation
            ],
            # 'peptide_mapper_class_version' : 'UPeptideMapper_v2',
        },
    )

    if sys.maxsize > 2 ** 32:
        xtandem = "xtandem_vengeance"
    else:
        xtandem = "xtandem_sledgehammer"

    engine_list = [
        "omssa",
        xtandem,
        "msgfplus_v2016_09_16",
    ]

    mzML_file = os.path.join(
        os.pardir, "example_data", "BSA_simple_example_search", "BSA1.mzML"
    )
    if os.path.exists(mzML_file) is False:
        uc.params[
            "http_url"
        ] = "http://sourceforge.net/p/open-ms/code/HEAD/tree/OpenMS/share/OpenMS/examples/BSA/BSA1.mzML?format=raw"
        uc.params["http_output_folder"] = os.path.dirname(mzML_file)
        uc.fetch_file(
            engine="get_http_files_1_0_0",
        )
        try:
            shutil.move("{0}?format=raw".format(mzML_file), mzML_file)
        except Exception:
            shutil.move("{0}format=raw".format(mzML_file), mzML_file)

    unified_file_list = []

    for engine in engine_list:
        unified_search_result_file = uc.search(
            input_file=mzML_file, engine=engine, force=False
        )
        unified_file_list.append(unified_search_result_file)

    uc.visualize(
        input_files=unified_file_list,
        engine="venndiagram_1_1_0",
    )
    return
def main(folder=None, database=None, enzyme=None):
    """
    Example workflow to perform an open modification search with three independent search engines
    across all mzML files of a given folder and to statistically post-process and combine the
    results of all searches.

    Usage:
        ./open_modification_search_incl_combined_pep.py <mzML_folder> <database> <enzyme>
    """
    # For this particular dataset, two enzymes were used, namely gluc and trypsin.
    mzml_files = []
    for mzml in glob.glob(os.path.join(folder, "*.mzML")):
        mzml_files.append(mzml)

    mass_spectrometer = "QExactive+"
    validation_engine = "percolator_3_4_0"
    search_engines = ["msfragger_2_3", "pipi_1_4_6", "moda_v1_61"]

    params = {
        "modifications": ["C,fix,any,Carbamidomethyl"],
        "csv_filter_rules": [
            ["Is decoy", "equals", "false"],
            ["PEP", "lte", 0.01],
        ],
        "frag_mass_tolerance_unit":
        "ppm",
        "frag_mass_tolerance":
        20,
        "precursor_mass_tolerance_unit":
        "ppm",
        "precursor_mass_tolerance_plus":
        5,
        "precursor_mass_tolerance_minus":
        5,
        "moda_high_res":
        False,
        "max_mod_size":
        4000,
        "min_mod_size":
        -200,
        "precursor_true_units":
        "ppm",
        "precursor_true_tolerance":
        5,
        "percolator_post_processing":
        "mix-max",
        "psm_defining_colnames": [
            "Spectrum Title",
            "Sequence",
            "Modifications",
            "Charge",
            "Is decoy",
            "Mass Difference",
        ],
        "database":
        database,
        "enzyme":
        enzyme,
    }

    uc = ursgal.UController(
        profile=mass_spectrometer,
        params=params,
    )

    # This will hold input to combined PEP engine
    combined_pep_input = defaultdict(list)

    # This dictionary will help organize which results to merge
    all_merged_results = defaultdict(list)

    for search_engine in search_engines:

        # The modification size for MSFragger is configured through precursor mass tolerance
        if search_engine == "msfragger_2_3":
            uc.params.update({
                "precursor_mass_tolerance_unit": "da",
                "precursor_mass_tolerance_plus": 4000,
                "precursor_mass_tolerance_minus": 200,
            })

        for n, spec_file in enumerate(mzml_files):
            # 1. convert to MGF
            mgf_file = uc.convert(
                input_file=spec_file,
                engine="mzml2mgf_2_0_0",
            )

            # 2. do the actual search
            raw_search_results = uc.search_mgf(
                input_file=mgf_file,
                engine=search_engine,
            )

            # reset precursor mass tolerance just in case it was previously changed
            uc.params.update({
                "precursor_mass_tolerance_unit": "ppm",
                "precursor_mass_tolerance_plus": 5,
                "precursor_mass_tolerance_minus": 5,
            })

            # 3. convert files to csv
            csv_search_results = uc.convert(
                input_file=raw_search_results,
                engine=None,
                guess_engine=True,
            )

            # 4. protein mapping.
            mapped_csv_search_results = uc.execute_misc_engine(
                input_file=csv_search_results,
                engine="upeptide_mapper_1_0_0",
            )

            # 5. Convert csv to unified ursgal csv format:
            unified_search_results = uc.execute_misc_engine(
                input_file=mapped_csv_search_results,
                engine="unify_csv_1_0_0",
                merge_duplicates=False,
            )

            # 6. Validate the results
            validated_csv = uc.validate(
                input_file=unified_search_results,
                engine=validation_engine,
            )

            # save the validated input for combined pep
            # Eventually, each sample will have 3 files corresponding to the 3 search engines
            combined_pep_input["sample_{0}".format(n)].append(validated_csv)

            filtered_validated_results = uc.execute_misc_engine(
                input_file=validated_csv,
                engine="filter_csv_1_0_0",
                merge_duplicates=False,
            )

            all_merged_results["percolator_only"].append(
                filtered_validated_results)

    # combined pep
    uc.params.update({
        "csv_filter_rules": [
            ["Is decoy", "equals", "false"],
            ["combined PEP", "lte", 0.01],
        ],
        "psm_defining_colnames": [
            "Spectrum Title",
            "Sequence",
            "Modifications",
            "Charge",
            "Is decoy",
        ],
    })
    for sample in combined_pep_input.keys():
        combine_results = uc.execute_misc_engine(
            input_file=combined_pep_input[sample],
            engine="combine_pep_1_0_0",
        )

        filtered_validated_results = uc.execute_misc_engine(
            input_file=combine_results,
            engine="filter_csv_1_0_0",
        )
        all_merged_results["combined_pep"].append(filtered_validated_results)

    # separately merge results from the two types of validation techniques
    # We also add back "Mass Difference" to columns defining a PSM to avoid merging mass differences
    uc.params.update({
        "psm_defining_colnames": [
            "Spectrum Title",
            "Sequence",
            "Modifications",
            "Charge",
            "Is decoy",
            "Mass Difference",
        ],
    })

    for validation_type in all_merged_results.keys():
        if validation_type == "percolator_only":
            uc.params["psm_colnames_to_merge_multiple_values"] = {
                "PEP": "min_value",
            }
        else:
            uc.params["psm_colnames_to_merge_multiple_values"] = {
                "combined PEP": "min_value",
                "Bayes PEP": "min_value",
            }

        uc.params["prefix"] = "All_{0}".format(
            validation_type)  # helps recognize files easily

        merged_results_one_rep = uc.execute_misc_engine(
            input_file=all_merged_results[validation_type],
            engine="merge_csvs_1_0_0",
            merge_duplicates=True,
        )
        uc.params["prefix"] = ""
def main(folder=None, enzyme=None, target_decoy_database=None):
    """
    Workflow for the analysis of a dataset with one run per sample.
    Usage:
        python <script_name.py> <folder_with_mzML> <enzyme> <path_to_database>
    """
    # define folder with mzML_files as sys.argv[1]
    mzML_files = []
    for mzml in glob.glob(os.path.join(folder, "*.mzML")):
        mzML_files.append(os.path.basename(mzml))
    offset_files = []
    for sample in offsets.keys():
        for spec_file in offsets[sample].keys():
            offset_files.append(spec_file)
    for mzml in mzML_files:
        if mzml not in offset_files:
            print(
                "mzML file in folder but NOT in offset dict: {}".format(mzml))
            exit()
    for sample in offset_files:
        if sample not in mzML_files:
            print(
                "Sample in offset dict but mzML file NOT in folder: {}".format(
                    sample))
            exit()

    mass_spectrometer = "QExactive+"
    search_engines = [
        "xtandem_vengeance",
        "msfragger_2_3",
        "msgfplus_v2019_07_03",
    ]

    validation_engine = "percolator_3_4_0"

    params = {
        "database":
        target_decoy_database,
        "enzyme":
        enzyme,
        "precursor_mass_tolerance_minus":
        10,
        "precursor_mass_tolerance_plus":
        10,
        "frag_mass_tolerance":
        10,
        "frag_mass_tolerance_unit":
        "ppm",
        "rounded_mass_decimals":
        2,
        "-xmx":
        "32g",
        "peptide_mapper_class_version":
        "UPeptideMapper_v4",
        "use_pyqms_for_mz_calculation":
        True,
        "percolator_post_processing":
        "mix-max",
        "psm_defining_colnames": [
            "Spectrum Title",
            "Sequence",
            "Modifications",
            "Charge",
            "Is decoy",
        ],
        "max_missed_cleavages":
        2,
    }

    # glycans are defined as variable modifications
    # Hex and Hex(1)HexA(1) (=1427) are existing unimod modifications
    Hvo_Glyco = [
        "",
        "N,opt,any,Hex",
        "N,opt,any,1427",
        "N,opt,any,Hex(1)HexA(2),C18H26O17",
        "N,opt,any,Hex(1)HexA(3),C24H34O23",
        "N,opt,any,Hex(1)HexA(2)MeHexA(1)Hex(1),C31H46O28",
        "N,opt,any,Hex(1)HexA(2)MeHexA(1),C25H36O23",
        "N,opt,any,SO3Hex(1),C6H10O8S1",
        "N,opt,any,SO3Hex(1)Hex(1),C12H20O13S1",
        "N,opt,any,SO3Hex(1)Hex(2),C18H30O18S1",
        "N,opt,any,SO3Hex(1)Hex(2)dHex(1),C24H40O22S1",
    ]

    uc = ursgal.UController(profile=mass_spectrometer, params=params)

    combined_pep_result_files = []
    for sample in sorted(offsets.keys(), reverse=True):
        validated_result_files = []
        for search_engine in search_engines:
            engine_results_validated = []
            for n, mod in enumerate(Hvo_Glyco):
                results_one_mod = []
                for spec_file in sorted(offsets[sample].keys()):
                    basename = spec_file
                    dirname = os.path.join(folder)
                    offset = offsets[sample][basename]
                    spec_file_path = os.path.join(dirname, basename)
                    if offset == "skip":
                        continue
                    uc.params["machine_offset_in_ppm"] = offset
                    mgf_file = uc.convert(
                        input_file=spec_file_path,
                        engine="mzml2mgf_2_0_0",
                    )

                    if n == 0:
                        uc.params["modifications"] = [
                            "C,fix,any,Carbamidomethyl",
                            "M,opt,any,Oxidation",
                            "*,fix,N-term,iTRAQ4plex",
                            "K,opt,any,iTRAQ4plex",
                            "Y,opt,any,iTRAQ4plex",
                        ]
                    else:
                        uc.params["modifications"] = [
                            "C,fix,any,Carbamidomethyl",
                            "M,opt,any,Oxidation",
                            "*,fix,N-term,iTRAQ4plex",
                            "K,opt,any,iTRAQ4plex",
                            "Y,opt,any,iTRAQ4plex",
                            "S,opt,any,Hex(2)",
                            "T,opt,any,Hex(2)",
                        ]
                        uc.params["modifications"].append(mod)
                        uc.params["prefix"] = mod.split(",")[3]

                    search_result = uc.search_mgf(
                        input_file=mgf_file,
                        engine=search_engine,
                    )
                    uc.params["prefix"] = ""

                    converted_result = uc.convert(
                        input_file=search_result,
                        guess_engine=True,
                    )

                    mapped_results = uc.execute_misc_engine(
                        input_file=converted_result,
                        engine="upeptide_mapper",
                    )

                    unified_search_results = uc.execute_misc_engine(
                        input_file=mapped_results, engine="unify_csv")

                    results_one_mod.append(unified_search_results)

                uc.params["prefix"] = sample
                merged_1engine_1mod_1sample = uc.execute_misc_engine(
                    input_file=results_one_mod,
                    engine="merge_csvs",
                    # merge_duplicates=True,
                )
                uc.params["prefix"] = ""
                # engine_results_unvalidated.append(merged_1engine_1mod_1sample)

                validated_csv = uc.validate(
                    input_file=merged_1engine_1mod_1sample,
                    engine=validation_engine,
                )
                engine_results_validated.append(validated_csv)

            merged_1engine_all_mods_validated = uc.execute_misc_engine(
                input_file=engine_results_validated,
                engine="merge_csvs",
                merge_duplicates=False,
            )
            validated_result_files.append(merged_1engine_all_mods_validated)

        uc.params["prefix"] = sample
        combined_pep_validated = uc.combine_search_results(
            input_files=validated_result_files,
            engine="combine_pep_1_0_0",
        )
        uc.params["prefix"] = ""
        uc.params["csv_filter_rules"] = [
            # ["Is decoy", "equals", "false"],
            ["combined PEP", "lte", 0.01],
            ["Conflicting uparam", "contains_not", "enzyme"],
        ]
        filtered_validated_results = uc.execute_misc_engine(
            input_file=combined_pep_validated,
            engine="filter_csv",
        )
        combined_pep_result_files.append(filtered_validated_results)

        # uc.params['peptide_forest_initial_engine'] = 'msfragger_2_3'
        # uc.params['peptide_forest_file_params'] = {}
        # uc.params['prefix'] = 'peptide_forest_' + sample
        # peptide_forest_validated =  uc.validate(
        #     input_file=unvalidated_result_files,
        #     engine='peptide_forest',
        # )
        # uc.params['csv_filter_rules'] = [
        #     ['Is decoy', 'equals', 'false'],
        #     ['q-value_RF-reg','lte', 0.01],
        #     ['Conflicting uparam', 'contains_not', 'enzyme'],
        # ]
        # filtered_peptide_forest = uc.execute_misc_engine(
        #     input_file = peptide_forest_validated,
        #     engine='filter_csv',
        # )
        # peptide_forest_result_files.append(filtered_peptide_forest)

    uc.params["prefix"] = ""
    results_all_combined_pep = uc.execute_misc_engine(
        input_file=combined_pep_result_files,
        engine="merge_csvs",
        merge_duplicates=True,
    )

    uc.params.update({
        "validation_score_field": "combined PEP",
        "bigger_scores_better": False,
        "num_compared_psms": 10,
        "accept_conflicting_psms": False,
        "threshold_is_log10": True,
        "score_diff_threshold": 1,
        "psm_defining_colnames": [
            "Spectrum Title",
            "Sequence",
        ],
    })

    sanitized_combined_results = uc.execute_misc_engine(
        input_file=results_all_combined_pep,
        engine="sanitize_csv",
    )

    uc.params["prefix"] = "Glyco_everywhere"
    uc.params["csv_filter_rules"] = [
        ["Modifications", "contains", "Hex"],
        # ['Sequence','contains_glycosite', 'N[^P][ST]']
    ]
    Glyco_filtered = uc.execute_misc_engine(
        input_file=sanitized_combined_results,
        engine="filter_csv",
    )

    uc.params["prefix"] = "Glyco_glycosite"
    uc.params["csv_filter_rules"] = [
        ["Modifications", "contains", "Hex"],
        ["Sequence", "contains_glycosite", "N[^P][ST]"],
    ]
    Glyco_filtered = uc.execute_misc_engine(
        input_file=sanitized_combined_results,
        engine="filter_csv",
    )
    uc.params["prefix"] = ""
Example #26
def analyze(collector):
    '''
    Simple analysis script for the cascade search,
    counting the number of identified peptides (combinations of peptide sequence and modifications)
    and PSMs (which additionally include the spectrum ID)
    '''

    mod_list = ['Oxidation', 'Deamidated', 'Methyl', 'Acetyl', 'Phospho']
    fieldnames = [
        'approach', 'count_type', 'validation_engine', 'unmodified',
        'multimodified'
    ] + mod_list + ['total']

    csv_writer = csv.DictWriter(open('cascade_results.csv', 'w'), fieldnames)
    csv_writer.writeheader()
    uc = ursgal.UController()
    uc.params['validation_score_field'] = 'PEP'
    uc.params['bigger_scores_better'] = False

    # Count the number of identified peptides and PSMs for the different modifications
    # Spectra with multiple PSMs are sanitized, i.e. only the PSM with best PEP score is counted
    # and only if the best hit has a PEP that is at least two orders of
    # magnitude smaller than the others
    for validation_engine, result_file in collector.items():
        counter_dict = {'psm': ddict(set), 'pep': ddict(set)}
        grouped_psms = uc._group_psms(result_file,
                                      validation_score_field='PEP',
                                      bigger_scores_better=False)
        for spec_title, grouped_psm_list in grouped_psms.items():
            best_score, best_line_dict = grouped_psm_list[0]
            if len(grouped_psm_list) > 1:
                second_best_score, second_best_line_dict = grouped_psm_list[1]
                best_peptide_and_mod = best_line_dict[
                    'Sequence'] + best_line_dict['Modifications']
                second_best_peptide_and_mod = second_best_line_dict[
                    'Sequence'] + second_best_line_dict['Modifications']

                if best_peptide_and_mod == second_best_peptide_and_mod:
                    line_dict = best_line_dict
                elif best_line_dict['Sequence'] == second_best_line_dict[
                        'Sequence']:
                    if best_score == second_best_score:
                        line_dict = best_line_dict
                    else:
                        if (-1 * math.log10(best_score)) - (
                                -1 * math.log10(second_best_score)) >= 2:
                            line_dict = best_line_dict
                        else:
                            continue
                else:
                    if (-1 * math.log10(best_score)) - (
                            -1 * math.log10(second_best_score)) >= 2:
                        line_dict = best_line_dict
                    else:
                        continue
            else:
                line_dict = best_line_dict

            count = 0
            for mod in mod_list:
                if mod in line_dict['Modifications']:
                    count += 1
            key_2_add = ''
            if count == 0:
                key_2_add = 'unmodified'
            elif count >= 2:
                key_2_add = 'multimodified'
            elif count == 1:
                for mod in mod_list:
                    if mod in line_dict['Modifications']:
                        key_2_add = mod
                        break
            # for peptide identification comparison
            counter_dict['pep'][key_2_add].add(line_dict['Sequence'] +
                                               line_dict['Modifications'])
            # for PSM comparison
            counter_dict['psm'][key_2_add].add(line_dict['Spectrum Title'] +
                                               line_dict['Sequence'] +
                                               line_dict['Modifications'])
        for counter_key, count_dict in counter_dict.items():
            dict_2_write = {
                'approach': 'cascade',
                'count_type': counter_key,
                'validation_engine': validation_engine
            }
            total_number = 0
            for key, obj_set in count_dict.items():
                dict_2_write[key] = len(obj_set)
                total_number += len(obj_set)
            dict_2_write['total'] = total_number
            csv_writer.writerow(dict_2_write)
    return
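The sanitization above accepts the best PSM only if its PEP is at least two orders of magnitude smaller than the runner-up, i.e. if -log10(best) - (-log10(second best)) >= 2. A worked example:

import math

best_pep = 1e-4
second_best_pep = 1e-2
delta = (-1 * math.log10(best_pep)) - (-1 * math.log10(second_best_pep))
print(delta)       # 2.0
print(delta >= 2)  # True -> the best PSM is kept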
Example #27
# encoding: utf-8
'''

Test the unify_csv function for myrimatch engine

'''
import ursgal
import csv
import pickle
import os

modifications = [
    'C,fix,any,Carbamidomethyl',  # Carbamidomethylation
]

R = ursgal.UController(params={'modifications': modifications})
R.map_mods()

scan_rt_lookup = pickle.load(
    open(os.path.join('tests', 'data', '_test_ursgal_lookup.pkl'), 'rb'))

unify_csv_main = R.unodes['unify_csv_1_0_0'][
    'class'].import_engine_as_python_function()
input_csv = os.path.join('tests', 'data', 'myrimatch_2_1_138',
                         'test_BSA1_myrimatch_2_1_138.csv')
output_csv = os.path.join('tests', 'data', 'myrimatch_2_1_138',
                          'test_BSA1_myrimatch_2_1_138_unified.csv')
unify_csv_main(
    input_file=input_csv,
    output_file=output_csv,
    scan_rt_lookup=scan_rt_lookup,
)
Example #28
def search(validation_engine):
    '''
    Executes a cascade search on four example files from the 
    data from Barth et al.

    usage:
        ./cascade_search_example.py

    Searches for peptides using a cascade search approach similar to Kertesz-Farkas et al.
    for which spectra were first searched for unmodified peptides, followed by consecutive searches
    for the following modifications: 
    oxidation of M,
    deamidation of N/Q,
    methylation of E/K/R,
    N-terminal acetylation,
    phosphorylation of S/T.
    After each step, spectra with a PSM below 1 % PEP were removed. 
    '''
    # Initializing the uPLANIT UController class with
    # our specified modifications and mass spectrometer
    uc = ursgal.UController(
        profile=mass_spectrometer,  # 'LTQ XL low res' profile!
        params=params)

    # complete workflow for every level of the cascade:
    # every spectrum file is searched with every search engine,
    # results are validated separately,
    # validated results are merged and filtered for targets and PEP <= 0.01.

    def workflow(spec_file,
                 prefix=None,
                 validation_engine=None,
                 filter_before_validation=False,
                 force=False):
        validated_results = []

        # Convert mzML to MGF outside the loop, so this step is not repeated in
        # the loop
        mgf_spec_file = uc.convert(input_file=spec_file,
                                   engine='mzml2mgf_1_0_0')
        for search_engine in search_engines:
            uc.params['prefix'] = prefix
            unified_search_results = uc.search(
                input_file=mgf_spec_file,
                engine=search_engine,
                force=force,
            )
            uc.params['prefix'] = ''

            if filter_before_validation:
                uc.params['csv_filter_rules'] = [[
                    'Modifications', 'contains',
                    '{0}'.format(cascade[level][1].split(',')[3])
                ]]
                filtered_search_results = uc.execute_misc_engine(
                    input_file=unified_search_results,
                    engine='filter_csv_1_0_0')
            else:
                filtered_search_results = unified_search_results
            validated_search_results = uc.validate(
                input_file=filtered_search_results,
                engine=validation_engine,
                force=force,
            )
            validated_results.append(validated_search_results)

        validated_results_from_all_engines = uc.execute_misc_engine(
            input_file=sorted(validated_results),
            engine='merge_csvs_1_0_0',
            force=force,
        )
        uc.params['csv_filter_rules'] = [
            ['Is decoy', 'equals', 'false'],
            ['PEP', 'lte', 0.01],
        ]
        filtered_validated_results = uc.execute_misc_engine(
            input_file=validated_results_from_all_engines,
            engine='filter_csv_1_0_0')
        return filtered_validated_results

    result_files = []
    for spec_file in spec_files:
        spectra_with_PSM = set()
        for level in sorted(cascade.keys()):
            uc.params['modifications'] = cascade[level]
            if level == '0':
                results = workflow(spec_file,
                                   validation_engine=validation_engine,
                                   prefix='cascade-lvl-{0}'.format(level))
            else:
                uc.params['scan_exclusion_list'] = list(spectra_with_PSM)
                results = workflow(spec_file,
                                   validation_engine=validation_engine,
                                   filter_before_validation=True,
                                   force=True,
                                   prefix='cascade-lvl-{0}'.format(level))
            result_files.append(results)
            # spectrum IDs for PSMs are written into an exclusion list for
            # the next level of the cascade search; these spectra will be
            # excluded during mzml2mgf conversion
            with open(results) as in_file:
                csv_input = csv.DictReader(in_file)
                for line_dict in csv_input:
                    spectra_with_PSM.add(line_dict['Spectrum ID'])
            print(
                'Number of spectra that will be removed for the next cascade level: {0}'
                .format(len(spectra_with_PSM)))

    results_all_files = uc.execute_misc_engine(
        input_file=sorted(result_files),
        engine='merge_csvs_1_0_0',
    )
    return results_all_files
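The function above references module-level names (mass_spectrometer, params, search_engines, spec_files, cascade) that are defined outside this excerpt. A minimal sketch of what such a setup could look like; every value below is an assumption, and cascade[level][1] must carry the variable modification whose Unimod name feeds the CSV filter rule:

import csv
import glob
import ursgal

mass_spectrometer = 'LTQ XL low res'
search_engines = ['omssa', 'xtandem_vengeance', 'msgfplus_v2016_09_16']
spec_files = glob.glob('*.mzML')  # assumed location of the input files
params = {
    'database': 'my_target_decoy_database.fasta',  # assumed database
}
# one entry per cascade level; index 0 is the fixed modification,
# index 1 (and following) the variable modification(s) of that level
cascade = {
    '0': ['C,fix,any,Carbamidomethyl'],
    '1': ['C,fix,any,Carbamidomethyl', 'M,opt,any,Oxidation'],
    '2': ['C,fix,any,Carbamidomethyl', 'N,opt,any,Deamidated',
          'Q,opt,any,Deamidated'],
    '3': ['C,fix,any,Carbamidomethyl', 'E,opt,any,Methyl',
          'K,opt,any,Methyl', 'R,opt,any,Methyl'],
    '4': ['C,fix,any,Carbamidomethyl', '*,opt,Prot-N-term,Acetyl'],
    '5': ['C,fix,any,Carbamidomethyl', 'S,opt,any,Phospho',
          'T,opt,any,Phospho'],
}

if __name__ == '__main__':
    search(validation_engine='percolator_2_08')  # assumed validation engine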
Example #29
#!/usr/bin/env python3.4
# encoding: utf-8
'''

Test the unify_csv function for omssa engine

'''
import ursgal
import csv
import pickle
import os

R = ursgal.UController(profile='LTQ XL low res',
                       params={
                           'database': os.path.join('tests', 'data',
                                                    'BSA.fasta'),
                       },
                       force=False)

scan_rt_lookup = pickle.load(
    open(os.path.join('tests', 'data', '_test_ursgal_lookup.pkl'), 'rb'))

unify_csv_main = R.unodes['unify_csv_1_0_0'][
    'class'].import_engine_as_python_function()

input_csv = os.path.join('tests', 'data', 'omssa_2_1_9',
                         'test_BSA1_omssa_2_1_9.csv')
output_csv = os.path.join('tests', 'data', 'omssa_2_1_9',
                          'test_BSA1_omssa_2_1_9_unified.csv')

unify_csv_main(
    input_file=input_csv,
    output_file=output_csv,
    scan_rt_lookup=scan_rt_lookup,
)
Example #30
def main():
    '''
    Executes a search with OMSSA, XTandem and MS-GF+ on the BSA1.mzML
    input_file

    usage:
        ./simple_example_search.py

    Note:
        Myrimatch does not work with this file.
        To use MSAmanda on unix platforms, please install mono 
        (http://www.mono-project.com/download)

    '''
    uc = ursgal.UController(
        profile='LTQ XL low res',
        params={
            'database':
            os.path.join(os.pardir, 'example_data', 'BSA.fasta'),
            'modifications': [
                'M,opt,any,Oxidation',  # Met oxidation
                'C,fix,any,Carbamidomethyl',  # Carbamidomethylation
                '*,opt,Prot-N-term,Acetyl',  # N-Acetylation
            ],
            # 'peptide_mapper_class_version' : 'UPeptideMapper_v2',
        })

    if sys.maxsize > 2**32:
        xtandem = 'xtandem_vengeance'
    else:
        xtandem = 'xtandem_sledgehammer'

    engine_list = [
        'omssa',
        xtandem,
        'msgfplus_v2016_09_16',
    ]

    mzML_file = os.path.join(os.pardir, 'example_data',
                             'BSA_simple_example_search', 'BSA1.mzML')
    if os.path.exists(mzML_file) is False:
        uc.params[
            'http_url'] = 'http://sourceforge.net/p/open-ms/code/HEAD/tree/OpenMS/share/OpenMS/examples/BSA/BSA1.mzML?format=raw'
        uc.params['http_output_folder'] = os.path.dirname(mzML_file)
        uc.fetch_file(engine='get_http_files_1_0_0', )
        try:
            shutil.move('{0}?format=raw'.format(mzML_file), mzML_file)
        except FileNotFoundError:
            shutil.move('{0}format=raw'.format(mzML_file), mzML_file)

    unified_file_list = []

    for engine in engine_list:
        unified_search_result_file = uc.search(input_file=mzML_file,
                                               engine=engine,
                                               force=False)
        unified_file_list.append(unified_search_result_file)

    uc.visualize(
        input_files=unified_file_list,
        engine='venndiagram',
    )
    return
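The excerpt starts at main() and relies on module-level imports and an entry point that are not shown. A minimal, assumed frame that would make the script runnable as written:

import os
import shutil
import sys

import ursgal

# ... main() as defined above ...

if __name__ == '__main__':
    main()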