def use(self, sequence):
        '''Reset this instance and load a new sequence into it.

        Re-using one instance for many sequences avoids paying the class
        instantiation overhead again for every sequence.

        Args:
            sequence (str): See top for possible input formats. A sequence
                containing '#' is parsed as Unimod style, everything else
                as old style.
        '''
        # drop whatever the previous sequence left behind
        self.clear()

        if '#' not in sequence:
            self._parse_sequence_old_style(sequence)
            return

        # Unimod style format; the mapper is only created on first use
        if self._unimod_parser is None:
            self._unimod_parser = ursgal.UnimodMapper()
        self._parse_sequence_unimod_style(sequence)
示例#2
0
    def preflight(self):
        '''
        Formatting the command line and writing the param input file via
        self.params

        The translated parameters found in
        self.params['translations']['_grouped_by_translated_key'] are
        converted into MSFragger param file entries (self.params_to_write),
        the param file is written through self.write_params_file() and the
        java call is stored in self.params['command_list'].

        Side effects visible in this method:
            * self.input_file, self.param_file_name and self.params_to_write
              are set; self.param_file_name is appended to
              self.created_tmp_files.
            * self.mass_shift_lookup and self.mass_glycan_lookup are rebuilt.
              Both map round(mass * 1e5) to the set of chemical formulas /
              unimod names (shift lookup) or glycans (glycan lookup) that
              yielded this mass.
            * With label == '15N' the fixed modifications in
              self.params['mods']['fix'] are modified in place.
            * An optional modification with an unknown position prints a
              message and terminates via sys.exit(1).

        Returns:
                dict: self.params

        Raises:
            Exception: if the input file is neither mzML(.gz) nor MGF.
        '''
        self.input_file = os.path.join(self.params['input_dir_path'],
                                       self.params['input_file'])

        # NOTE(review): self.input_file already contains input_dir_path. If
        # that path is absolute, os.path.join() drops output_dir_path and the
        # param file ends up next to the input file - confirm this is wanted.
        self.param_file_name = os.path.join(
            self.params['output_dir_path'],
            '{0}_msfragger.params'.format(self.input_file))
        self.created_tmp_files.append(self.param_file_name)

        # further prepare and translate params
        # translated params: MSFragger name -> {ursgal name: value}
        grouped = self.params['translations']['_grouped_by_translated_key']

        self.params_to_write = {
            'output_file_extension': 'tsv',  # tsv or pepXML we fix it...
            'output_format': 'tsv',  # pepXML or tsv
            'digest_mass_range': '{0} {1}'.format(
                grouped['precursor_min_mass']['precursor_min_mass'],
                grouped['precursor_max_mass']['precursor_max_mass']),
        }

        # these are consumed elsewhere in this method (or not at all) and
        # must never be copied verbatim into the param file
        write_exclusion_list = [
            'precursor_min_mass', 'precursor_max_mass', 'precursor_min_charge',
            'precursor_max_charge', 'label', '-Xmx', 'header_translations',
            'validation_score_field'
        ]

        # ion types that are passed on for 'fragment_ion_series'
        allowed_ions = [
            'a', 'b', 'c', 'y~', 'x', 'y', 'z', 'b~', 'y-18', 'b-18', 'Y',
        ]

        # param file keys of fixed modifications on termini
        terminal_fix_mod_keys = {
            'Prot-N-term': 'add_Nterm_protein',
            'Prot-C-term': 'add_Cterm_protein',
            'N-term': 'add_Nterm_peptide',
            'C-term': 'add_Cterm_peptide',
        }

        if grouped['label']['label'] == '15N':
            self.print_info(
                'Search with label=15N may still be errorprone. Evaluate with care!',
                caller='WARNING')
            for aminoacid, N15_Diff in ursgal.ukb.DICT_15N_DIFF.items():
                existing = False
                # an existing fixed mod on this amino acid absorbs the 15N
                # mass difference (in place) ...
                for mod_dict in self.params['mods']['fix']:
                    if aminoacid == mod_dict['aa']:
                        mod_dict['mass'] += N15_Diff
                        mod_dict['name'] += '_15N_{0}'.format(aminoacid)
                        existing = True
                if existing:
                    continue
                # ... otherwise the difference is written as its own entry
                mod_key = 'add_{0}_{1}'.format(
                    aminoacid,
                    ursgal.chemical_composition_kb.aa_names[aminoacid])
                self.params_to_write[mod_key] = N15_Diff

        self.mass_shift_lookup = {}
        self.mass_glycan_lookup = {}
        for msfragger_param_name, ursgal_params in grouped.items():
            if msfragger_param_name in write_exclusion_list:
                continue
            for param_value in ursgal_params.values():
                if msfragger_param_name == 'enzyme':
                    # search_enzyme_name = Trypsin
                    # search_enzyme_cutafter = KR
                    # search_enzyme_butnotafter = P
                    aa_site, _term, inhibitor = param_value.split(';')
                    self.params_to_write['search_enzyme_name'] = self.params[
                        'enzyme']
                    self.params_to_write['search_enzyme_cutafter'] = aa_site
                    self.params_to_write[
                        'search_enzyme_butnotafter'] = inhibitor
                elif msfragger_param_name == 'num_enzyme_termini':
                    # num_enzyme_termini = 2 # 2 for enzymatic, 1 for
                    # semi-enzymatic, 0 for nonspecific digestion
                    if grouped['enzyme']['enzyme'] == 'nonspecific':
                        self.params_to_write[msfragger_param_name] = 0
                    else:
                        self.params_to_write[
                            msfragger_param_name] = param_value
                elif msfragger_param_name == 'clear_mz_range':
                    min_mz, max_mz = param_value
                    self.params_to_write[
                        msfragger_param_name] = '{0} {1}'.format(
                            min_mz, max_mz)
                elif msfragger_param_name == 'remove_precursor_range':
                    min_mz, max_mz = param_value
                    self.params_to_write[
                        msfragger_param_name] = '{0},{1}'.format(
                            min_mz, max_mz)
                elif msfragger_param_name == 'delta_mass_exclude_ranges':
                    min_mz, max_mz = param_value
                    self.params_to_write[
                        msfragger_param_name] = '({0},{1})'.format(
                            min_mz, max_mz)
                elif msfragger_param_name == 'precursor_mass_lower':
                    # the lower tolerance is written with inverted sign
                    self.params_to_write[
                        msfragger_param_name] = -1 * param_value
                elif msfragger_param_name == 'modifications':
                    # maximum of 7 mods - amino acid codes, * for any amino
                    # acid, [ and ] specifies protein termini, n and c
                    # specifies peptide termini
                    #   variable_mod_01 = 15.9949 M
                    #   variable_mod_02 = 42.0106 [*
                    #   variable_mod_03 = 79.96633 STY
                    #   variable_mod_03 = -17.0265 nQnC
                    #   variable_mod_04 = -18.0106 nE
                    #
                    # optional mods sharing one mass are merged into one entry
                    mass_to_mod_aa = ddict(list)
                    for mod_dict in self.params['mods']['opt']:
                        # example mod_dict:
                        # {'_id': 0,
                        #  'aa': '*',
                        #  'composition': {'C': 2, 'H': 2, 'O': 1},
                        #  'id': '1',
                        #  'mass': 42.010565,
                        #  'name': 'Acetyl',
                        #  'org': '*,opt,Prot-N-term,Acetyl',
                        #  'pos': 'Prot-N-term',
                        #  'unimod': True},
                        aa_to_append = mod_dict['aa']
                        pos_modifier = None
                        if mod_dict['pos'] == 'Prot-N-term':
                            pos_modifier = '['
                        elif mod_dict['pos'] == 'Prot-C-term':
                            pos_modifier = ']'
                        elif mod_dict['pos'] == 'N-term':
                            pos_modifier = 'n'
                        elif mod_dict['pos'] == 'C-term':
                            pos_modifier = 'c'
                        elif mod_dict['pos'] == 'any':
                            pass
                        else:
                            print('''
                            Unknown positional argument for given modification:
                            {0}
                            MSFragger cannot deal with this, please use one of the follwing:
                            any, Prot-N-term, Prot-C-term, N-term, C-term
                            '''.format(mod_dict['org']))
                            sys.exit(1)
                        if pos_modifier is not None:
                            aa_to_append = '{0}{1}'.format(
                                pos_modifier, aa_to_append)
                        mass_to_mod_aa[mod_dict['mass']].append(aa_to_append)
                    for pos, (mass, aa_list) in enumerate(
                            mass_to_mod_aa.items(), start=1):
                        # two-digit zero padding: a hard coded '0' prefix
                        # produced 'variable_mod_010' for the tenth entry
                        self.params_to_write['variable_mod_{0:02d}'.format(
                            pos)] = '{0} {1}'.format(mass, ''.join(aa_list))
                    for mod_dict in self.params['mods']['fix']:
                        # add_C_cysteine = 57.021464
                        # added to C - avg. 103.1429, mono. 103.00918
                        mod_key = terminal_fix_mod_keys.get(mod_dict['pos'])
                        if mod_key is None:
                            mod_key = 'add_{0}_{1}'.format(
                                mod_dict['aa'],
                                ursgal.chemical_composition_kb.aa_names[
                                    mod_dict['aa']])
                        self.params_to_write[mod_key] = mod_dict['mass']

                elif msfragger_param_name == 'override_charge':
                    self.params_to_write[msfragger_param_name] = param_value
                    if param_value == 1:
                        self.params_to_write[
                            'precursor_charge'] = '{0} {1}'.format(
                                grouped['precursor_min_charge']
                                ['precursor_min_charge'],
                                grouped['precursor_max_charge']
                                ['precursor_max_charge'])
                elif msfragger_param_name == 'fragment_ion_series':
                    ion_list = []
                    for ion in param_value:
                        if ion not in allowed_ions:
                            print('''
                                [ WARNING ] MSFragger does not allow the following ion:
                                {0}
                                This ion will be skipped, i.e. not included in the search.
                            '''.format(ion))
                            continue
                        ion_list.append(ion)
                    self.params_to_write[msfragger_param_name] = ','.join(
                        ion_list)
                elif msfragger_param_name in [
                        'mass_offsets',
                        'Y_type_masses',
                ]:
                    cc = ursgal.ChemicalComposition()
                    umama = ursgal.UnimodMapper()
                    masses = [str(m) for m in param_value['masses']]
                    for m in param_value['glycans']:
                        cc.clear()
                        cc.add_glycan(m)
                        mass = cc._mass()
                        masses.append(str(mass))
                        tm = round(mass * 1e5)
                        self.mass_glycan_lookup.setdefault(tm, set()).add(m)
                    for m in param_value['chemical_formulas']:
                        cc.clear()
                        cc.add_chemical_formula(m)
                        mass = cc._mass()
                        masses.append(str(mass))
                        tm = round(mass * 1e5)
                        self.mass_shift_lookup.setdefault(tm, set()).add(m)
                    for m in param_value['unimods']:
                        unimod_mass = umama.name2mass(m)
                        masses.append(str(unimod_mass))
                        # key on the mass of THIS unimod; the stale `mass` of
                        # a previous loop was used here before
                        tm = round(unimod_mass * 1e5)
                        self.mass_shift_lookup.setdefault(tm, set()).add(m)
                    self.params_to_write[msfragger_param_name] = '/'.join(
                        masses)
                elif msfragger_param_name == 'diagnostic_fragments':
                    cc = ursgal.ChemicalComposition()
                    umama = ursgal.UnimodMapper()
                    masses = list(param_value['masses'])
                    for m in param_value['glycans']:
                        cc.clear()
                        cc.add_glycan(m)
                        masses.append(cc._mass())
                    for m in param_value['chemical_formulas']:
                        cc.clear()
                        cc.add_chemical_formula(m)
                        masses.append(cc._mass())
                    for m in param_value['unimods']:
                        masses.append(umama.name2mass(m))
                    # written as m/z values computed with charge 1
                    self.params_to_write[msfragger_param_name] = '/'.join(
                        str(ursgal.ucore.calculate_mz(mass, 1))
                        for mass in masses)
                else:
                    self.params_to_write[msfragger_param_name] = param_value

        self.write_params_file()

        # mzML, gzipped mzML and MGF files are handed to MSFragger as they are
        if not self.input_file.lower().endswith(
                ('.mzml', '.mzml.gz', '.mgf')):
            raise Exception(
                'MSFragger input spectrum file must be in mzML or MGF format!')
        self.params['translations']['mzml_input_file'] = self.input_file

        self.params['command_list'] = [
            'java',
            '-Xmx{0}'.format(grouped['-Xmx']['-xmx']),
            '-jar', self.exe, self.param_file_name,
            self.params['translations']['mzml_input_file']
        ]

        self.params['translations']['output_file_incl_path'] = os.path.join(
            self.params['output_dir_path'], self.params['output_file'])
        return self.params
示例#3
0
    def preflight(self):
        """
        Formatting the command line and writing the param input file via
        self.params

        The translated parameters found in
        self.params["translations"]["_grouped_by_translated_key"] are
        converted into MSFragger param file entries (self.params_to_write),
        the param file is written through self.write_params_file() and the
        java call is stored in self.params["command_list"].

        Side effects visible in this method:
            * self.input_file, self.param_file_name and self.params_to_write
              are set; self.param_file_name is appended to
              self.created_tmp_files.
            * self.mass_shift_lookup and self.mass_glycan_lookup are rebuilt.
              Both map round(mass * 1e5) to the set of chemical formulas /
              unimod names (shift lookup) or glycans (glycan lookup) that
              yielded this mass.
            * With label == "15N" the fixed modifications in
              self.params["mods"]["fix"] are modified in place.
            * An optional modification with an unknown position prints a
              message and terminates via sys.exit(1).

        Returns:
                dict: self.params

        Raises:
            Exception: if the input file is neither mzML(.gz) nor MGF.
        """
        self.input_file = os.path.join(self.params["input_dir_path"],
                                       self.params["input_file"])

        # NOTE(review): self.input_file already contains input_dir_path. If
        # that path is absolute, os.path.join() drops output_dir_path and the
        # param file ends up next to the input file - confirm this is wanted.
        self.param_file_name = os.path.join(
            self.params["output_dir_path"],
            "{0}_msfragger.params".format(self.input_file),
        )
        self.created_tmp_files.append(self.param_file_name)

        # further prepare and translate params
        # translated params: MSFragger name -> {ursgal name: value}
        grouped = self.params["translations"]["_grouped_by_translated_key"]

        self.params_to_write = {
            "output_file_extension": "tsv",  # tsv or pepXML we fix it...
            "output_format": "tsv",  # pepXML or tsv
            "digest_mass_range": "{0} {1}".format(
                grouped["precursor_min_mass"]["precursor_min_mass"],
                grouped["precursor_max_mass"]["precursor_max_mass"],
            ),
        }

        # these are consumed elsewhere in this method (or not at all) and
        # must never be copied verbatim into the param file
        write_exclusion_list = [
            "precursor_min_mass",
            "precursor_max_mass",
            "precursor_min_charge",
            "precursor_max_charge",
            "label",
            "-Xmx",
            "header_translations",
            "validation_score_field",
        ]

        # ion types that are passed on for "fragment_ion_series"
        allowed_ions = [
            "a",
            "b",
            "c",
            "y~",
            "x",
            "y",
            "z",
            "b~",
            "y-18",
            "b-18",
            "Y",
        ]

        # param file keys of fixed modifications on termini
        terminal_fix_mod_keys = {
            "Prot-N-term": "add_Nterm_protein",
            "Prot-C-term": "add_Cterm_protein",
            "N-term": "add_Nterm_peptide",
            "C-term": "add_Cterm_peptide",
        }

        if grouped["label"]["label"] == "15N":
            self.print_info(
                "Search with label=15N may still be errorprone. Evaluate with care!",
                caller="WARNING",
            )
            for aminoacid, N15_Diff in ursgal.ukb.DICT_15N_DIFF.items():
                existing = False
                # an existing fixed mod on this amino acid absorbs the 15N
                # mass difference (in place) ...
                for mod_dict in self.params["mods"]["fix"]:
                    if aminoacid == mod_dict["aa"]:
                        mod_dict["mass"] += N15_Diff
                        mod_dict["name"] += "_15N_{0}".format(aminoacid)
                        existing = True
                if existing:
                    continue
                # ... otherwise the difference is written as its own entry
                mod_key = "add_{0}_{1}".format(
                    aminoacid,
                    ursgal.chemical_composition_kb.aa_names[aminoacid])
                self.params_to_write[mod_key] = N15_Diff

        self.mass_shift_lookup = {}
        self.mass_glycan_lookup = {}
        for msfragger_param_name, ursgal_params in grouped.items():
            if msfragger_param_name in write_exclusion_list:
                continue
            for param_value in ursgal_params.values():
                if msfragger_param_name == "enzyme":
                    # search_enzyme_name = Trypsin
                    # search_enzyme_cutafter = KR
                    # search_enzyme_butnotafter = P
                    aa_site, _term, inhibitor = param_value.split(";")
                    self.params_to_write["search_enzyme_name"] = self.params[
                        "enzyme"]
                    self.params_to_write["search_enzyme_cutafter"] = aa_site
                    self.params_to_write[
                        "search_enzyme_butnotafter"] = inhibitor
                elif msfragger_param_name == "num_enzyme_termini":
                    # num_enzyme_termini = 2 # 2 for enzymatic, 1 for
                    # semi-enzymatic, 0 for nonspecific digestion
                    if grouped["enzyme"]["enzyme"] == "nonspecific":
                        self.params_to_write[msfragger_param_name] = 0
                    else:
                        self.params_to_write[
                            msfragger_param_name] = param_value
                elif msfragger_param_name == "clear_mz_range":
                    min_mz, max_mz = param_value
                    self.params_to_write[
                        msfragger_param_name] = "{0} {1}".format(
                            min_mz, max_mz)
                elif msfragger_param_name == "remove_precursor_range":
                    min_mz, max_mz = param_value
                    self.params_to_write[
                        msfragger_param_name] = "{0},{1}".format(
                            min_mz, max_mz)
                elif msfragger_param_name == "delta_mass_exclude_ranges":
                    min_mz, max_mz = param_value
                    self.params_to_write[
                        msfragger_param_name] = "({0},{1})".format(
                            min_mz, max_mz)
                elif msfragger_param_name == "precursor_mass_lower":
                    # the lower tolerance is written with inverted sign
                    self.params_to_write[
                        msfragger_param_name] = -1 * param_value
                elif msfragger_param_name == "modifications":
                    # maximum of 7 mods - amino acid codes, * for any amino
                    # acid, [ and ] specifies protein termini, n and c
                    # specifies peptide termini
                    #   variable_mod_01 = 15.9949 M
                    #   variable_mod_02 = 42.0106 [*
                    #   variable_mod_03 = 79.96633 STY
                    #   variable_mod_03 = -17.0265 nQnC
                    #   variable_mod_04 = -18.0106 nE
                    #
                    # optional mods sharing one mass are merged into one entry
                    mass_to_mod_aa = ddict(list)
                    for mod_dict in self.params["mods"]["opt"]:
                        # example mod_dict:
                        # {'_id': 0,
                        #  'aa': '*',
                        #  'composition': {'C': 2, 'H': 2, 'O': 1},
                        #  'id': '1',
                        #  'mass': 42.010565,
                        #  'name': 'Acetyl',
                        #  'org': '*,opt,Prot-N-term,Acetyl',
                        #  'pos': 'Prot-N-term',
                        #  'unimod': True},
                        aa_to_append = mod_dict["aa"]
                        pos_modifier = None
                        if mod_dict["pos"] == "Prot-N-term":
                            pos_modifier = "["
                        elif mod_dict["pos"] == "Prot-C-term":
                            pos_modifier = "]"
                        elif mod_dict["pos"] == "N-term":
                            pos_modifier = "n"
                        elif mod_dict["pos"] == "C-term":
                            pos_modifier = "c"
                        elif mod_dict["pos"] == "any":
                            pass
                        else:
                            print("""
                            Unknown positional argument for given modification:
                            {0}
                            MSFragger cannot deal with this, please use one of the follwing:
                            any, Prot-N-term, Prot-C-term, N-term, C-term
                            """.format(mod_dict["org"]))
                            sys.exit(1)
                        if pos_modifier is not None:
                            aa_to_append = "{0}{1}".format(
                                pos_modifier, aa_to_append)
                        mass_to_mod_aa[mod_dict["mass"]].append(aa_to_append)
                    for pos, (mass, aa_list) in enumerate(
                            mass_to_mod_aa.items(), start=1):
                        # two-digit zero padding: a hard coded "0" prefix
                        # produced "variable_mod_010" for the tenth entry
                        self.params_to_write["variable_mod_{0:02d}".format(
                            pos)] = "{0} {1}".format(mass, "".join(aa_list))
                    for mod_dict in self.params["mods"]["fix"]:
                        # add_C_cysteine = 57.021464
                        # added to C - avg. 103.1429, mono. 103.00918
                        mod_key = terminal_fix_mod_keys.get(mod_dict["pos"])
                        if mod_key is None:
                            mod_key = "add_{0}_{1}".format(
                                mod_dict["aa"],
                                ursgal.chemical_composition_kb.aa_names[
                                    mod_dict["aa"]],
                            )
                        self.params_to_write[mod_key] = mod_dict["mass"]

                elif msfragger_param_name == "override_charge":
                    self.params_to_write[msfragger_param_name] = param_value
                    if param_value == 1:
                        self.params_to_write[
                            "precursor_charge"] = "{0} {1}".format(
                                grouped["precursor_min_charge"]
                                ["precursor_min_charge"],
                                grouped["precursor_max_charge"]
                                ["precursor_max_charge"],
                            )
                elif msfragger_param_name == "fragment_ion_series":
                    ion_list = []
                    for ion in param_value:
                        if ion not in allowed_ions:
                            print("""
                                [ WARNING ] MSFragger does not allow the following ion:
                                {0}
                                This ion will be skipped, i.e. not included in the search.
                            """.format(ion))
                            continue
                        ion_list.append(ion)
                    self.params_to_write[msfragger_param_name] = ",".join(
                        ion_list)
                elif msfragger_param_name in [
                        "mass_offsets",
                        "Y_type_masses",
                ]:
                    cc = ursgal.ChemicalComposition()
                    umama = ursgal.UnimodMapper()
                    masses = [str(m) for m in param_value["masses"]]
                    for m in param_value["glycans"]:
                        cc.clear()
                        cc.add_glycan(m)
                        mass = cc._mass()
                        masses.append(str(mass))
                        tm = round(mass * 1e5)
                        self.mass_glycan_lookup.setdefault(tm, set()).add(m)
                    for m in param_value["chemical_formulas"]:
                        cc.clear()
                        cc.add_chemical_formula(m)
                        mass = cc._mass()
                        masses.append(str(mass))
                        tm = round(mass * 1e5)
                        self.mass_shift_lookup.setdefault(tm, set()).add(m)
                    for m in param_value["unimods"]:
                        unimod_mass = umama.name2mass(m)
                        masses.append(str(unimod_mass))
                        # key on the mass of THIS unimod; the stale `mass` of
                        # a previous loop was used here before
                        tm = round(unimod_mass * 1e5)
                        self.mass_shift_lookup.setdefault(tm, set()).add(m)
                    self.params_to_write[msfragger_param_name] = "/".join(
                        masses)
                elif msfragger_param_name == "diagnostic_fragments":
                    cc = ursgal.ChemicalComposition()
                    umama = ursgal.UnimodMapper()
                    masses = list(param_value["masses"])
                    for m in param_value["glycans"]:
                        cc.clear()
                        cc.add_glycan(m)
                        masses.append(cc._mass())
                    for m in param_value["chemical_formulas"]:
                        cc.clear()
                        cc.add_chemical_formula(m)
                        masses.append(cc._mass())
                    for m in param_value["unimods"]:
                        masses.append(umama.name2mass(m))
                    # written as m/z values computed with charge 1
                    self.params_to_write[msfragger_param_name] = "/".join(
                        str(ursgal.ucore.calculate_mz(mass, 1))
                        for mass in masses)
                else:
                    self.params_to_write[msfragger_param_name] = param_value

        self.write_params_file()

        # mzML, gzipped mzML and MGF files are handed to MSFragger as they are
        if not self.input_file.lower().endswith(
                (".mzml", ".mzml.gz", ".mgf")):
            raise Exception(
                "MSFragger input spectrum file must be in mzML or MGF format!")
        self.params["translations"]["mzml_input_file"] = self.input_file

        self.params["command_list"] = [
            "java",
            "-Xmx{0}".format(grouped["-Xmx"]["-xmx"]),
            "-jar",
            self.exe,
            self.param_file_name,
            self.params["translations"]["mzml_input_file"],
        ]

        self.params["translations"]["output_file_incl_path"] = os.path.join(
            self.params["output_dir_path"], self.params["output_file"])
        return self.params
示例#4
0
    def write_input_tsv(self, input_csv, tmp_dir):
        '''
        Convert an ursgal csv into a PTM-Shepherd input tsv
        (same format as Philosopher output psms tsv).

        The rows are read from self.params['translations']['csv_input_file'];
        input_csv is only used to name the tsv that is written to tmp_dir.

        Args:
            input_csv (str): path of the ursgal csv; its basename, with
                '.csv' replaced by '.tsv', becomes the output file name.
            tmp_dir (str): directory the tsv is written to.

        Returns:
            str: path of the written tsv file.
        '''
        # local import: only needed when a row carries no retention time
        import pickle

        print('[ PREFLGHT ] writing PTM-Shepherd config file ...')
        ptmshep_input = os.path.join(
            tmp_dir,
            os.path.basename(input_csv).replace('.csv', '.tsv'))
        # mod pattern: '<name>:<position>' with the position at the very end
        mod_pattern = re.compile(r'''(?P<name>.*):(?P<pos>[0-9]*$)''')

        # columns of the PTM-Shepherd input file
        fieldnames_list = [
            'Spectrum',
            'Peptide',
            'Modified Peptide',
            'Peptide Length',
            'Charge',
            'Retention',
            'Calculated Peptide Mass',
            'Calculated M/Z',
            'Delta Mass',
            'Assigned Modifications',
            'Is Unique',
        ]
        umama = ursgal.UnimodMapper()
        scan_rt_lookup_path = os.path.join(
            os.path.dirname(
                self.params['translations']['mzml_input_files'][0]),
            '_ursgal_lookup.pkl')
        # The lookup is loaded lazily, i.e. only when the first row without
        # a retention time is encountered, so the pickle does not have to
        # exist when every row already provides 'Retention Time (s)'.
        scan_rt_lookup_dict = None

        with open(ptmshep_input, 'w', newline='') as new_csvfile, \
            open(self.params['translations']['csv_input_file'], 'r') as csv_file:
            writer = csv.DictWriter(new_csvfile,
                                    fieldnames=fieldnames_list,
                                    delimiter='\t')
            writer.writeheader()
            csv_reader = csv.DictReader(csv_file)
            for n, row in enumerate(csv_reader):
                if n % 500 == 0:
                    print('[ PREFLGHT ] Processing line number: {0}'.format(n),
                          end='\r')

                # sum up all mass differences reported for this row
                mass_diffs_sum = 0.0
                for mass in row['Mass Difference'].split(';'):
                    if mass == '':
                        continue
                    if 'moda' in row['Search Engine'] and mass.startswith('+'):
                        # for these entries the difference is recomputed
                        # from the experimental and the calculated mass
                        exp_mass = ursgal.ucore.calculate_mass(
                            float(row['Exp m/z']), int(row['Charge']))
                        mass = '{}'.format(exp_mass - float(row['uCalc Mass']))
                    # entries may look like '<mass>:<suffix>', only the part
                    # in front of the first ':' is used
                    mass_diffs_sum += float(mass.split(':')[0])

                if '<|>' in row['Protein ID']:
                    is_unique = 'false'
                else:
                    is_unique = 'true'

                retention_time = row.get('Retention Time (s)', '')
                if retention_time == '':
                    if scan_rt_lookup_dict is None:
                        # NOTE(review): pickle.load can execute code stored
                        # in the file; presumably the lookup is written by
                        # ursgal itself - confirm it is never user supplied.
                        with open(scan_rt_lookup_path, 'rb') as scan_rt_in:
                            scan_rt_lookup_dict = pickle.load(scan_rt_in)
                    spectrum_id = int(row['Spectrum ID'])
                    raw_file_name = os.path.basename(row['Raw data location'])
                    input_file_basename_for_rt_lookup = raw_file_name.replace(
                        '.mgf', '')
                    retention_time_in_minutes = \
                        scan_rt_lookup_dict[input_file_basename_for_rt_lookup]['scan_2_rt']\
                            [spectrum_id]
                    retention_time = retention_time_in_minutes * 60
                assert retention_time != '', '''
                [ERROR] Retention Time needs to be given for each row.
                '''

                sequence = row['Sequence']
                new_mod_list = []
                for mod in row['Modifications'].split(';'):
                    if mod == '':
                        continue
                    match = mod_pattern.search(mod)
                    pos = int(match.group('pos'))
                    # Position 0 is reported on the first residue; positions
                    # beyond the sequence end are reported on the last one
                    # instead of raising an IndexError.
                    # NOTE(review): presumably these are terminal
                    # modifications - confirm PTM-Shepherd accepts this form.
                    residue_index = min(max(pos, 1), len(sequence)) - 1
                    aa = sequence[residue_index]
                    mod_mass = umama.name2mass(match.group('name'))
                    new_mod_list.append(
                        '{0}{1}({2})'.format(pos, aa, mod_mass))

                tmp_dict = {
                    'Spectrum': row['Spectrum Title'],
                    'Peptide': sequence,
                    'Modified Peptide': sequence,
                    'Peptide Length': len(sequence),
                    'Charge': row['Charge'],
                    'Retention': retention_time,
                    'Calculated Peptide Mass': row['uCalc Mass'],
                    'Calculated M/Z': row['uCalc m/z'],
                    'Delta Mass': mass_diffs_sum,
                    'Assigned Modifications': ', '.join(new_mod_list),
                    'Is Unique': is_unique,
                }
                writer.writerow(tmp_dict)
        print('[ PREFLGHT ] Processing done')
        return ptmshep_input
示例#5
0
def main(input_file=None,
         output_file=None,
         score_column_name=None,
         score_type=None):
    '''
    Convert an ursgal result csv into a tab separated ssl file.

    Every csv row becomes one ssl row; modifications are written into the
    sequence as '[+mass]' / '[mass]' directly behind the modified residue.

    Args:
        input_file (str): path of the csv file to read.
        output_file (str): path of the ssl file to write.
        score_column_name (str): csv column that is copied into 'score'.
        score_type (str): value written into the 'score-type' column.

    Returns:
        str: output_file
    '''

    umama = ursgal.UnimodMapper()

    # The file is opened in text mode, where Python translates '\n' to
    # '\r\n' on Windows; presumably the terminator is chosen so that rows
    # end with '\r\n' on every platform.
    csv_kwargs = {}
    if sys.platform == 'win32':
        csv_kwargs['lineterminator'] = '\n'
    else:
        csv_kwargs['lineterminator'] = '\r\n'

    new_fieldnames = [
        'file', 'scan', 'charge', 'sequence', 'score-type', 'score',
        'retention-time'
    ]
    # compiled once: ':<position>' at the very end of a modification entry
    pattern = re.compile(r''':(?P<pos>[0-9]*$)''')

    # both files are closed by the context manager, also when the
    # conversion is aborted via sys.exit or an exception
    with open(output_file, 'w') as output_file_object, \
            open(input_file, 'r') as in_file:
        csv_input = csv.DictReader(in_file)
        csv_output = csv.DictWriter(output_file_object,
                                    new_fieldnames,
                                    delimiter='\t',
                                    **csv_kwargs)
        csv_output.writeheader()

        for csv_line_dict in csv_input:
            # seconds -> minutes
            rt = round(float(csv_line_dict['Retention Time (s)']) / 60, 10)
            sequence = csv_line_dict['Sequence']

            # summed modification mass per (1-based) residue position
            pos2mass = {}
            for mod in csv_line_dict['Modifications'].split(';'):
                if mod == '':
                    continue
                if ':' not in mod:
                    sys.exit(
                        'This unimod: {0} requires positional information'.
                        format(mod))
                # the pattern is anchored to the end of the string, so it
                # can match at most once
                match = pattern.search(mod)
                if match is None:
                    # NOTE(review): entries whose text after the last ':'
                    # is not numeric are skipped silently, as before.
                    continue
                try:
                    unimod_mass = umama.name2mass(mod[:match.start()])
                except Exception:
                    sys.exit(
                        'Can not map unimod {0}. extracted position argument: {1}'
                        .format(mod, match.start()))
                # positions outside of the sequence are moved onto the
                # first / last residue
                position = int(match.group('pos'))
                if position == 0:
                    position = 1
                elif position > len(sequence):
                    position = len(sequence)
                pos2mass[position] = pos2mass.get(position, 0) + unimod_mass

            seq_parts = []
            for p, aa in enumerate(sequence, start=1):
                seq_parts.append(aa)
                if p in pos2mass:
                    mass = pos2mass[p]
                    # non-negative masses get an explicit '+' sign
                    if mass >= 0:
                        seq_parts.append('[+{0}]'.format(mass))
                    else:
                        seq_parts.append('[{0}]'.format(mass))

            ssl_line_dict = {
                'file': csv_line_dict['Raw data location'],
                'scan': csv_line_dict['Spectrum ID'],
                'charge': csv_line_dict['Charge'],
                'sequence': ''.join(seq_parts),
                'score-type': score_type,
                'score': csv_line_dict[score_column_name],
                'retention-time': rt,
            }
            csv_output.writerow(ssl_line_dict)

    return output_file
示例#6
0
def main(input_file=None,
         output_file=None,
         score_column_name=None,
         score_type=None):
    """
    Convert an ursgal result csv into a tab separated ssl file.

    Every csv row becomes one ssl row; modifications are written into the
    sequence as "[+mass]" / "[mass]" directly behind the modified residue.

    Args:
        input_file (str): path of the csv file to read.
        output_file (str): path of the ssl file to write.
        score_column_name (str): csv column that is copied into "score".
        score_type (str): value written into the "score-type" column.

    Returns:
        str: output_file
    """

    umama = ursgal.UnimodMapper()

    # The file is opened in text mode, where Python translates "\n" to
    # "\r\n" on Windows; presumably the terminator is chosen so that rows
    # end with "\r\n" on every platform.
    csv_kwargs = {}
    if sys.platform == "win32":
        csv_kwargs["lineterminator"] = "\n"
    else:
        csv_kwargs["lineterminator"] = "\r\n"

    new_fieldnames = [
        "file",
        "scan",
        "charge",
        "sequence",
        "score-type",
        "score",
        "retention-time",
    ]
    # compiled once: ":<position>" at the very end of a modification entry
    pattern = re.compile(r""":(?P<pos>[0-9]*$)""")

    # both files are closed by the context manager, also when the
    # conversion is aborted via sys.exit or an exception
    with open(output_file, "w") as output_file_object, \
            open(input_file, "r") as in_file:
        csv_input = csv.DictReader(in_file)
        csv_output = csv.DictWriter(output_file_object,
                                    new_fieldnames,
                                    delimiter="\t",
                                    **csv_kwargs)
        csv_output.writeheader()

        for csv_line_dict in csv_input:
            # seconds -> minutes
            rt = round(float(csv_line_dict["Retention Time (s)"]) / 60, 10)
            sequence = csv_line_dict["Sequence"]

            # summed modification mass per (1-based) residue position
            pos2mass = {}
            for mod in csv_line_dict["Modifications"].split(";"):
                if mod == "":
                    continue
                if ":" not in mod:
                    sys.exit(
                        "This unimod: {0} requires positional information".
                        format(mod))
                # the pattern is anchored to the end of the string, so it
                # can match at most once
                match = pattern.search(mod)
                if match is None:
                    # NOTE(review): entries whose text after the last ":"
                    # is not numeric are skipped silently, as before.
                    continue
                try:
                    unimod_mass = umama.name2mass(mod[:match.start()])
                except Exception:
                    sys.exit(
                        "Can not map unimod {0}. extracted position argument: {1}"
                        .format(mod, match.start()))
                # positions outside of the sequence are moved onto the
                # first / last residue
                position = int(match.group("pos"))
                if position == 0:
                    position = 1
                elif position > len(sequence):
                    position = len(sequence)
                pos2mass[position] = pos2mass.get(position, 0) + unimod_mass

            seq_parts = []
            for p, aa in enumerate(sequence, start=1):
                seq_parts.append(aa)
                if p in pos2mass:
                    mass = pos2mass[p]
                    # non-negative masses get an explicit "+" sign
                    if mass >= 0:
                        seq_parts.append("[+{0}]".format(mass))
                    else:
                        seq_parts.append("[{0}]".format(mass))

            ssl_line_dict = {
                "file": csv_line_dict["Raw data location"],
                "scan": csv_line_dict["Spectrum ID"],
                "charge": csv_line_dict["Charge"],
                "sequence": "".join(seq_parts),
                "score-type": score_type,
                "score": csv_line_dict[score_column_name],
                "retention-time": rt,
            }
            csv_output.writerow(ssl_line_dict)

    return output_file
示例#7
0
    def write_input_tsv(self, input_csv, tmp_dir):
        """
        Convert an ursgal csv into a PTM-Shepherd input tsv
        (same format as Philosopher output psms tsv).

        The rows are read from self.params["translations"]["csv_input_file"];
        input_csv is only used to name the tsv that is written to tmp_dir.

        Args:
            input_csv (str): path of the ursgal csv; its basename, with
                ".csv" replaced by ".tsv", becomes the output file name.
            tmp_dir (str): directory the tsv is written to.

        Returns:
            str: path of the written tsv file.
        """
        # local import: only needed when a row carries no retention time
        import pickle

        print("[ PREFLGHT ] writing PTM-Shepherd config file ...")
        ptmshep_input = os.path.join(
            tmp_dir,
            os.path.basename(input_csv).replace(".csv", ".tsv"))
        # mod pattern: "<name>:<position>" with the position at the very end
        mod_pattern = re.compile(r"""(?P<name>.*):(?P<pos>[0-9]*$)""")

        # columns of the PTM-Shepherd input file
        fieldnames_list = [
            "Spectrum",
            "Peptide",
            "Modified Peptide",
            "Peptide Length",
            "Charge",
            "Retention",
            "Calculated Peptide Mass",
            "Calculated M/Z",
            "Delta Mass",
            "Assigned Modifications",
            "Is Unique",
        ]
        umama = ursgal.UnimodMapper()
        scan_rt_lookup_path = os.path.join(
            os.path.dirname(
                self.params["translations"]["mzml_input_files"][0]),
            "_ursgal_lookup.pkl",
        )
        # The lookup is loaded lazily, i.e. only when the first row without
        # a retention time is encountered, so the pickle does not have to
        # exist when every row already provides "Retention Time (s)".
        scan_rt_lookup_dict = None

        with open(ptmshep_input, "w", newline="") as new_csvfile, open(
                self.params["translations"]["csv_input_file"],
                "r") as csv_file:
            writer = csv.DictWriter(new_csvfile,
                                    fieldnames=fieldnames_list,
                                    delimiter="\t")
            writer.writeheader()
            csv_reader = csv.DictReader(csv_file)
            for n, row in enumerate(csv_reader):
                if n % 500 == 0:
                    print("[ PREFLGHT ] Processing line number: {0}".format(n),
                          end="\r")

                # sum up all mass differences reported for this row
                mass_diffs_sum = 0.0
                for mass in row["Mass Difference"].split(";"):
                    if mass == "":
                        continue
                    if "moda" in row["Search Engine"] and mass.startswith("+"):
                        # for these entries the difference is recomputed
                        # from the experimental and the calculated mass
                        exp_mass = ursgal.ucore.calculate_mass(
                            float(row["Exp m/z"]), int(row["Charge"]))
                        mass = "{}".format(exp_mass - float(row["uCalc Mass"]))
                    # entries may look like "<mass>:<suffix>", only the part
                    # in front of the first ":" is used
                    mass_diffs_sum += float(mass.split(":")[0])

                if "<|>" in row["Protein ID"]:
                    is_unique = "false"
                else:
                    is_unique = "true"

                retention_time = row.get("Retention Time (s)", "")
                if retention_time == "":
                    if scan_rt_lookup_dict is None:
                        # NOTE(review): pickle.load can execute code stored
                        # in the file; presumably the lookup is written by
                        # ursgal itself - confirm it is never user supplied.
                        with open(scan_rt_lookup_path, "rb") as scan_rt_in:
                            scan_rt_lookup_dict = pickle.load(scan_rt_in)
                    spectrum_id = int(row["Spectrum ID"])
                    raw_file_name = os.path.basename(row["Raw data location"])
                    input_file_basename_for_rt_lookup = raw_file_name.replace(
                        ".mgf", "")
                    retention_time_in_minutes = scan_rt_lookup_dict[
                        input_file_basename_for_rt_lookup]["scan_2_rt"][
                            spectrum_id]
                    retention_time = retention_time_in_minutes * 60
                assert (retention_time != ""), """
                [ERROR] Retention Time needs to be given for each row.
                """

                sequence = row["Sequence"]
                new_mod_list = []
                for mod in row["Modifications"].split(";"):
                    if mod == "":
                        continue
                    match = mod_pattern.search(mod)
                    pos = int(match.group("pos"))
                    # Position 0 is reported on the first residue; positions
                    # beyond the sequence end are reported on the last one
                    # instead of raising an IndexError.
                    # NOTE(review): presumably these are terminal
                    # modifications - confirm PTM-Shepherd accepts this form.
                    residue_index = min(max(pos, 1), len(sequence)) - 1
                    aa = sequence[residue_index]
                    mod_mass = umama.name2mass(match.group("name"))
                    new_mod_list.append(
                        "{0}{1}({2})".format(pos, aa, mod_mass))

                tmp_dict = {
                    "Spectrum": row["Spectrum Title"],
                    "Peptide": sequence,
                    "Modified Peptide": sequence,
                    "Peptide Length": len(sequence),
                    "Charge": row["Charge"],
                    "Retention": retention_time,
                    "Calculated Peptide Mass": row["uCalc Mass"],
                    "Calculated M/Z": row["uCalc m/z"],
                    "Delta Mass": mass_diffs_sum,
                    "Assigned Modifications": ", ".join(new_mod_list),
                    "Is Unique": is_unique,
                }
                writer.writerow(tmp_dict)
        print("[ PREFLGHT ] Processing done")
        return ptmshep_input