Example #1
    def keep_best_scoring_peptide(self, df):
        '''
        Keep the best-scoring peptide per scan.
        1. Use _syn_DF.
        2. Group by Scan.
        3. For each unique Scan, keep the row(s) with the minimum
           MSGFDB_SpecEValue.
        4. Create consolidate_syn_DF.
        Note: consolidate_syn_DF has a unique row for each ResultID, but
              can contain duplicate Scans when several rows tie on the
              minimum MSGFDB_SpecEValue!

        :return:
        '''
        # Slow
        # df1 = df.groupby(['Scan'])['MSGFDB_SpecEValue'].min()
        # df1 = pd.DataFrame(df1, columns = ['MSGFDB_SpecEValue'])
        # df1['Scan'] = df1.index
        # df1.index = range(df1.shape[0])
        # self.consolidate_syn_DF = pd.merge(df,df1, on = ['Scan','MSGFDB_SpecEValue'])
        # print(">>> consolidate_syn_DF shape{}".format(self.consolidate_syn_DF.shape))

        # or # Fast
        best_per_scan = df.groupby("Scan")["MSGFDB_SpecEValue"].transform('min')
        self.consolidate_syn_DF = df[best_per_scan == df['MSGFDB_SpecEValue']]
        logger.info(
            "---MERGE:1---KBSP :: 'consolidate_syn_DF' shape: {}".format(
                self.consolidate_syn_DF.shape))
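A minimal standalone sketch (toy values, not from the pipeline) of the transform('min') filter above, showing why consolidate_syn_DF can keep duplicate Scans: rows that tie on the minimum MSGFDB_SpecEValue all survive.

    import pandas as pd

    df = pd.DataFrame({
        'ResultID': [1, 2, 3, 4],
        'Scan': [100, 100, 100, 200],
        'MSGFDB_SpecEValue': [1e-20, 1e-20, 1e-5, 1e-8],  # ResultID 1 and 2 tie on Scan 100
    })
    best = df[df.groupby('Scan')['MSGFDB_SpecEValue'].transform('min') == df['MSGFDB_SpecEValue']]
    print(best)  # Scan 100 keeps both ResultID 1 and 2: the duplicate-Scan case noted above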
Example #2
    def consolidate_syn_files(self):
        '''
        1. For all jobs, read in (stack)
           "*msgfplus_syn.txt" files into _syn_DF, adding JobNum & Dataset columns.

        Note: _syn_DF can hold several rows per Scan, each with its own MSGFDB_SpecEValue.

        :return:
        '''
        # DMS_MSGFjobs= 'DMS_MSGFjobs'
        nmdc_MSGFjobs = 'nmdc_jobs/SYNOPSIS/'
        msgf_folder = os.path.join(self.parent_folder, nmdc_MSGFjobs)
        self.group_files(msgf_folder)

        syn_df = self.stack_files(self.syn, self.file_pattern_types["syn"])
        logger.info("---MERGE:0---stack_SYNs :: 'syn_df' shape: {}".format(
            syn_df.shape))
        self.keep_best_scoring_peptide(syn_df)

        # self.improve_FDR(self.consolidate_syn_DF)

        self.get_protein_info()
        self.write_to_disk(syn_df, self.parent_folder, "syn_DF.tsv")
        self.write_to_disk(self.consolidate_syn_DF, self.parent_folder,
                           "consolidate_syn_DF.tsv")
Example #3
    def execute_mzid2tsv(self, raw_basename, folder, logs):
        '''
        3. MzidToTsvConverter | INPUT: .mzid file | OUTPUT: .tsv file
        -mzid:path : path to the .mzid or .mzid.gz file
        -tsv:path  : path to the .tsv file to be written
        -unroll    : signifies that results should be unrolled, giving one line per unique peptide/protein combination in each spectrum identification
        -showDecoy : signifies that decoy results should be included in the output .tsv file;
                     decoy results have protein names that start with XXX_

        :param raw_basename:
        :param folder: msgfplus_output/
        :param logs:
        :return:
        '''
        tsv_file = os.path.join(folder, raw_basename + ".tsv")
        if not os.path.isfile(tsv_file):
            if os.system("mono /app/mzid2tsv/net462/MzidToTsvConverter.exe \
                            -mzid:{} \
                            -tsv:{} \
                            -unroll \
                            -showDecoy | tee -a {}".format(
                    os.path.join(folder, raw_basename + ".mzid"), tsv_file,
                    os.path.join(logs, "3_MzidToTsvConverter.log"))) != 0:
                raise RuntimeError(
                    "MzidToTsvConverter failed for {}".format(raw_basename))
        else:
            logger.info("Already exists :: TSV file @:{}".format(folder))
Example #4
    def save_to_disk(self, data, data_path, msgf_job_list: list):
        '''

        :param data: analysis_jobs object
        :param data_path: User-defined storage followed by
                          data/dpkgs/{}/ or
                          data/set_of_Dataset_IDs/{}/ or
                          data/set_of_Jobs/
        :param msgf_job_list: list of DMS_MSGF+ jobs.
        :return:
        '''
        if not os.path.exists(data_path):
            os.makedirs(data_path)
        data.to_csv(os.path.join(data_path, "start_file.csv"))
        logger.info(
            msg='@{}: start_file.csv/analysis_jobs_obj shape: {} size:{}'.
            format(data_path, data.shape, sys.getsizeof(data)))

        query = Query.JOB_INFO.format(','.join(
            str(job) for job in msgf_job_list))
        result_set = self.db.run_query(query).fetchall()
        df = pd.DataFrame(result_set)
        self.job_info = df
        df.to_csv(os.path.join(data_path, "job_query_info.csv"))
        logger.info(msg='@{}: job_query_info.csv shape: {}'.format(
            data_path, df.shape))
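pd.DataFrame(result_set) on a plain DB-API fetchall produces integer column labels. Assuming run_query returns a DB-API cursor (which the .fetchall() above implies), a small sketch that preserves the column names:

    import pandas as pd

    def cursor_to_df(cursor):
        # DB-API: cursor.description[i][0] holds the i-th column name
        rows = cursor.fetchall()
        cols = [col[0] for col in cursor.description]
        return pd.DataFrame(rows, columns=cols)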
Example #5
    def execute_tsv2syn(self, raw_basename, input, final_out, revCatfasta,
                        logs):
        '''
        4. TsvToSynConverter | INPUT: .tsv file | OUTPUT: _syn.txt file

        -I:InputFilePath : MSGF+ results file (_msgfplus.tsv, _msgfdb.tsv, or .tsv)
        -O:OutputDirectoryPath
        -M:ModificationDefinitionFilePath
        -T:MassCorrectionTagsFilePath
        -N:SearchToolParameterFilePath
        -SynPvalue:
        -SynProb:
        -L:LogFilePath
        -ProteinMods : indicates that the _ProteinMods.txt file should be created
        -F:FastaFilePath

        :param raw_basename:
        :param input:       msgfplus_output/
        :param final_out:   nmdc_jobs/SYNOPSIS/ [SEQUEST Synopsis/First Hits files]
        :param revCatfasta: generated by MSGF+
        :param logs:
        :return:
        '''
        # FIXME: MASIC_PARAM_FILE is passed below as -N (SearchToolParameterFilePath);
        #        verify this should not be MSGFPLUS_PARAM_FILE.
        msgfplus_params = MASIC_PARAM_FILE
        msgfplus_ModDef_params = MSGFPLUS_MODEF_PARAM_FILE
        mc_params = MASS_CORRECTION_PARAM
        synopsis = os.path.join(final_out, "SYNOPSIS")
        if not os.path.exists(synopsis):
            os.makedirs(synopsis)

        output_file = os.path.join(synopsis, raw_basename + "_syn.txt")
        if not os.path.isfile(output_file):
            if os.system("mono /app/phrp/PeptideHitResultsProcRunner.exe \
                            -I:{} \
                            -O:{} \
                            -M:{} \
                            -T:{} \
                            -N:{} \
                            -SynPvalue:0.2 -SynProb:0.05 \
                            -L:{} \
                            -ProteinMods \
                            -F:{} \
                            | tee -a {}".format(
                    os.path.join(input, raw_basename + ".tsv"), synopsis,
                    msgfplus_ModDef_params, mc_params, msgfplus_params,
                    os.path.join(
                        logs, "4_TsvToSynConverter.commandlog"), revCatfasta,
                    os.path.join(logs, "4_TsvToSynConverter.log"))) != 0:
                raise RuntimeError(
                    "TsvToSynConverter failed for {}".format(raw_basename))
        else:
            logger.info("Already exists :: SYN files @:{}".format(synopsis))
Example #6
 def write_to_disk(self, url: str):
     '''
     :param url: Job's file path on DMS.
     :return:
     '''
     # os.system does not raise on command failure; check the exit code instead
     if not os.path.isfile(url.split('/')[-1]):
         if os.system('wget %s' % url) != 0:
             logger.error("FAILED to download {}".format(url))
Example #7
    def get_protein_info(self):
        '''
        1. For all jobs, read in (stack):
            "*ResultToSeqMap.txt"  into ResultToSeqMap_DF with an added JobNum column
            "*SeqToProteinMap.txt" into SeqToProteinMap_DF with an added JobNum column
        2. Left-join:
                consolidate_syn   and
                ResultToSeqMap_DF and
                SeqToProteinMap_DF
            over
             1. JobNum   <--> JobNum
                ResultID <--> ResultID
             2. JobNum          <--> JobNum
                Unique_Seq_ID   <--> Unique_Seq_ID
                Protein         <--> Protein_Name
        3. Create the MSGFjobs_Merged dataframe.

        :return:
        '''
        #FIXME: Confirm with @matt: could the ResultID col in a _syn file be duplicated?

        protein_df = self.stack_files(self.protein,
                                      self.file_pattern_types["protein"])
        del protein_df['Dataset']
        protein_df = protein_df.rename(columns={'Protein_Name': 'Protein'})

        mapper_df = self.stack_files(self.mapper,
                                     self.file_pattern_types["mapper"])
        del mapper_df['Dataset']
        mapper_df = mapper_df.rename(columns={'Result_ID': 'ResultID'})

        # TODO: Change self.consolidate_syn_DF --> self.recomputed_consolidate_syn
        merge1 = pd.merge(self.consolidate_syn_DF,
                          mapper_df,
                          how='left',
                          on=['JobNum', 'ResultID'])
        df_with_holes = pd.merge(merge1,
                                 protein_df,
                                 how='left',
                                 on=['JobNum', 'Unique_Seq_ID', 'Protein'])
        self.MSGFjobs_Merged = df_with_holes
        # FIXME: Bug found! Some ResultIDs match no Unique_Seq_ID after the
        #        left joins; those holes still need to be filled.
        # self.tackle_Unique_Seq_ID_holes_(df_with_holes)

        self.write_to_disk(self.MSGFjobs_Merged, self.parent_folder,
                           "MSGFjobs_Merged.tsv")
        logger.info(
            "---MERGE:2---PPMerge :: 'MSGFjobs_Merged' shape: {}".format(
                self.MSGFjobs_Merged.shape))
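A small sketch for quantifying the "holes" the FIXME above refers to: rows of the consolidated synopsis that found no match in the left joins carry NaN in the joined-in key column.

    import pandas as pd

    def count_merge_holes(merged: pd.DataFrame, key: str = 'Unique_Seq_ID') -> int:
        # unmatched left-join rows have NaN in columns contributed by the right frame
        return int(merged[key].isna().sum())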
Example #8
    def merge_all_jobs_in_UserInput(self):
        '''
        1. Run the merge for each dataset.
        2. Merge all MSGFjobs_MASIC_resultant objects across datasets.
        :return:
        '''
        if not os.path.exists(self.dataset_result_folder):
            datasets = next(os.walk(self.parent_folder))[1]
            for dataset in datasets:
                if dataset != "DMS_fasta_param":
                    logger.info(
                        "|Merging-------------------------Dataset:{}-----------------------"
                        .format(dataset))
                    dataset_loc = self.parent_folder + dataset + '/'
                    # print("dataset_loc >> ", dataset_loc)

                    # enable switcher --PipeLineMode:: NMDC/PNNL
                    # DMS_MSGFjobs  = 'DMS_MSGFjobs'
                    # nmdc_MSGFjobs = 'nmdc_jobs/SYNOPSIS/'
                    # DMS_MASICjob  = 'DMS_MASICjob'
                    # nmdc_MASICjob = 'nmdc_jobs/SIC/'

                    msgf_obj = MSGFplusMerger(dataset_loc)
                    msgf_obj.consolidate_syn_files()

                    masic = MASICmerger(dataset_loc)
                    masic.merge_msgfplus_msaic(msgf_obj.MSGFjobs_Merged)
                    if self.combineDatasets:
                        self.resultants.append(masic.MSGFjobs_MASIC_resultant)

            logger.info(msg="````````")
            logger.info(
                msg="Finished aggregating analysis tools results at loc:{}".
                format(self.dataset_result_folder))
            logger.info(msg="````````")
            if self.combineDatasets:
                # concatenate all datasets
                # print("self.combineDatasets >>", self.combineDatasets)
                self.resultants_df = pd.concat(self.resultants)
                # print("self.dataset_result_folder >> ", self.dataset_result_folder)
                self.write_to_disk(self.resultants_df,
                                   self.dataset_result_folder,
                                   "resultants_df.tsv")

        else:
            logger.info(
                "Already ran the pipeline; merged jobs exist at:{}! Please delete them & rerun the pipeline."
                .format(self.dataset_result_folder))
        return self.dataset_result_folder
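The next(os.walk(...))[1] idiom above lists only the immediate subdirectories of parent_folder; spelled out for clarity:

    import os

    def immediate_subdirs(parent):
        # os.walk yields (dirpath, dirnames, filenames) tuples; the first
        # tuple describes parent itself, so its dirnames are the direct children
        _root, dirnames, _files = next(os.walk(parent))
        return dirnames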
Example #9
 def convert_fasta_to_txt(self, fasta):
     converted_fasta = os.path.splitext(fasta)[0] + ".txt"
     if not os.path.isfile(converted_fasta):
         if os.system(
                 "wine /app/ProteinDigestionSimulator/ProteinDigestionSimulator.exe \
                             -I:{} \
                             -O:{} \
                             -F    \
                             | tee -a {}".format(
                     fasta,
                     os.path.split(fasta)[0],
                     os.path.join(
                         os.path.split(fasta)[0],
                         "ProteinDigestionSimulator.log"))) != 0:
             raise RuntimeError(
                 "ProteinDigestionSimulator failed for {}".format(fasta))
     else:
         logger.info("Already exists :: fasta converted files @:{}".format(
             converted_fasta))
Example #10
 def open_connection(self):
     '''Connect to the DMS MS SQL Server.
     '''
     try:
         if self.conn is None:
             self.conn = pymssql.connect(server=self.SERVER,
                                         user=self.USER,
                                         password=self.PASSWORD,
                                         database=self.DATABASE_NAME)
             logger.info(msg="CONNECTION: {}".format(self.conn))
     except pymssql.Error as conn_err:
         logger.error(msg="SQL Server connection to {} FAILED\n{}".format(
             self.SERVER, conn_err))
         # sys.exit()
     else:
         logger.info(
             "SQL connection to {}:{}:{} opened successfully!".format(
                 self.SERVER, self.DATABASE_NAME, self.USER))
Example #11
    def execute_msgfplus(self, raw_basename, input, output, fasta, logs):
        '''
        2. MSGFPlus | INPUT: .mzML file and .fasta file | OUTPUT: .mzid file
         The JVM runs with a fixed amount of available memory; once it is
         exceeded you will receive "java.lang.OutOfMemoryError".
         The limits can be tuned with JVM parameters: https://www.baeldung.com/jvm-parameters

        [-s SpectrumFile] (*.mzML, *.mzXML, *.mgf, *.ms2, *.pkl or *_dta.txt)
        [-o OutputFile] (*.mzid)
        [-d DatabaseFile] (*.fasta or *.fa or *.faa)
        [-thread NumThreads] (number of concurrent threads; default: number of available cores)
        [-conf ConfigurationFile]
        [-verbose 0/1] (0: report total progress only (default), 1: report total and per-thread progress/status)
        :param raw_basename:
        :param input:  msgfplus_input/
        :param output: msgfplus_output/
        :param fasta:
        :param logs:
        :return:
        '''
        msgfplus_params = MSGFPLUS_PARAM_FILE
        # FIXME: revCat location! Since multiple datasets use the same FASTA,
        # they can't each generate the file in the FASTA folder!
        output_file = os.path.join(output, raw_basename + ".mzid")
        if not os.path.isfile(output_file):
            if os.system("java -Xmx32G -jar /app/msgf/MSGFPlus.jar \
                            -s {} \
                            -o {} \
                            -d {} \
                            -thread 16 \
                            -conf {} \
                            -verbose 1 | tee -a {}".format(
                    os.path.join(input, raw_basename + ".mzML"), output_file,
                    fasta, msgfplus_params, os.path.join(
                        logs, "2_MSGFPlus.log"))) != 0:
                raise RuntimeError(
                    "MSGFPlus failed for {}".format(raw_basename))
        else:
            logger.info("Already exists :: mzid file @:{}".format(output))
Example #12
 def get_files(self):
     '''
     :return:
     '''
     # TODO: check whether .raw files exist in subdirs: if not os.path.exists(self.started_from):
     # FIXME: enabling the stmt above prevents downloading datasets when they aren't on disk!
     os.chdir(self.started_from)
     logger.info(msg="get_files() @:{}".format(os.getcwd()))
     self.download_fasta_param_files()
     self.Input.apply(lambda x: self.use_df(x), axis=1)
     logger.info(msg="````````")
     logger.info(msg="Finished downloading data at loc:{}".format(
         self.started_from))
     logger.info(msg="````````")
Example #13
    def execute_masic(self, raw_file, raw_basename, final_out, logs):
        '''
        MASIC | INPUT: Thermo .Raw file | OUTPUT: _SICStats.txt

         [/I:InputFilePath]
         [/O:OutputDirectoryPath] nmdc_jobs/SIC/
         [/P:ParamFilePath]
         [/SF:StatusFileName]
         [/L:[LogFilePath]]
        :param raw_file:
        :param raw_basename:
        :param final_out:
        :param logs:
        :return:
        '''

        masic_params = MASIC_PARAM_FILE
        sic = os.path.join(final_out, "SIC")
        if not os.path.exists(sic):
            os.makedirs(sic)
        output_file = os.path.join(sic, raw_basename + "_ScanStats.txt")
        if not os.path.isfile(output_file):
            if os.system("mono /app/masic/MASIC_Console.exe \
                            /I:{} \
                            /O:{} \
                            /P:{} \
                            /SF:{} \
                            /L:{} | tee -a {}".format(
                    raw_file, sic, masic_params,
                    os.path.join(logs, "MasicStatus.xml"),
                    os.path.join(logs, "0_masic.commandlog"),
                    os.path.join(logs, "0_masic.log"))) != 0:
                raise RuntimeError(
                    "MASIC failed for {}".format(raw_basename))
        else:
            logger.info("Already exists :: SIC files @:{}".format(sic))
Example #14
    def execute_msconvert(self, raw_file, raw_basename, output, logs):
        '''
        1. MSconvert in pwiz | INPUT: Thermo .raw file | OUTPUT: .mzML file

        -z [ --zlib ] : use zlib compression for binary data
        --filter arg : add a spectrum list filter
                       peakPicking [<PickerType> [snr=<minimum signal-to-noise ratio>] [peakSpace=<minimum peak spacing>] [msLevel=<ms_levels>]]
                                 : this filter performs centroiding on spectra with the selected <ms_levels>, expressed as an int_set. The value for <PickerType> must be "cwt" or "vendor"; when <PickerType> = "vendor", vendor (Windows DLL) code is used if available.
        -o [ --outdir ] arg (=.) : set output directory ('-' for stdout) [.]
        -v [ --verbose ] : display detailed progress information

        --mzML : write mzML format [default]

        :param raw_file:
        :param raw_basename:
        :param output: msgfplus_input/
        :param logs:
        :return:
        '''
        output_file = os.path.join(output, raw_basename + ".mzML")
        if not os.path.isfile(output_file):
            if os.system("wine msconvert \
                    {} \
                    --zlib \
                    --filter 'peakPicking true 2-' \
                    -o {}\
                    --verbose | tee -a {}".format(
                    raw_file, output, os.path.join(logs,
                                                   "1_MSconvert.log"))) != 0:
                raise RuntimeError(
                    "msconvert failed for {}".format(raw_basename))
        else:
            logger.info("Already exists :: mzML file @:{}".format(output))
Example #15
    def process_datasets(self, data_path):
        '''
        Run each dataset through the tool packages,
        making the .raw and .fasta files available to them.
        :param data_path:
        :return:
        '''
        file_map = os.path.join(data_path, 'emsl_to_jgi_stegen01152021.json')

        if os.path.isfile(file_map):
            for path, subdirs, files in os.walk(data_path):
                for file in files:
                    if fnmatch.fnmatch(file, '*.raw'):
                        raw_file = os.path.join(path, file)

                        dataset_id = path.split("/")[-1]
                        dataset_faa = self.get_fasta_loc(file_map, dataset_id)
                        if dataset_faa is None:
                            # no FASTA entry found in the .json!
                            logger.info(
                                "|---------------FASTA is not available for {}:{} in emsl_to_jgi.json file!---------------"
                                .format(STUDY, dataset_id))
                            continue
                        logger.debug("dataset_faa: {}".format(dataset_faa))

                        revCat_faa = dataset_faa.replace("faa", "revCat.fasta")
                        logger.debug("revCat_faa: {}".format(revCat_faa))

                        raw_basename = os.path.basename(
                            os.path.splitext(raw_file)[0])

                        logger.debug("raw_basename: {}".format(raw_basename))
                        logger.debug("raw_file: {}".format(raw_file))
                        logs = os.path.join(path, "logs")
                        if not os.path.exists(logs):
                            os.makedirs(logs)

                        ip = os.path.join(path, "msgfplus_input")
                        out = os.path.join(path, "msgfplus_output")
                        nmdc_out = os.path.join(path, "nmdc_jobs")

                        logger.info(
                            "processing--------------------------------{}:{}--------------------------------"
                            .format(STUDY, dataset_id))
                        try:
                            self.execute_masic(raw_file, raw_basename,
                                               nmdc_out, logs)
                        except Exception as e:
                            logger.error(
                                'MASIC failed for {}-dataset:{} || RAW: {}'.
                                format(STUDY, dataset_id, raw_basename),
                                exc_info=e)

                        try:
                            self.execute_msconvert(raw_file, raw_basename, ip,
                                                   logs)
                            try:
                                self.execute_msgfplus(raw_basename, ip, out,
                                                      dataset_faa, logs)
                                try:
                                    self.execute_mzid2tsv(
                                        raw_basename, out, logs)
                                    try:
                                        self.execute_tsv2syn(
                                            raw_basename, out, nmdc_out,
                                            revCat_faa, logs)
                                        logger.info(
                                            "|--------------------------------{}:{}--------------------------------"
                                            .format(STUDY, dataset_id))
                                    except Exception as e:
                                        logger.error(
                                            'TsV2SyN failed for {}-dataset:{} || RAW: {}'.
                                            format(STUDY, dataset_id, raw_basename),
                                            exc_info=e)
                                except Exception as e:
                                    logger.error(
                                        'MzID2TsV failed for {}-dataset:{} || RAW: {}'.
                                        format(STUDY, dataset_id, raw_basename),
                                        exc_info=e)
                            except Exception as e:
                                logger.error(
                                    'MSGFPLUS failed for {}-dataset:{} || RAW: {}'.
                                    format(STUDY, dataset_id, raw_basename),
                                    exc_info=e)
                        except Exception as e:
                            logger.error(
                                'MSCONVERT failed for {}-dataset:{} || RAW: {}'.
                                format(STUDY, dataset_id, raw_basename),
                                exc_info=e)
                        print("End SYNOPSIS/FIRST-HITS FILE generation")
        else:
            print("Can't run without emsl_to_jgi.json? ")