def keep_best_scoring_peptide(self, df):
    '''Keep the best-scoring peptide per scan.

    1. Use _syn_DF.
    2. Group by Scan.
    3. For each unique Scan, keep the row(s) with the minimum MSGFDB_SpecEValue.
    4. Create consolidate_syn_DF.
    Note: consolidate_syn_DF has a unique row for each ResultID, but can contain
          duplicate Scans when several rows tie for the minimum MSGFDB_SpecEValue!
    :return:
    '''
    # Slow alternative:
    # df1 = df.groupby(['Scan'])['MSGFDB_SpecEValue'].min()
    # df1 = pd.DataFrame(df1, columns=['MSGFDB_SpecEValue'])
    # df1['Scan'] = df1.index
    # df1.index = range(df1.shape[0])
    # self.consolidate_syn_DF = pd.merge(df, df1, on=['Scan', 'MSGFDB_SpecEValue'])
    # print(">>> consolidate_syn_DF shape{}".format(self.consolidate_syn_DF.shape))

    # Fast: keep the rows whose MSGFDB_SpecEValue equals the per-Scan minimum.
    self.consolidate_syn_DF = df[df.groupby(
        "Scan")["MSGFDB_SpecEValue"].transform('min') == df['MSGFDB_SpecEValue']]
    logger.info(
        "---MERGE:1---KBSP :: 'consolidate_syn_DF' shape: {}".format(
            self.consolidate_syn_DF.shape))
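
# Illustrative sketch (not part of the pipeline): how the groupby/transform('min')
# filter above behaves on a toy frame. The _demo_* name is hypothetical.
def _demo_keep_best_scoring_peptide():
    import pandas as pd
    df = pd.DataFrame({
        "Scan": [1, 1, 2, 2, 2],
        "MSGFDB_SpecEValue": [1e-10, 1e-8, 5e-9, 5e-9, 1e-7],
    })
    # Keep the row(s) whose MSGFDB_SpecEValue equals the per-Scan minimum.
    best = df[df.groupby("Scan")["MSGFDB_SpecEValue"].transform("min")
              == df["MSGFDB_SpecEValue"]]
    # Scan 2 keeps two rows because its minimum value is tied, which is exactly
    # why consolidate_syn_DF can contain duplicate Scans, as noted above.
    print(best)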
def consolidate_syn_files(self):
    '''
    1. For all jobs, read in (stack) "*msgfplus_syn.txt" into _syn_DF with
       added JobNum & Dataset columns.
    Note: _syn_DF has duplicate rows per Scan, each with its own MSGFDB_SpecEValue.
    :return:
    '''
    # DMS_MSGFjobs = 'DMS_MSGFjobs'
    nmdc_MSGFjobs = 'nmdc_jobs/SYNOPSIS/'
    msgf_folder = os.path.join(self.parent_folder, nmdc_MSGFjobs)
    self.group_files(msgf_folder)
    # print(self.syn)
    # print(self.protein)
    # print(self.mapper)
    syn_df = self.stack_files(self.syn, self.file_pattern_types["syn"])
    logger.info("---MERGE:0---stack_SYNs :: 'syn_df' shape: {}".format(
        syn_df.shape))

    self.keep_best_scoring_peptide(syn_df)
    # self.improve_FDR(self.consolidate_syn_DF)
    self.get_protein_info()

    self.write_to_disk(syn_df, self.parent_folder, "syn_DF.tsv")
    self.write_to_disk(self.consolidate_syn_DF, self.parent_folder,
                       "consolidate_syn_DF.tsv")
def execute_mzid2tsv(self, raw_basename, folder, logs):
    '''
    3. MzidToTsvConverter | INPUT: (.mzid file) | OUTPUT: (.tsv file)
       -mzid:path : Path to the .mzid or .mzid.gz file
       -tsv:path  : Path to the .tsv file to be written
       -unroll    : Signifies that results should be unrolled, giving one line
                    per unique peptide/protein combination in each spectrum
                    identification
       -showDecoy : Signifies that decoy results should be included in the
                    output .tsv file. Decoy results have protein names that
                    start with XXX_
    :param raw_basename:
    :param folder: msgfplus_output/
    :param logs:
    :return:
    '''
    tsv_file = os.path.join(folder, raw_basename + ".tsv")
    if not os.path.isfile(tsv_file):
        if os.system("mono /app/mzid2tsv/net462/MzidToTsvConverter.exe \
                     -mzid:{} \
                     -tsv:{} \
                     -unroll \
                     -showDecoy | tee -a {}".format(
                os.path.join(folder, raw_basename + ".mzid"),
                tsv_file,
                os.path.join(logs, "3_MzidToTsvConverter.log"))) != 0:
            raise RuntimeError("MzidToTsvConverter failed")
        # print("Finished running MzidToTsvConverter")
        # except Exception as mzid2tsv_failed:
        #     raise  # mzid2tsv_failed()
    else:
        logger.info("Already exists :: TSV file @:{}".format(folder))
def save_to_disk(self, data, data_path, msgf_job_list: list):
    '''
    :param data: analysis_jobs object
    :param data_path: User-defined storage followed by data/dpkgs/{}/ or
                      data/set_of_Dataset_IDs/{}/ or data/set_of_Jobs/
    :param msgf_job_list: list of DMS_MSGF+ jobs.
    :return:
    '''
    if not os.path.exists(data_path):
        os.makedirs(data_path)
    data.to_csv(data_path + "start_file.csv")
    logger.info(
        msg='@{}: start_file.csv/analysis_jobs_obj shape: {} size:{}'.format(
            data_path, data.shape, sys.getsizeof(data)))

    query = Query.JOB_INFO.format(','.join(
        str(job) for job in msgf_job_list))
    result_set = self.db.run_query(query).fetchall()
    df = pd.DataFrame(result_set)
    self.job_info = df
    df.to_csv(data_path + "job_query_info.csv")
    logger.info(msg='@{}: job_query_info.csv shape: {}'.format(
        data_path, df.shape))
def execute_tsv2syn(self, raw_basename, input, final_out, revCatfasta, logs):
    '''
    4. TsvToSynConverter | INPUT: (.tsv file) | OUTPUT: (_syn.txt file)
       -I:InputFilePath  : MSGF+ results file (_msgfplus.tsv or _msgfdb.tsv or .tsv)
       -O:OutputDirectoryPath
       -M:ModificationDefinitionFilePath
       -T:MassCorrectionTagsFilePath
       -N:SearchToolParameterFilePath
       -SynPvalue:
       -SynProb:
       -L:LogFilePath
       -ProteinMods : indicates that the _ProteinMods.txt file should be created
       -F:FastaFilePath
    :param raw_basename:
    :param input: msgfplus_output/
    :param final_out: nmdc_jobs/SYNOPSIS/ [SEQUEST Synopsis/First Hits files]
    :param revCatfasta: generated by MSGF+
    :param logs:
    :return:
    '''
    msgfplus_params = MASIC_PARAM_FILE
    msgfplus_ModDef_params = MSGFPLUS_MODEF_PARAM_FILE
    mc_params = MASS_CORRECTION_PARAM

    synopsis = os.path.join(final_out, "SYNOPSIS")
    if not os.path.exists(synopsis):
        os.makedirs(synopsis)
    output_file = os.path.join(synopsis, raw_basename + "_syn.txt")
    if not os.path.isfile(output_file):
        if os.system("mono /app/phrp/PeptideHitResultsProcRunner.exe \
                     -I:{} \
                     -O:{} \
                     -M:{} \
                     -T:{} \
                     -N:{} \
                     -SynPvalue:0.2 -SynProb:0.05 \
                     -L:{} \
                     -ProteinMods \
                     -F:{} \
                     | tee -a {}".format(
                os.path.join(input, raw_basename + ".tsv"),
                synopsis,
                msgfplus_ModDef_params,
                mc_params,
                msgfplus_params,
                os.path.join(logs, "4_TsvToSynConverter.commandlog"),
                revCatfasta,
                os.path.join(logs, "4_TsvToSynConverter.log"))) != 0:
            raise RuntimeError("TsvToSynConverter failed")
        # print("Finished running TsvToSynConverter")
        # except Exception as tsv2syn_failed:
        #     raise  # tsv2syn_failed()
    else:
        logger.info("Already exists :: SYN files @:{}".format(synopsis))
def write_to_disk(self, url: str):
    '''
    :param url: Job's file path on DMS.
    :return:
    '''
    if not os.path.isfile(url.split('/')[-1]):
        # os.system() does not raise on failure, so check the exit code instead.
        if os.system('wget %s' % url) != 0:
            logger.info("FAILED to download file!")
        # else:
        #     logging.info("Files transferred successfully!")
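
# Illustrative sketch (an assumption, not the project's method): fetching the file
# with urllib instead of shelling out to wget, so a failed download raises an
# exception that can be logged reliably. The _demo_* name is hypothetical.
def _demo_download(url, logger):
    import os
    import urllib.request
    target = url.split('/')[-1]
    if not os.path.isfile(target):
        try:
            # Downloads the URL to the local file name taken from the URL tail.
            urllib.request.urlretrieve(url, target)
        except Exception:
            logger.exception("FAILED to download %s", url)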
def get_protein_info(self):
    '''
    1. For all jobs, read in (stack):
       "*ResultToSeqMap.txt"  into ResultToSeqMap_DF  with added JobNum column
       "*SeqToProteinMap.txt" into SeqToProteinMap_DF with added JobNum column
    2. Join consolidate_syn with ResultToSeqMap_DF and SeqToProteinMap_DF over:
       1. JobNum        <--> JobNum
          ResultID      <--> ResultID
       2. JobNum        <--> JobNum
          Unique_Seq_ID <--> Unique_Seq_ID
          Protein       <--> Protein_Name
    3. Create the MSGFjobs_Merged dataframe.
    :return:
    '''
    # FIXME: Confirm with @matt: could the ResultID col in a _syn file be duplicated?
    protein_df = self.stack_files(self.protein,
                                  self.file_pattern_types["protein"])
    del protein_df['Dataset']
    protein_df = protein_df.rename(columns={'Protein_Name': 'Protein'})

    mapper_df = self.stack_files(self.mapper,
                                 self.file_pattern_types["mapper"])
    del mapper_df['Dataset']
    mapper_df = mapper_df.rename(columns={'Result_ID': 'ResultID'})

    # TODO: Change self.consolidate_syn_DF --> self.recomupted_consolidate_syn
    merge1 = pd.merge(self.consolidate_syn_DF,
                      mapper_df,
                      how='left',
                      left_on=['JobNum', 'ResultID'],
                      right_on=['JobNum', 'ResultID'])
    df_with_holes = pd.merge(
        merge1,
        protein_df,
        how='left',
        left_on=['JobNum', 'Unique_Seq_ID', 'Protein'],
        right_on=['JobNum', 'Unique_Seq_ID', 'Protein'])
    self.MSGFjobs_Merged = df_with_holes
    # FIXME: Bug found! No Seq_ID found for some ResultIDs; replicate the Seq_ID.
    #        Group by to identify them.
    # self.tackle_Unique_Seq_ID_holes_(df_with_holes)

    self.write_to_disk(self.MSGFjobs_Merged, self.parent_folder,
                       "MSGFjobs_Merged.tsv")
    logger.info(
        "---MERGE:2---PPMerge :: 'MSGFjobs_Merged' shape: {}".format(
            self.MSGFjobs_Merged.shape))
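
# Illustrative sketch (hypothetical toy data, not part of the pipeline): the
# two-step left merge used in get_protein_info(), and why a missing map entry
# leaves the NaN "holes" mentioned in the FIXME above.
def _demo_protein_merge():
    import pandas as pd
    syn = pd.DataFrame({"JobNum": [1, 1], "ResultID": [10, 11],
                        "Protein": ["P1", "P2"]})
    mapper = pd.DataFrame({"JobNum": [1], "ResultID": [10],
                           "Unique_Seq_ID": [100.0]})
    protein = pd.DataFrame({"JobNum": [1], "Unique_Seq_ID": [100.0],
                            "Protein": ["P1"], "Cleavage_State": [2]})
    # Step 1: attach Unique_Seq_ID via the ResultToSeqMap-style frame.
    merge1 = pd.merge(syn, mapper, how="left", on=["JobNum", "ResultID"])
    # Step 2: attach protein details via the SeqToProteinMap-style frame.
    merged = pd.merge(merge1, protein, how="left",
                      on=["JobNum", "Unique_Seq_ID", "Protein"])
    # ResultID 11 has no map entry, so Unique_Seq_ID and Cleavage_State are NaN.
    print(merged)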
def merge_all_jobs_in_UserInput(self):
    '''
    1. Run for each dataset.
    2. Merge all MSGFjobs_MASIC_resultant objects.
    :return:
    '''
    if not os.path.exists(self.dataset_result_folder):
        # stop = 0
        datasets = next(os.walk(self.parent_folder))[1]
        for dataset in datasets:
            if dataset != "DMS_fasta_param":
                logger.info(
                    "|Merging-------------------------Dataset:{}-----------------------"
                    .format(dataset))
                dataset_loc = self.parent_folder + dataset + '/'
                # print("dataset_loc >> ", dataset_loc)
                # enable switcher --PipeLineMode:: NMDC/ PNNL
                # DMS_MSGFjobs = 'DMS_MSGFjobs'
                # nmdc_MSGFjobs = 'nmdc_jobs/SYNOPSIS/'
                # DMS_MASICjob = 'DMS_MASICjob'
                # nmdc_MSGFjobs = 'nmdc_jobs/SIC/'
                msfg_obj = MSGFplusMerger(dataset_loc)
                msfg_obj.consolidate_syn_files()

                masic = MASICmerger(dataset_loc)
                masic.merge_msgfplus_msaic(msfg_obj.MSGFjobs_Merged)
                if self.combineDatasets:
                    self.resultants.append(masic.MSGFjobs_MASIC_resultant)
                # if stop == 1:
                #     break
        logger.info(msg="````````")
        logger.info(
            msg="Finished aggregating analysis tools results at loc:{}".format(
                self.dataset_result_folder))
        logger.info(msg="````````")

        if self.combineDatasets:
            # concatenate all datasets
            # print("self.combineDatasets >>", self.combineDatasets)
            self.resultants_df = pd.concat(self.resultants)
            # print("self.dataset_result_folder >> ", self.dataset_result_folder)
            self.write_to_disk(self.resultants_df, self.dataset_result_folder,
                               "resultants_df.tsv")
    else:
        logger.info(
            "Already ran Pipeline, merged jobs exist at @:{}! Please delete them & rerun the pipeline!"
            .format(self.dataset_result_folder))
    return self.dataset_result_folder
def convert_fasta_to_txt(self, fasta):
    converted_fasta = os.path.splitext(fasta)[0] + ".txt"
    if not os.path.isfile(converted_fasta):
        if os.system(
                "wine /app/ProteinDigestionSimulator/ProteinDigestionSimulator.exe \
                -I:{} \
                -O:{} \
                -F \
                | tee -a {}".format(
                    fasta,
                    os.path.split(fasta)[0],
                    os.path.join(
                        os.path.split(fasta)[0],
                        "ProteinDigestionSimulator.log"))) != 0:
            raise RuntimeError("ProteinDigestionSimulator failed")
    else:
        logger.info("Already exists :: fasta converted files @:{}".format(
            converted_fasta))
def open_connection(self):
    '''Connect to the DMS MS SQL Server.'''
    try:
        if self.conn is None:
            self.conn = pymssql.connect(server=self.SERVER,
                                        user=self.USER,
                                        password=self.PASSWORD,
                                        database=self.DATABASE_NAME)
            logger.info(msg="CONNECTION: {}".format(self.conn))
    except pymssql.Error as conn_err:
        logger.error(msg="SQL Server connection to {} FAILED\n{}".format(
            self.SERVER, conn_err))
        # sys.exit()
    else:
        logger.info(
            "SQL connection to {}:{}:{} opened successfully!".format(
                self.SERVER, self.DATABASE_NAME, self.USER))
def execute_msgfplus(self, raw_basename, input, output, fasta, logs):
    '''
    2. MSGFPlus | INPUT: (.mzML file and .fasta file) | OUTPUT: (.mzid file)
       The JVM runs with a fixed amount of available memory; once it is exceeded
       you will receive "java.lang.OutOfMemoryError". You can tune it:
       https://www.baeldung.com/jvm-parameters
       [-s SpectrumFile]    (*.mzML, *.mzXML, *.mgf, *.ms2, *.pkl or *_dta.txt)
       [-o OutputFile]      (*.mzid)
       [-d DatabaseFile]    (*.fasta or *.fa or *.faa)
       [-thread NumThreads] (Number of concurrent threads to be executed,
                             Default: Number of available cores)
       [-conf ConfigurationFile]
       [-verbose 0/1]       (0: Report total progress only (Default),
                             1: Report total and per-thread progress/status)
    :param raw_basename:
    :param input: msgfplus_input/
    :param output: msgfplus_output/
    :param fasta:
    :param logs:
    :return:
    '''
    msgfplus_params = MSGFPLUS_PARAM_FILE
    # FIXME: revCat location! Multiple datasets use the same FASTA, so they
    #        can't all generate the file in the FASTA folder!
    # print('???', input)
    # print('???', raw_basename)
    # if not os.path.exists(output):
    output_file = os.path.join(output, raw_basename + ".mzid")
    if not os.path.isfile(output_file):
        if os.system("java -Xmx32G -jar /app/msgf/MSGFPlus.jar \
                     -s {} \
                     -o {} \
                     -d {} \
                     -thread 16 \
                     -conf {} \
                     -verbose 1 | tee -a {}".format(
                os.path.join(input, raw_basename + ".mzML"),
                output_file,
                fasta,
                msgfplus_params,
                os.path.join(logs, "2_MSGFPlus.log"))) != 0:
            raise RuntimeError("MSGFPlus failed")
        # print("Finished running msgfplus")
        # except Exception as msgfplus_failed:
        #     raise  # msgfplus_failed()
    else:
        logger.info("Already exists :: mzid file @:{}".format(output))
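
# Illustrative sketch (an assumption, not the project's API): the same MSGF+
# invocation built with subprocess.run and an argument list, which avoids the
# shell-quoting pitfalls of formatting a single os.system() string. The _demo_*
# name and its parameters are hypothetical.
def _demo_run_msgfplus(mzml_path, mzid_path, fasta, conf, log_path):
    import subprocess
    cmd = ["java", "-Xmx32G", "-jar", "/app/msgf/MSGFPlus.jar",
           "-s", mzml_path, "-o", mzid_path, "-d", fasta,
           "-thread", "16", "-conf", conf, "-verbose", "1"]
    with open(log_path, "a") as log:
        # check=True raises CalledProcessError on a non-zero exit code,
        # replacing the manual "!= 0: raise" pattern used above.
        subprocess.run(cmd, stdout=log, stderr=subprocess.STDOUT, check=True)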
def get_files(self):
    '''
    :return:
    '''
    # TODO: Check if .raw files exist in subdirs!
    # if not os.path.exists(self.started_from):
    # FIXME: Enabling the above stmt doesn't allow downloading datasets when
    #        they aren't on the disk!
    os.chdir(self.started_from)
    logger.info(msg="get_files() @:{}".format(os.getcwd()))

    self.download_fasta_param_files()
    self.Input.apply(lambda x: self.use_df(x), axis=1)

    logger.info(msg="````````")
    logger.info(msg="Finished downloading data at loc:{}".format(
        self.started_from))
    logger.info(msg="````````")
def execute_masic(self, raw_file, raw_basename, final_out, logs):
    '''
    [/I:InputFilePath]
    [/O:OutputDirectoryPath]  nmdc_jobs/SIC/
    [/P:ParamFilePath]
    [/SF:StatusFileName]
    [/L:[LogFilePath]]
    :param raw_file:
    :param raw_basename:
    :param final_out:
    :param logs:
    :return:
    '''
    # MASIC | INPUT: (Thermo .raw file) | OUTPUT: (_SICStats.txt)
    masic_params = MASIC_PARAM_FILE

    sic = os.path.join(final_out, "SIC")
    if not os.path.exists(sic):
        os.makedirs(sic)
    output_file = os.path.join(sic, raw_basename + "_ScanStats.txt")
    if not os.path.isfile(output_file):
        if os.system("mono /app/masic/MASIC_Console.exe \
                     /I:{} \
                     /O:{} \
                     /P:{} \
                     /SF:{} \
                     /L:{} | tee -a {}".format(
                raw_file,
                sic,
                masic_params,
                os.path.join(logs, "MasicStatus.xml"),
                os.path.join(logs, "0_masic.commandlog"),
                os.path.join(logs, "0_masic.log"))) != 0:
            raise RuntimeError("MASIC failed")
        # print("Finished running masic")
        # except:
        #     logger.error('MASIC failed for {}-dataset:{} || RAW: {}'.format(
        #         STUDY, dataset_id, raw_basename), exc_info=e)
        # except Exception as masic_failed:
        #     raise  # masic_error() from masic_failed
    else:
        logger.info("Already exists :: SIC files @:{}".format(sic))
def execute_msconvert(self, raw_file, raw_basename, output, logs):
    '''
    1. MSConvert (pwiz) | INPUT: (Thermo .raw file) | OUTPUT: (.mzML file)
       -z [ --zlib ]    : use zlib compression for binary data
       --filter arg     : add a spectrum list filter
           peakPicking [<PickerType> [snr=<minimum signal-to-noise ratio>]
           [peakSpace=<minimum peak spacing>] [msLevel=<ms_levels>]] :
           This filter performs centroiding on spectra with the selected
           <ms_levels>, expressed as an int_set. The value for <PickerType>
           must be "cwt" or "vendor"; when <PickerType> = "vendor", vendor
           (Windows DLL) code is used if available.
       -o [ --outdir ] arg (=.) : set output directory ('-' for stdout) [.]
       -v [ --verbose ] : display detailed progress information
       --mzML           : write mzML format [default]
    :param raw_file:
    :param output: msgfplus_input/
    :param logs:
    :return:
    '''
    # Only run if the mzML doesn't already exist.
    # if not os.path.exists(output):
    output_file = os.path.join(output, raw_basename + ".mzML")
    if not os.path.isfile(output_file):
        if os.system("wine msconvert \
                     {} \
                     --zlib \
                     --filter 'peakPicking true 2-' \
                     -o {} \
                     --verbose | tee -a {}".format(
                raw_file,
                output,
                os.path.join(logs, "1_MSconvert.log"))) != 0:
            raise RuntimeError("msconvert failed")
        # print("Finished running msconvert")
        # except Exception as msconvert_failed:
        #     raise  # msconvert_failed()
    else:
        logger.info("Already exists :: mzML file @:{}".format(output))
def process_datasets(self, data_path):
    '''
    Run each dataset through the packages; make the .raw and .fasta files
    available for them.
    :param data_path:
    :return:
    '''
    file_map = os.path.join(data_path, 'emsl_to_jgi_stegen01152021.json')
    if os.path.isfile(file_map):
        for path, subdirs, files in os.walk(data_path):
            for file in files:
                if fnmatch.fnmatch(file, '*.raw'):
                    raw_file = os.path.join(path, file)
                    dataset_id = path.split("/")[-1]

                    dataset_faa = self.get_fasta_loc(file_map, dataset_id)
                    if dataset_faa is None:
                        # Can't find an entry in the .json!
                        logger.info(
                            "|---------------FASTA is not available for {}:{} in emsl_to_jgi.json file!---------------"
                            .format(STUDY, dataset_id))
                        continue
                    print('??dataset_faa', dataset_faa)
                    revCat_faa = dataset_faa.replace("faa", "revCat.fasta")
                    print('??revCat_faa', revCat_faa)

                    raw_basename = os.path.basename(
                        os.path.splitext(raw_file)[0])
                    print('``', raw_basename)
                    print('``', raw_file)

                    logs = os.path.join(path, "logs")
                    if not os.path.exists(logs):
                        os.makedirs(logs)
                    ip = os.path.join(path, "msgfplus_input")
                    out = os.path.join(path, "msgfplus_output")
                    nmdc_out = os.path.join(path, "nmdc_jobs")

                    print(
                        "processing--------------------------------{}:{}--------------------------------"
                        .format(STUDY, dataset_id))

                    # print("Start SELECTED ION CHROMATOGRAMS FILE generation")
                    try:
                        self.execute_masic(raw_file, raw_basename, nmdc_out,
                                           logs)
                    except Exception as e:
                        logger.error(
                            'MASIC failed for {}-dataset:{} || RAW: {}'.format(
                                STUDY, dataset_id, raw_basename),
                            exc_info=e)
                    # print("END SELECTED ION CHROMATOGRAMS FILE generation")
                    # print('*' * 30)

                    # print("Start SYNOPSIS/FIRST-HITS FILE generation")
                    try:
                        self.execute_msconvert(raw_file, raw_basename, ip,
                                               logs)
                        try:
                            self.execute_msgfplus(raw_basename, ip, out,
                                                  dataset_faa, logs)
                            try:
                                self.execute_mzid2tsv(raw_basename, out, logs)
                                try:
                                    self.execute_tsv2syn(
                                        raw_basename, out, nmdc_out,
                                        revCat_faa, logs)
                                    logger.info(
                                        "|--------------------------------{}:{}--------------------------------"
                                        .format(STUDY, dataset_id))
                                except Exception as e:
                                    logger.error(
                                        'TsV2SyN failed for {}-dataset:{} || RAW: {}'
                                        .format(STUDY, dataset_id,
                                                raw_basename),
                                        exc_info=e)
                            except Exception as e:
                                logger.error(
                                    'MzID2TsV failed for {}-dataset:{} || RAW: {}'
                                    .format(STUDY, dataset_id, raw_basename),
                                    exc_info=e)
                        except Exception as e:
                            logger.error(
                                'MSGFPLUS failed for {}-dataset:{} || RAW: {}'
                                .format(STUDY, dataset_id, raw_basename),
                                exc_info=e)
                    except Exception as e:
                        logger.error(
                            'MSCONVERT failed for {}-dataset:{} || RAW: {}'
                            .format(STUDY, dataset_id, raw_basename),
                            exc_info=e)
                    print("End SYNOPSIS/FIRST-HITS FILE generation")
    else:
        print("Can't run without emsl_to_jgi.json?")
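
# Illustrative sketch (hypothetical helper, not part of the pipeline): the nested
# try/except chain above expresses a strictly sequential dependency
# (msconvert -> msgfplus -> mzid2tsv -> tsv2syn). The same control flow can be
# written as a loop that stops at the first failing step.
def _demo_run_dependent_steps(steps, logger):
    """steps: list of (name, zero-argument callable) pairs, run in order."""
    for name, step in steps:
        try:
            step()
        except Exception as e:
            # Log the failure and skip every downstream step that depends on it.
            logger.error("%s failed; skipping downstream steps", name, exc_info=e)
            break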