# Imports used directly by the functions in this file. The shared objects
# referenced below but defined elsewhere in the package -- logger,
# ConfigFileError, models, filter_funcs, required_cols, pep_to_fdr, and
# read_default_config_file -- are assumed to be imported from their own
# modules alongside these.
import os
import time
import pkg_resources

import numpy as np
import pandas as pd

from shutil import copyfile
from scipy.stats import norm, laplace, uniform
from sklearn import svm
from sklearn.metrics import auc, roc_curve
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from jsonschema import Draft7Validator
from yaml import load as yaml_load
try:
    from yaml import CLoader as Loader
except ImportError:
    from yaml import Loader


def filter_uniprot_exclusion_list(df, config, _filter):
    """
    Filter out proteins from an exclusion list of UniProt IDs.
    """
    exclusion_list = []

    # parse the exclusion list:
    # if the 'file' key is set, then load the IDs from that path, line by line
    if 'file' in _filter and _filter['file'] is not None:
        # first expand user home and any environment vars
        _filter['file'] = os.path.expanduser(_filter['file'])
        _filter['file'] = os.path.expandvars(_filter['file'])

        # open the exclusion list file and read in the UniProt IDs, line by line
        try:
            with open(_filter['file'], 'r') as f:
                logger.info('Loading UniProt IDs from exclusion list file {} ...'.format(_filter['file']))
                exclusion_list = [line.rstrip('\n') for line in f]
                logger.info('Loaded {} proteins from exclusion list.'.format(len(exclusion_list)))
        except EnvironmentError:
            raise ConfigFileError(
                'Exclusion list file {} not found. Please provide a path to a file with UniProt IDs separated by line.'
                .format(_filter['file']))

    elif 'list' in _filter and len(_filter['list']) > 0:
        # load UniProt IDs from the configuration file itself
        exclusion_list = _filter['list']
        logger.info('Loading {} UniProt IDs from exclusion list as defined in config file'.format(len(exclusion_list)))
    else:
        raise ConfigFileError(
            'No exclusion list file or list of UniProt IDs provided. '
            'Please provide a path to a file with UniProt IDs separated by line with the "file" key, '
            'or provide a python list of UniProt IDs with the "list" key. '
            'If not using a UniProt ID exclusion list, then comment out the "uniprot_exclusion" key from the filter list.')

    # apply the exclusion list
    if len(exclusion_list) > 0:
        logger.info('UniProt IDs from exclusion list: {}'.format(exclusion_list))

        # we could match the excluded IDs against only the razor protein, but we
        # can be more strict and match them against the entire protein string,
        # which contains all possible proteins
        pat = '|'.join(exclusion_list)
        blacklist_filter = df['proteins'].str.contains(pat)
        blacklist_filter[pd.isnull(blacklist_filter)] = False

        logger.info('Filtering out {} PSMs from the exclusion list'.format(np.sum(blacklist_filter)))
        return blacklist_filter
    else:
        raise ConfigFileError(
            'Exclusion list found and loaded, but no UniProt IDs found. '
            'Check the format of the file, or the list in the config file.')
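# A minimal usage sketch (hypothetical values, not part of the original
# module): apply the exclusion filter to a toy PSM table. Assumes the
# 'proteins' column holds delimiter-joined UniProt IDs, as in MaxQuant output.
def _example_uniprot_exclusion():
    df = pd.DataFrame({'proteins': ['P02768;P02769', 'Q9Y6K9', None]})
    _filter = {'name': 'uniprot_exclusion', 'file': None, 'list': ['P02768']}
    mask = filter_uniprot_exclusion_list(df, {}, _filter).astype(bool)
    # mask is True for the first row only; drop the flagged PSMs
    return df[~mask]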
def convert(df, config):
    cols = []
    col_names = []

    # loop through all columns listed in the config file
    for col in list(config['col_names'].keys()):
        if config['col_names'][col] is None:
            logger.debug('Column "{}" is left empty in the config file. Skipping...'.format(col))
            continue

        # check whether the column specified in the config file exists in the df
        if config['col_names'][col] not in df.columns:
            # this is probably grounds to kill the program
            raise ConfigFileError(
                'Column {} of value {} not found in the input file. '
                'Please check that this column exists, or comment out the field '
                'or leave the field for {} empty in the config file.'
                .format(col, config['col_names'][col], col))

        # keep track of the column and the column name
        cols.append(config['col_names'][col])
        col_names.append(col)

    # take the subset of the input file, and rename the columns
    dfa = df[cols]
    dfa.columns = col_names

    return dfa
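# Illustration of the column mapping convert() expects (hypothetical config
# excerpt): keys are the internal names used throughout this module, values
# are the headers in the input file, e.g. a MaxQuant evidence.txt.
def _example_convert():
    config = {'col_names': {
        'sequence':       'Sequence',
        'raw_file':       'Raw file',
        'retention_time': 'Retention time',
        'pep':            'PEP',
        'charge':         None,  # left empty in the config -> skipped
    }}
    df = pd.DataFrame(columns=['Sequence', 'Raw file', 'Retention time', 'PEP'])
    return convert(df, config)  # same data, subset and renamed columns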
def filter_psms(df, config):
    logger.info('Filtering PSMs...')

    # load the filtering functions specified by the input config.
    # the types of filter functions depend on what stage of filtering this is:
    # removing observations, or merely excluding them from alignment
    filters = config['filters']

    # each filter requires specific columns from the dataframe;
    # make sure these columns exist before proceeding
    for i, f in enumerate(filters):
        # for each required column of the filter, check that it exists
        for j in required_cols[f['name']]:
            if j not in df.columns:
                raise ConfigFileError(
                    'Filter {} requires a data column {}, but this was not found in the input dataframe.'
                    .format(f['name'], j))

    # by default, filter out nothing. we'll use binary ORs (|) to gradually
    # add more observations to this filter-out blacklist
    df['remove'] = np.repeat(False, df.shape[0])

    # run all the filters specified by the list in the input config file.
    # all filter functions are passed df and the run configuration.
    # after each filter, fold it into the exclusion master list with a bitwise OR.
    # if a filter function returns None, then just ignore it.
    for i, f in enumerate(filters):
        e = filter_funcs[f['name']](df, config, f)
        if e is not None:
            df['remove'] = (df['remove'] | e)

    return df
def load_params_from_file(params_folder):
    # first expand user home and any environment vars
    params_folder = os.path.expanduser(params_folder)
    params_folder = os.path.expandvars(params_folder)

    # load parameters if they are specified on the command line
    params = {}
    logger.info('Using provided alignment parameters. Loading params from {}...'.format(params_folder))

    param_files = ['exp_params.txt', 'pair_params.txt', 'peptide_params.txt']
    for pf in param_files:
        pfp = os.path.join(params_folder, pf)
        if os.path.exists(pfp):
            try:
                params[pf.split('_')[0]] = pd.read_csv(pfp, sep='\t')
            except Exception:
                logger.error('Error loading param file {}'.format(pfp))
                raise
        else:
            error_msg = 'Params file {} does not exist'.format(pfp)
            raise ConfigFileError(error_msg)

        logger.info('Loaded "{}" params file.'.format(pf.split('_')[0]))

    return params
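# Expected layout of the parameters folder, as implied by param_files above
# (each file is tab-separated, and the key is the prefix before the first '_'):
#
#   params_folder/
#     exp_params.txt      -> params['exp']
#     pair_params.txt     -> params['pair']
#     peptide_params.txt  -> params['peptide']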
def filter_retention_length(df, config, _filter):
    """
    Filter by retention length, which is a measure of the peak width
    during chromatography.
    """
    if _filter['dynamic']:
        # use the dynamic filter, where the value is a proportion
        # of the max RT (the run-time) of that raw file
        logger.info('Using dynamic retention length of {} * run-time (max RT) for each experiment'.format(_filter['value']))

        # get the max RT for each raw file, reindex to the same dimension as
        # the retention_length column, and then multiply by the filter value
        max_rts = df.groupby('raw_file')['retention_time'].max().values
        filter_rtl = max_rts[df['raw_file'].map({
            ind: val for val, ind in enumerate(np.sort(df['raw_file'].unique()))
        })] * _filter['value']
        filter_rtl = (df['retention_length'] > filter_rtl)
    else:
        # use a constant filter for the retention length
        logger.info('Using constant retention length (in RT) of {} for all raw files.'.format(_filter['value']))

        # only allow values between 0 and max(RT)
        if _filter['value'] <= 0 or _filter['value'] > np.max(df['retention_time']):
            raise ConfigFileError(
                '"retention_length" filter value {} is not defined or incorrectly defined. '
                'Please provide a decimal number between 0.0 and max(RT).'
                .format(_filter['value']))

        filter_rtl = (df['retention_length'] > _filter['value'])

    if _filter['dynamic']:
        logger.info('Filtering out {} PSMs with retention length greater than {:.4f} * max(exp_RT) of each raw file.'.format(np.sum(filter_rtl), _filter['value']))
    else:
        logger.info('Filtering out {} PSMs with retention length greater than {:.4f}'.format(np.sum(filter_rtl), _filter['value']))

    return filter_rtl
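# Sketch of the raw-file -> max-RT reindexing trick used above (hypothetical
# data): each PSM's raw_file is mapped to its alphabetical index, which then
# indexes the per-file max-RT vector. groupby() sorts its keys, so the two
# orderings agree.
def _example_dynamic_threshold():
    df = pd.DataFrame({'raw_file': ['b', 'a', 'b'],
                       'retention_time': [50.0, 30.0, 60.0]})
    max_rts = df.groupby('raw_file')['retention_time'].max().values  # [30., 60.]
    idx = df['raw_file'].map({ind: val for val, ind in
                              enumerate(np.sort(df['raw_file'].unique()))})
    return max_rts[idx] * 0.01  # per-PSM threshold at 1% of each run-time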
def get_model_from_config(config):
    model = 'two_piece_linear'
    if config['model'] is not None:
        if config['model'] in models:
            model = config['model']
        else:
            error_msg = 'Model "{}" not found. Available choices are: {}'.format(
                config['model'], list(models.keys()))
            raise ConfigFileError(error_msg)
    else:
        logger.info('Alignment model not defined. Defaulting to "two_piece_linear" model')

    return models[model]
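# 'models' is a module-level registry mapping model names to dicts of
# alignment functions. The exact entries live elsewhere in the package; a
# hypothetical sketch of one entry, with the keys that update() below relies on:
#
#   models['two_piece_linear'] = {
#       'exp_keys': [...],        # per-experiment transformation parameters
#       'rt_to_ref': ...,         # map observed RT -> reference RT space
#       'ref_to_rt': ...,         # map reference RT -> observed RT space
#       'sigmaij_func': ...,      # per-PSM RT standard deviation
#       'rt_minus_func': ...,     # P(RT | incorrect ID) density
#       'rt_plus_func': ...,      # P(RT | correct ID) density (point estimate)
#   }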
def filter_smears(df, config, _filter):
    """
    Filter out "smears". Even confidently identified PSMs can have bad
    chromatography, and in that case it is unproductive to include them in
    the alignment. In theory, this should be made redundant by the retention
    length filter, but some PSMs still slip through the cracks of that,
    possibly because the search engine cannot adequately track the elution peak.
    """
    logger.info('Determining RT spread of peptides within each experiment...')
    # for each experiment-peptide pair, get the range of retention times.
    # this is the step that could take a long time
    # TODO: optimize this?
    smears = df.groupby(['raw_file', 'sequence'])['retention_time'].apply(np.ptp)

    if _filter['dynamic']:
        # use the dynamic filter, where the value is a proportion
        # of the max RT (the run-time) of that raw file
        logger.info('Using dynamic smear length (in RT) of {:.4f} * run-time (max RT) for each experiment'.format(_filter['value']))

        max_rts = df.groupby('raw_file')['retention_time'].max().values

        smear_pair_inds = smears.index.to_frame()['raw_file'].values
        smear_pair_inds = pd.Series(smear_pair_inds).map({
            ind: val for val, ind in enumerate(np.sort(df['raw_file'].unique()))
        })

        # get the (raw_file, sequence) tuples for PSMs with a range above the threshold
        smears = smears[smears > (max_rts[smear_pair_inds] * _filter['value'])].index.values
    else:
        # use a constant filter for the smear length
        logger.info('Using constant smear length (in RT) of {:.4f} for all raw files.'.format(_filter['value']))

        if _filter['value'] <= 0:
            raise ConfigFileError(
                'Smear filter {:.4f} is not defined or incorrectly defined. '
                'Please provide a decimal number between 0.0 and max(RT).'
                .format(_filter['value']))

        # get the (raw_file, sequence) tuples for PSMs with a range above the threshold
        smears = smears[smears > _filter['value']].index.values

    # map the tuples back to the original data frame, and mark smears for exclusion
    smears = pd.Series(list(zip(df['raw_file'], df['sequence']))).isin(smears)

    if _filter['dynamic']:
        logger.info('Filtering out {} PSMs with an intra-experiment RT spread greater than {:.4f} * max(exp_RT) for each raw file.'.format(smears.sum(), _filter['value']))
    else:
        logger.info('Filtering out {} PSMs with an intra-experiment RT spread greater than {:.4f}'.format(smears.sum(), _filter['value']))

    return smears.values
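# np.ptp ("peak to peak") is max - min; applied per (raw_file, sequence)
# group it gives each peptide's intra-experiment RT spread. A toy
# illustration (hypothetical values):
def _example_rt_spread():
    df = pd.DataFrame({'raw_file': ['a', 'a', 'a'],
                       'sequence': ['PEPTIDE', 'PEPTIDE', 'OTHER'],
                       'retention_time': [20.0, 21.5, 40.0]})
    spread = df.groupby(['raw_file', 'sequence'])['retention_time'].apply(np.ptp)
    # spread[('a', 'OTHER')] == 0.0, spread[('a', 'PEPTIDE')] == 1.5
    return spread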
def read_config_file(args, create_output_folder=True):
    # load defaults
    config = read_default_config_file()

    # override defaults with the user config file
    with open(args.config_file.name, 'r') as f:
        config.update(yaml_load(f, Loader=Loader))

    # override the config file's input, output, and verbose options
    # if they were specified on the command line
    if args.input is not None:
        if config['input'] is not None:
            logger.warning('Input files specified in both the config file and the command line. Using command-line input files instead.')
        config['input'] = [f.name for f in args.input]

    if args.output is not None:
        if 'output' in config and config['output'] is not None:
            logger.warning('Output folder specified in both the config file and the command line. Using command-line output folder instead.')
        config['output'] = args.output

    if args.verbose:
        if 'verbose' in config:
            logger.info('Overwriting verbosity level in configuration file with the one provided on the command line.')
        config['verbose'] = args.verbose

    # make sure that we have inputs and outputs before continuing.
    # the jsonschema validator would catch this as well, but here we can
    # print a more descriptive error message
    if 'input' not in config or config['input'] is None:
        error_msg = 'No input files specified, in either the config file or the command line. Please provide input files.'
        raise ConfigFileError(error_msg)
    if 'output' not in config or config['output'] is None:
        error_msg = 'No output folder specified, in either the config file or the command line. Please provide an output folder.'
        raise ConfigFileError(error_msg)

    ### --------------------
    ### VALIDATE CONFIG FILE
    ### --------------------

    schema = pkg_resources.resource_stream('dart_id', '/'.join(('config', 'schema.yaml')))
    schema = yaml_load(schema, Loader=Loader)

    v = Draft7Validator(schema)
    errors = sorted(v.iter_errors(config), key=str)

    for error in errors:
        logger.error(
            'Configuration file error:\n' +
            'In field: {}\n'.format(' --> '.join(['\'' + str(x) + '\'' for x in error.path])) +
            'Error: {}\n'.format(error.message) +
            'Field description: {}\n'.format(
                error.schema['description'] if 'description' in error.schema
                else 'No field description provided'))
        # for suberror in sorted(error.context, key=lambda e: e.schema_path):
        #     print(list(suberror.schema_path), suberror.message, sep=', ')

    if len(errors) > 0:
        error_msg = '{} error(s) from configuration file. Please read the validation error messages carefully and fix the configuration file'.format(len(errors))
        raise ConfigFileError(error_msg)

    ### ====================================================
    ### ADVANCED CONFIGURATION FILE VALIDATION
    ### --------------------------------------
    ### apply rules too complex for the jsonschema validator
    ### ====================================================

    # ...

    # expand user home and any environment vars
    config['output'] = os.path.expanduser(config['output'])
    config['output'] = os.path.expandvars(config['output'])

    # create the output folder
    if not os.path.exists(config['output']) and create_output_folder:
        logger.info('Output folder does not yet exist. Creating...')
        os.makedirs(config['output'])

    # copy the config file to the output folder
    if create_output_folder:
        logger.info('Copying config file to output folder')
        copyfile(
            args.config_file.name,
            os.path.join(config['output'], os.path.basename(args.config_file.name)))

    ### ------------------
    ### Modify config file
    ### ------------------

    # apply modifications/transformations:
    # decode ASCII escape characters in the sep string.
    # assumes the config file is in utf-8
    config['sep'] = config['sep'].encode('utf-8').decode('unicode-escape')

    return config
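# The unicode-escape round-trip above turns the two-character YAML string
# '\\t' into a real tab, so separators can be written literally in the config:
#
#   >>> '\\t'.encode('utf-8').decode('unicode-escape')
#   '\t'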
def process_files(config):
    # create our output data frames
    df_original = pd.DataFrame()
    df = pd.DataFrame()

    # iterate through each input file provided
    for i, f in enumerate(config['input']):
        # first expand user home and any environment vars
        f = os.path.expanduser(f)
        f = os.path.expandvars(f)

        logger.info('Reading in input file #{} | {} ...'.format(i + 1, f))

        # load the input file with pandas.
        #
        # have a variable low-memory option depending on the input type.
        # MaxQuant, for example, has a structure that forces pandas out of its
        # optimal low-memory mode, and we have to specify it here.
        dfa = pd.read_csv(f, sep=config['sep'], low_memory=config['low_memory'])

        # keep track of where observations came from. this is _not_ the raw
        # file ID, but the ID of the input file it originated from, so that if
        # we need to split these observations up by input file in the future,
        # we can do so
        dfa['input_id'] = i

        # append a copy of dfa into df_original, because the conversion process
        # will heavily modify dfa. we need to keep a copy of the original
        # dataframe in order to append the new columns back onto it later.
        # re-index columns with '[dfa.columns.tolist()]' to preserve the
        # general column order
        df_original = df_original.append(dfa, sort=True)[dfa.columns.tolist()]

        # if this input data already has DART-ID columns in it, then drop them,
        # since they cause problems later
        dart_cols = [
            'rt_minus', 'rt_plus', 'mu', 'muij', 'sigmaij', 'pep_new',
            'exp_id', 'peptide_id', 'stan_peptide_id', 'exclude', 'residual',
            'pep_updated', 'q-value'
        ]

        # print a warning if we see any
        if np.any(df_original.columns.isin(dart_cols)):
            logger.warning(
                'Columns {} are recognized as DART-ID output columns. Removing these columns before proceeding. In the future, please input original input data files, not output files from DART-ID.'
                .format(np.array_str(df_original.columns[df_original.columns.isin(dart_cols)])))

        # drop existing DART-ID columns
        for col in dart_cols:
            if col in df_original.columns:
                logger.debug('Removing column {}'.format(col))
                df_original = df_original.drop(col, axis=1)

        logger.info('Converting {} ({} PSMs)...'.format(f, dfa.shape[0]))

        # convert - takes a subset of columns and renames them
        dfa = convert(dfa, config)

        # need to reset the input_id after the conversion process
        dfa['input_id'] = i

        # append to the master dataframe
        df = df.append(dfa)

    # modify columns?
    # append the ion charge to the sequence, so that different charge states
    # are aligned separately. make sure the charge column is specified and exists
    if config['add_charge_to_sequence'] and 'charge' in df.columns:
        logger.info('Appending charge to peptide sequence, to align different charge states separately.')
        df['sequence'] = df['sequence'] + '_' + df['charge'].apply(str)

    # create a unique ID for each PSM to help with stitching the final result
    # together after all of our operations
    df['id'] = range(0, df.shape[0])
    df_original['id'] = range(0, df.shape[0])

    # by default, exclude nothing from the original experiment
    df_original['remove'] = np.repeat(False, df_original.shape[0])

    # if the input already has a 'remove' column, then skip the filters
    if 'remove' in config['col_names'] and config['col_names']['remove'] is not None:
        df['remove'] = df['remove'].astype(bool)
    else:
        # otherwise, run the filters
        df = filter_psms(df, config)

    # apply non-optional filters: the PEP threshold and the requirement that a
    # sequence is observed in at least n experiments (num_experiments)

    # remove any observations with a null PEP
    null_pep = pd.isnull(df['pep'])
    if np.sum(null_pep) > 0:
        df['remove'] = (df['remove'] | null_pep)
        logger.info('Removing {} PSMs with no PEP entry.'.format(np.sum(null_pep)))

    num_exps = len(df['raw_file'].unique())

    # special error when only one experiment is loaded
    if num_exps == 1:
        error_msg = 'Only 1 raw file/experiment loaded. DART-ID derives statistical power from peptides observed over multiple experiments. Please provide an input file with more raw files, or provide a list of input files, to get the most out of your data.'
        raise ConfigFileError(error_msg)

    if config['num_experiments'] > num_exps:
        error_msg = 'Number of experiments filter threshold {} is greater than the number of experiments in the input list. Please provide an integer greater than or equal to 1 and less than or equal to the number of experiments with the "num_experiments" key.'.format(config['num_experiments'])
        raise ConfigFileError(error_msg)

    # calculate FDR (q-values) from the PEPs
    df['qval'] = pep_to_fdr(df['pep'])

    # count the number of experiments a peptide is observed in, but filter out:
    # 1) PSMs removed by previous filters
    # 2) PSMs with PEP >= pep_threshold
    exps_per_pep = (
        df.loc[
            # get peptides that are:
            (
                # not previously removed, for any reason
                (~df['remove']) &
                # below the set confidence threshold
                (df['pep'] < config['pep_threshold'])
                # (df['qval'] < config['pep_threshold'])  # peptide FDR
            ),
            ['sequence', 'raw_file']]
        # group by sequence, get all unique raw files the peptide sequence
        # appears in, then count the number of raw files
        .groupby('sequence')['raw_file'].unique().apply(len))

    # map values back onto the full DataFrame. peptides without any value will
    # get NaN, which is then assigned to 0.
    exps_per_pep = df['sequence'].map(exps_per_pep)
    exps_per_pep[pd.isnull(exps_per_pep)] = 0

    # flag these sequences for removal as well
    logger.info('Removing {} PSMs from peptide sequences not observed confidently in at least {} experiments'.format(np.sum(exps_per_pep < config['num_experiments']), config['num_experiments']))
    df['remove'] = (df['remove'] | (exps_per_pep < config['num_experiments']))

    # check that every experiment has at least n PSMs available for alignment.
    # if not, then exclude them from alignment.
    # count, per raw file, the PSMs that are confident and not yet flagged for removal
    psms_per_exp = ((~df['remove']) & (df['pep'] < config['pep_threshold'])).groupby(df['raw_file']).sum()
    exclude_exps = psms_per_exp.index.values[psms_per_exp < config['min_psms_per_experiment']]

    if len(exclude_exps) > 0:
        logger.warning(
            'Experiments {} have < {} confident PSMs (PEP < {}) remaining after filtering. All PSMs belonging to these experiments will be excluded from the retention time alignment'
            .format(np.array_str(exclude_exps), config['min_psms_per_experiment'], config['pep_threshold']))

    # exclude experiments without enough PSMs
    df['remove'] = (df['remove'] | df['raw_file'].isin(exclude_exps))

    # recalculate exps_per_pep, since we may have removed some experiments and
    # this number will change based on the set of experiments we consider
    logger.info('Recalculating number of confident peptides across experiments...')
    exps_per_pep = (
        df.loc[
            # get peptides that are:
            (
                # not previously removed, for any reason
                (~df['remove']) &
                # below the set confidence threshold
                (df['pep'] < config['pep_threshold'])
            ),
            ['sequence', 'raw_file']]
        # group by sequence, get all unique raw files the peptide sequence
        # appears in, then count the number of raw files
        .groupby('sequence')['raw_file'].unique().apply(len))

    exps_per_pep = df['sequence'].map(exps_per_pep)
    exps_per_pep[pd.isnull(exps_per_pep)] = 0

    logger.info('Additional {} PSMs from peptide sequences not observed confidently in at least {} experiments flagged for removal.'.format(
        np.sum((exps_per_pep < config['num_experiments']) & (~df['remove'])),
        config['num_experiments']))
    df['remove'] = (df['remove'] | (exps_per_pep < config['num_experiments']))

    # Exclude peptides from the alignment when their minimum PEP is high
    # (min(PEP) > 0.01) but the coefficient of variation (CV) of their PEPs
    # is low (CV < 0.1). We found that this combination is a good predictor
    # of whether the PSM is a decoy hit rather than a target hit.
    def cv(x):
        if len(x) < 3:
            return np.nan
        return np.nanstd(x) / np.nanmean(x)

    peptide_aggs = {
        'pep_mean': ('pep', np.nanmean),
        'pep_cv': ('pep', cv),
        'pep_min': ('pep', np.min),
        'num_obs': ('pep', 'count')
    }

    # if we have the protein_decoy_tag and the leading_protein column, look
    # for the tag to determine whether or not the peptide is a decoy peptide
    if 'leading_protein' in config['col_names'] and 'protein_decoy_tag' in config:
        def is_decoy(x):
            return x.str.contains(config['protein_decoy_tag']).any()
        peptide_aggs['is_decoy'] = ('leading_protein', is_decoy)

    peptides_df = (
        df.groupby('sequence').aggregate(**peptide_aggs)
        # only take peptides with more than N observations
        .query('num_obs > 3')
        # remove any extremely low CVs
        .query('pep_cv > 1e-5')
        # remove extremely low PEP means
        .query('pep_mean > 1e-10'))

    # if we have decoy data, then train a classifier (a linear SVM) with
    # pep_mean and pep_cv as features
    if 'is_decoy' in peptides_df.columns:
        logger.info('Decoy peptide information present. Training a linear SVM to avoid aligning decoy peptides')

        # X = feature matrix
        X = np.log10(peptides_df.loc[:, ['pep_mean', 'pep_cv']].values)
        X = StandardScaler().fit_transform(X)
        # True = Decoy, False = Target
        y = peptides_df['is_decoy'].values

        random_state = np.random.RandomState(0)

        # shuffle and split training and test sets
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.5, random_state=0)

        # learn to predict each class against the other
        classifier = svm.SVC(kernel='linear', probability=True, random_state=random_state)
        classifier.fit(X_train, y_train)
        y_score = classifier.decision_function(X_test)

        # compute the ROC curve and its AUC
        fpr, tpr, thresholds = roc_curve(y_test, y_score, pos_label=True)
        roc_auc = auc(fpr, tpr)

        # if the curve is inverted, rerun with the opposite pos_label
        inverted = False
        if roc_auc < 0.5:
            logger.info('Correcting inverted ROC curve')
            inverted = True
            fpr, tpr, thresholds = roc_curve(y_test, y_score, pos_label=False)
            roc_auc = auc(fpr, tpr)

        logger.info('AUC: {:.3f}'.format(roc_auc))

        # maximize the Youden index (sensitivity (TP / P = TPR) + specificity
        # (TN / N = 1 - FPR)), but set a MINIMUM TPR of 0.8 -- we don't want
        # to cut out too many of our targets
        cutoff_start_ind = np.argmax(tpr >= 0.8)  # argmax gets the first index of the max value
        cutoff_ind = cutoff_start_ind + np.argmax(tpr[cutoff_start_ind:] - fpr[cutoff_start_ind:])
        cutoff_thresh = thresholds[cutoff_ind]

        logger.info('ROC cutoff: FPR = {:.2f}, TPR = {:.2f}'.format(fpr[cutoff_ind], tpr[cutoff_ind]))

        # generate scores for all points
        all_y_score = classifier.decision_function(X)

        # points above the cutoff threshold are decoys
        remove_inds = all_y_score >= cutoff_thresh
        if inverted:
            remove_inds = ~remove_inds

        logger.info('Decoy classifier is removing {} peptides'.format(np.sum(remove_inds)))

    # if we don't have decoy information, then run with some preset cutoffs
    if 'is_decoy' not in peptides_df.columns:
        min_pep_thresh = 0.01
        max_pep_cv_thresh = 0.1

        remove_inds = ((peptides_df['pep_min'] > min_pep_thresh) &
                       (peptides_df['pep_cv'] < max_pep_cv_thresh))
        logger.info('Removing {} peptides for min(PEP) > {:.3f} and CV(PEP) < {:.3f}'.format(
            np.sum(remove_inds), min_pep_thresh, max_pep_cv_thresh))

    remove_seqs = peptides_df.index[remove_inds].values
    df['remove'] = (df['remove'] | df['sequence'].isin(remove_seqs))

    ## --------------
    ## DONE FILTERING
    ## --------------

    # flag the observations in df_original that were removed
    df_original['remove'] = df['remove']
    # remove the flagged observations from the dataframe, and reset the index
    df = df[~df['remove']].reset_index(drop=True)

    # map peptide and experiment IDs.
    # sort experiment IDs alphabetically -- otherwise the order is set by the
    # first occurrence of an observation from that raw file.

    # if experiment or peptide IDs are already provided, then skip this step
    if 'exp_id' not in config['col_names'] or config['col_names']['exp_id'] is None:
        df['exp_id'] = df['raw_file'].map({
            ind: val for val, ind in enumerate(np.sort(df['raw_file'].unique()))
        })
    logger.info('{} experiments (raw files) loaded'.format(np.max(df['exp_id']) + 1))

    if 'peptide_id' not in config['col_names'] or config['col_names']['peptide_id'] is None:
        df['peptide_id'] = df['sequence'].map(
            {ind: val for val, ind in enumerate(df['sequence'].unique())})
    logger.info('{} peptide sequences loaded'.format(np.max(df['peptide_id']) + 1))

    # EXCLUSION: the PSM does not participate in the alignment, but will
    # participate in the confidence update, since the PSM's associated peptide
    # will get parameters from the alignment.
    # This is NOT the same as "remove", which means that the PSM's associated
    # peptide does not have enough PSMs to participate in the alignment and
    # therefore receive parameters.

    # flag non-confident PSMs for exclusion from the alignment process
    df['exclude'] = (df['pep'] >= config['pep_threshold'])
    logger.info('Excluding {} / {} ({:.2%}) PSMs from alignment process after filtering at PEP threshold of {}'.format(
        np.sum(df['pep'] >= config['pep_threshold']), df.shape[0],
        np.sum(df['pep'] >= config['pep_threshold']) / df.shape[0],
        config['pep_threshold']))

    # only take the four required columns (plus the IDs) with us;
    # the rest were only needed for filtering and can be removed
    df = df[[
        'sequence', 'raw_file', 'retention_time', 'pep',
        'exp_id', 'peptide_id', 'input_id', 'id', 'exclude'
    ]]

    # sort by peptide_id, then exp_id
    df = df.sort_values(['peptide_id', 'exp_id'])

    return df, df_original
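# End-to-end sketch of the conversion stage (hypothetical call site): read
# and validate the config, then load, filter, and index the input files.
#
#   config = read_config_file(args)
#   df, df_original = process_files(config)
#   # df: filtered PSMs keyed by (peptide_id, exp_id), ready for alignment
#   # df_original: untouched input rows, with 'id' and 'remove' flags added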
def update(dfa, params, config):
    dfa = dfa.reset_index(drop=True)

    # refactorize peptide_id into stan_peptide_id,
    # to preserve continuity when feeding data into STAN
    dfa['stan_peptide_id'] = dfa['sequence'].map(
        {ind: val for val, ind in enumerate(dfa['sequence'].unique())})

    num_experiments = dfa['exp_id'].max() + 1
    num_peptides = dfa['peptide_id'].max() + 1
    exp_names = np.sort(dfa['raw_file'].unique())
    pep_id_list = dfa['peptide_id'].unique()

    # validate the parameters file: make sure it was generated from the same
    # input and filters, or else the program will crash in the code below.
    # check num_experiments and num_peptides
    if params['exp'].shape[0] != num_experiments or \
            params['peptide'].shape[0] != (dfa['stan_peptide_id'].max() + 1):
        raise ConfigFileError(
            'Parameters files have different data than the input data provided. Ensure that both the input list and filters used to generate the alignment parameters and those provided to the current update are the __exact__ same.')

    model = get_model_from_config(config)

    # mu from the STAN alignment
    dfa['mu'] = params['peptide']['mu'].values[dfa['stan_peptide_id']]

    # concatenate transformation parameters
    exp_params = pd.DataFrame({
        key: params['exp'][key][dfa['exp_id']]
        for key in model['exp_keys']}).reset_index(drop=True)
    dfa = pd.concat([dfa, exp_params], axis=1)

    # predict mus with RTs, and RTs with aligned mus
    dfa['mu_pred'] = model['rt_to_ref'](dfa, dfa['mu'], params)
    dfa['muij'] = model['ref_to_rt'](dfa, dfa['mu'], params)
    dfa['sigmaij'] = model['sigmaij_func'](dfa, params)
    # scaled sigma is the same ratio of muij / mu applied to sigmaij
    dfa['sigma_pred'] = dfa['sigmaij'] * dfa['mu_pred'] / dfa['muij']

    # get parameters for the null distributions for each experiment.
    # first column is the mean, second is the standard deviation
    null_dists = dfa.groupby('exp_id')['retention_time'].agg([np.mean, np.std])
    null_dists = np.array([null_dists['mean'].values, null_dists['std'].values]).T

    # ceiling PEPs at 1, otherwise we get incorrect negative densities
    # when plugging into Bayes' theorem
    dfa.loc[dfa['pep'] > 1.0, 'pep'] = 1.0

    # output table
    df_new = pd.DataFrame()

    bootstrap_method = 'none'
    if 'bootstrap_method' in config:
        bootstrap_method = config['bootstrap_method']
        logger.info('Using "{}" bootstrap method'.format(bootstrap_method))
    else:
        logger.info('Bootstrap method not defined, using point estimates to update confidence instead.')

    k = 20  # default number of bootstrap iterations
    if 'bootstrap_iters' in config:
        k = config['bootstrap_iters']
        if bootstrap_method != 'none':
            logger.info('Using {} bootstrap iterations'.format(k))

    logger.info('Updating PEPs...')
    for i, e in enumerate(np.sort(dfa['exp_id'].unique())):
        exp_name = exp_names[i]
        exp = dfa[dfa['exp_id'] == e]
        exp = exp.reset_index(drop=True)
        exp_peptides = exp['stan_peptide_id'].unique()

        logger.info('Exp ({} / {}) - {} - ({} Peptides, {} PSMs)'.format(
            i + 1, num_experiments, exp_name, len(exp_peptides), exp.shape[0]))

        # vector of P(RT|delta=1) for this experiment
        rt_plus = pd.Series(np.zeros(exp.shape[0]))

        if bootstrap_method != 'none':
            # to avoid using this experiment's own data to update the
            # confidence of its own observations, recalculate the reference
            # RTs (mu) without the data from this experiment, by either:
            # 1) non-parametric bootstrapping over the median of the predicted mus, OR
            # 2) parametric bootstrapping, using the RT distribution parameters

            # get predicted mus of peptides in this experiment, excluding
            # predicted mus transformed from RTs observed in this experiment
            dfe = dfa.loc[((dfa['stan_peptide_id'].isin(exp_peptides)) & (dfa['exp_id'] != e)),
                          ['stan_peptide_id', 'pep', 'mu_pred', 'mu', 'sigma_pred', 'exp_id']]

            # extract relevant values for each peptide
            mu_preds = dfe.groupby('stan_peptide_id')['mu_pred'].apply(lambda x: x.values).values.tolist()
            mus = dfe.groupby('stan_peptide_id')['mu'].apply(lambda x: x.values).values.tolist()
            sigma_preds = dfe.groupby('stan_peptide_id')['sigma_pred'].apply(lambda x: x.values).values.tolist()
            peps = dfe.groupby('stan_peptide_id')['pep'].apply(lambda x: x.values).values.tolist()
            exp_ids = dfe.groupby('stan_peptide_id')['exp_id'].apply(lambda x: x.values).values.tolist()

            # number of observations per peptide sequence
            obs_per_seq = [len(peptide) for peptide in mu_preds]
            num_peptides = len(mu_preds)

            # the number of observations for the current peptide -- used in the loops below
            num_obs = 0

            # n-by-k matrix of estimated mus from the bootstrapping,
            # iterated over in the loop after the immediate one
            mu_k = np.zeros((num_peptides, k))

            if bootstrap_method in ('parametric', 'parametric_mixture', 'parametric-mixture'):
                t_laplace_samples = 0
                t_coin_flips = 0
                t_null_samples = 0
                t_loop_indexing = 0
                t_medians = 0

                # create a pool of coin flips, instead of sampling for every
                # peptide. the pool is uniformly distributed from 0 to 1, and a
                # "successful" coin flip is one where the sample from the pool
                # is less than the measured PEP
                _time = time.time()
                coin_flip_pool = 0
                if bootstrap_method in ('parametric_mixture', 'parametric-mixture'):
                    coin_flip_pool = uniform.rvs(size=(np.sum(obs_per_seq) * k))
                t_coin_flips += (time.time() - _time)
                coin_counter = 0

                # create a pool of laplace samples, to pull from for each peptide
                _time = time.time()
                sample_pool = laplace.rvs(size=(np.sum(obs_per_seq) * k))
                t_laplace_samples += (time.time() - _time)
                # keep track of where we are in the pool with a counter
                sample_counter = 0

                # parametric bootstrap
                for p in range(0, num_peptides):
                    num_obs = obs_per_seq[p]

                    # sample num_obs synthetic RTs for k bootstrap iterations.
                    # draw samples from the sample pool, then reshape into a
                    # matrix where rows correspond to bootstrap iterations and
                    # columns correspond to sampled observations
                    _time = time.time()
                    samples = sample_pool[sample_counter:(sample_counter + (k * num_obs))]
                    samples = samples.reshape(k, num_obs)
                    # increment the sample counter
                    sample_counter += (k * num_obs)

                    # shift and scale the sampled RTs by mu_pred and sigma_pred, respectively
                    samples = (samples * sigma_preds[p]) + mu_preds[p]
                    t_laplace_samples += (time.time() - _time)

                    if bootstrap_method in ('parametric_mixture', 'parametric-mixture'):
                        # sample from the mixture distribution
                        _time = time.time()
                        # it is actually faster to replicate the sample matrix
                        # and take subindices from it than to sample from the
                        # null in every iteration of the loop below. this seems
                        # inefficient, especially given very small PEPs, but it
                        # is still better than sampling every iteration.
                        # we could probably optimize the size of the null
                        # sample matrix by looking at predicted false positive
                        # rates, but for now we go with the worst-case scenario
                        # and allow for all observations to be false positives.
                        null_samples = norm.rvs(size=(k * num_obs)).reshape(k, num_obs)
                        # shift and scale the sampled RTs by the mean and std of the null dists
                        null_samples = (null_samples * null_dists[exp_ids[p], 1]) + null_dists[exp_ids[p], 0]
                        t_null_samples += (time.time() - _time)

                        _time = time.time()
                        for j in range(0, num_obs):  # for each observation in the matrix:
                            # take a chunk of the coin flip pool
                            fp = (coin_flip_pool[coin_counter:(coin_counter + k)] < peps[p][j]).astype(bool)
                            coin_counter += k
                            # overwrite the original samples with samples from the null distribution
                            samples[fp, j] = null_samples[fp, j]
                        t_loop_indexing += (time.time() - _time)

                    _time = time.time()
                    # take the median of each row and store it in mu_k
                    mu_k[p] = np.median(samples, axis=1)
                    # or take the weighted mean:
                    # weights = ((1 - peps[p]) - (1 - config['pep_threshold'])) / config['pep_threshold']
                    # mu_k[p] = (np.sum(samples * weights, axis=1) / np.sum(weights))
                    t_medians += (time.time() - _time)

                logger.debug('laplace sampling: {:.1f} ms'.format(t_laplace_samples * 1000))
                logger.debug('coin flips: {:.1f} ms'.format(t_coin_flips * 1000))
                logger.debug('null sampling: {:.1f} ms'.format(t_null_samples * 1000))
                logger.debug('loop indexing: {:.1f} ms'.format(t_loop_indexing * 1000))
                logger.debug('taking medians: {:.1f} ms'.format(t_medians * 1000))

            elif bootstrap_method == 'non-parametric':
                # non-parametric bootstrap.
                # instead of generating random indices for the resampling for
                # each iteration and each peptide, generate a batch of random
                # numbers now and pull from them later. the counter keeps track
                # of which portion of the pool we're using
                counter = 0
                rand_pool = np.random.rand(np.sum(obs_per_seq) * k)

                for p in range(0, num_peptides):  # for each peptide sequence:
                    num_obs = obs_per_seq[p]
                    for j in range(0, k):  # for each iteration:
                        # re-estimate mu from the resampled mu_preds
                        # TODO: also allow the mean, or a weighted mean
                        mu_k[p][j] = np.median(mu_preds[p][
                            (rand_pool[counter:(counter + num_obs)] * num_obs).astype(int, copy=False)])
                        counter = counter + num_obs

            _t_dist_building = time.time()

            # map of stan_peptide_id onto 0:num_peptides
            pep_inds = {ind: var for var, ind in enumerate(exp_peptides)}
            pep_inds = exp['stan_peptide_id'].map(pep_inds)

            # for each bootstrap iteration, evaluate the transformed RTs
            # (predicted mus) on laplace distributions centered at the
            # bootstrapped, estimated mus
            for j in range(0, k):
                # rt_plus = rt_plus + laplace.pdf(exp['retention_time'],
                #     loc=model['ref_to_rt'](exp, mu_k[:, j][pep_inds], params),
                #     scale=exp['sigmaij'])
                rt_plus = rt_plus + laplace.pdf(exp['mu_pred'],
                                                loc=mu_k[:, j][pep_inds],
                                                scale=exp['sigma_pred'])

            # divide the total likelihood by the number of iterations to
            # normalize to an area of 1
            rt_plus = rt_plus / k

            logger.debug('distribution building: {:.1f} ms'.format((time.time() - _t_dist_building) * 1000))
        else:
            # not using the bootstrap: use the adjusted mu as a point estimate
            # for updating the confidence
            rt_plus = model['rt_plus_func'](exp)

        # Bayes' theorem update, where delta=1 is a correct ID (true positive)
        # and delta=0 is an incorrect ID (false positive):
        #
        #                                   P(RT|delta=0)*P(delta=0)
        # PEP.new = P(delta=0|RT) = ---------------------------------------------------
        #                           P(RT|delta=0)*P(delta=0) + P(RT|delta=1)*P(delta=1)
        #
        # P(RT|delta=0) = rt_minus = probability of the PSM's RT, given that
        # the PSM is incorrect; estimated from the empirical density of RTs
        # over the experiment
        rt_minus = model['rt_minus_func'](exp)

        # P(delta=0)    = probability that the PSM is incorrect (PEP)
        # P(delta=1)    = probability that the PSM is correct (1 - PEP)
        # P(RT|delta=1) = rt_plus = probability that, given a correct ID, the
        #                 RT falls in the aligned RT distribution for that
        #                 peptide, in that experiment
        pep_new = (rt_minus * exp['pep']) / \
            ((rt_minus * exp['pep']) + (rt_plus * (1.0 - exp['pep'])))

        # collect the update for the PSMs for which we have alignment data
        exp_new = pd.DataFrame({
            'rt_minus': rt_minus.tolist(),
            'rt_plus': rt_plus.tolist(),
            'mu': exp['mu'].values.tolist(),
            'muij': exp['muij'].values.tolist(),
            'sigmaij': exp['sigmaij'].values.tolist(),
            'pep_new': pep_new.tolist(),
            'id': exp['id'].values,
            'exp_id': exp['exp_id'].values,
            'peptide_id': exp['peptide_id'].values,
            'stan_peptide_id': exp['stan_peptide_id'].values,
            'input_id': exp['input_id'].values,
            'exclude': exp['exclude'].values
        })
        # append to the master DataFrame and continue
        df_new = df_new.append(exp_new)

    # reorder by ID and reset the index
    df_new = df_new.sort_values('id')
    df_new = df_new.reset_index(drop=True)

    return df_new
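# A worked instance of the Bayesian update above (hypothetical numbers).
# With a search-engine PEP of 0.5, an observed RT that is 10x more likely
# under the peptide's aligned RT distribution than under the experiment's
# null distribution pulls the error probability down sharply:
def _example_pep_update():
    pep = 0.5        # P(delta=0): prior probability the PSM is incorrect
    rt_minus = 0.02  # P(RT | delta=0): density under the null RT distribution
    rt_plus = 0.20   # P(RT | delta=1): density under the aligned RT distribution
    pep_new = (rt_minus * pep) / ((rt_minus * pep) + (rt_plus * (1.0 - pep)))
    return pep_new   # = 0.01 / (0.01 + 0.10) ~= 0.0909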