def id_mapper(in_file, id_file, out_file, params, use_centroid_rt,
              use_centroid_mz, use_subelements):
    """Transfer peptide/protein identifications from an idXML file onto the
    features of ``in_file`` and write the annotated result to ``out_file``.

    The input format is auto-detected: consensusXML and mzq inputs are
    annotated per consensus map (honouring ``use_subelements``), featureXML
    inputs are annotated using the centroid RT/MZ switches.  Any other
    format raises an Exception.
    """
    # Load the identifications once; they are shared by every branch below.
    proteins = []
    peptides = []
    pms.IdXMLFile().load(id_file, proteins, peptides)

    annotator = pms.IDMapper()
    annotator.setParameters(params)

    # Every branch records the same processing action in the output's history.
    action = pms.DataProcessing.ProcessingAction.IDENTIFICATION_MAPPING
    input_type = pms.FileHandler.getType(in_file)

    if input_type == pms.Type.CONSENSUSXML:
        xml_io = pms.ConsensusXMLFile()
        cmap = pms.ConsensusMap()
        xml_io.load(in_file, cmap)
        annotator.annotate(cmap, peptides, proteins, use_subelements)
        addDataProcessing(cmap, params, action)
        xml_io.store(out_file, cmap)
    elif input_type == pms.Type.FEATUREXML:
        xml_io = pms.FeatureXMLFile()
        fmap = pms.FeatureMap()
        xml_io.load(in_file, fmap)
        annotator.annotate(fmap, peptides, proteins, use_centroid_rt,
                           use_centroid_mz)
        addDataProcessing(fmap, params, action)
        xml_io.store(out_file, fmap)
    elif input_type == pms.Type.MZQ:
        xml_io = pms.MzQuantMLFile()
        quantifications = pms.MSQuantifications()
        xml_io.load(in_file, quantifications)
        # An mzq file can hold several consensus maps; annotate each in turn.
        cmaps = quantifications.getConsensusMaps()
        for cmap in cmaps:
            annotator.annotate(cmap, peptides, proteins, use_subelements)
            addDataProcessing(cmap, params, action)
        quantifications.setConsensusMaps(cmaps)
        xml_io.store(out_file, quantifications)
    else:
        raise Exception("invalid input file format")
def link(in_files, out_file, keep_subelements, params):
    """Group corresponding features across several featureXML (or
    consensusXML) input files into one consensus map and store it as
    consensusXML in ``out_file``.

    All inputs must share one format; mixing formats raises an Exception.
    For consensusXML inputs, ``keep_subelements`` controls whether the
    original sub-features are transferred into the result or replaced by
    fresh per-file column headers.  Finally a size histogram of the
    consensus features is printed.
    """
    in_types = {pms.FileHandler.getType(in_) for in_ in in_files}

    if in_types == {pms.Type.CONSENSUSXML}:
        link_features = False
    elif in_types == {pms.Type.FEATUREXML}:
        link_features = True
    else:
        raise Exception("different kinds of input files")

    algorithm_parameters = params.copy("algorithm:", True)
    algorithm = pms.FeatureGroupingAlgorithmQT()
    algorithm.setParameters(algorithm_parameters)

    out_map = pms.ConsensusMap()
    fds = out_map.getColumnHeaders()
    if link_features:
        f = pms.FeatureXMLFile()
        maps = []
        for i, in_file in enumerate(in_files):
            map_ = pms.FeatureMap()
            f.load(in_file, map_)
            # set filedescriptions
            fd = fds.get(i, pms.ColumnHeader())
            fd.filename = in_file
            fd.size = map_.size()
            fd.unique_id = map_.getUniqueId()
            fds[i] = fd
            maps.append(map_)
        out_map.setColumnHeaders(fds)
        algorithm.group(maps, out_map)
    else:
        f = pms.ConsensusXMLFile()
        maps = []
        for i, in_file in enumerate(in_files):
            map_ = pms.ConsensusMap()
            f.load(in_file, map_)
            maps.append(map_)
        algorithm.group(maps, out_map)

        if not keep_subelements:
            for i in range(len(in_files)):
                # set filedescriptions
                fd = fds.get(i, pms.ColumnHeader())
                fd.filename = in_files[i]
                fd.size = maps[i].size()
                fd.unique_id = maps[i].getUniqueId()
                fds[i] = fd
            out_map.setColumnHeaders(fds)
        else:
            algorithm.transferSubelements(maps, out_map)

    out_map.setUniqueIds()
    addDataProcessing(out_map, params,
                      pms.DataProcessing.ProcessingAction.FEATURE_GROUPING)
    pms.ConsensusXMLFile().store(out_file, out_map)

    # Report how many consensus features were formed, bucketed by the
    # number of sub-features they group.
    # NOTE: the original used Python 2 ``print`` statements, which are
    # syntax errors under Python 3 (the rest of this file uses f-strings);
    # converted to print() calls.
    sizes = [feat.size() for feat in out_map]
    c = Counter(sizes)
    print("Number of consensus features:")
    for size, count in c.most_common():
        print(" of size %2d : %6d" % (size, count))
    print(" total : %6d" % out_map.size())
def align_feature_xmls(feature_xml_lis, consensus_map_out_path="",
                       class_label_dict=None):
    """ first apply pose clustering to include all features maps
          next link/group them across all features

        Each MS1 spectrum from raw-file will create a feature file -
        we need to load and align them to get unique and representative features

    :param feature_xml_lis: iterable of featureXML paths (coerced to str, sorted)
    :param consensus_map_out_path: if non-empty, store the consensus map here
    :param class_label_dict: optional mapping whose keys (filenames) replace the
        featureXML-derived column-header filenames; must be ordered consistently
        with the sorted featureXML inputs
    :return: consensus_map, measurement_names
    """
    # FIX: mutable default argument ({}) replaced by None sentinel.
    if class_label_dict is None:
        class_label_dict = {}

    # do consensus map normalization and export -
    # can't hack normalization together from lack of example usage and poor signature
    # - no normalization implemented

    # openms won't deal with posix paths - wants to have strings instead
    # need to make sure it get's those
    # let's sort them to make sure feature matrix is also sorted
    feature_xml_lis = sorted([str(fx) for fx in feature_xml_lis])

    num_features_list = []
    for current_feature_xml_path in feature_xml_lis:
        # load features into FeatureMaps
        cm = oms.FeatureMap()  # current_map
        oms.FeatureXMLFile().load(current_feature_xml_path, cm)
        # list_functions(current_map, prefix="")
        num_features_list.append(cm.size())
        del cm

    # should choose the feature file / experiment with most features as reference
    max_index = np.argmax(num_features_list)
    reference_map_path = feature_xml_lis[max_index]

    default_max_num_peaks_considered = 1000
    default_max_scaling_value = 10.0
    aligned_paths = []
    for i, current_feature_xml_path in enumerate(feature_xml_lis):
        # load features into FeatureMaps
        reference_map = oms.FeatureMap()  # pairwise alignment - so need master map -
        oms.FeatureXMLFile().load(reference_map_path, reference_map)

        current_map = oms.FeatureMap()
        oms.FeatureXMLFile().load(current_feature_xml_path, current_map)

        # create a transformation description required as init for aligner
        transformation_description = oms.TransformationDescription()

        # adjust max scaling parameter otherwise leads to error when running with algae samples
        # adjust max num peaks to 2k - also would leads to error when running with algae samples
        aligner = oms.MapAlignmentAlgorithmPoseClustering()
        aligner_params = aligner.getParameters()
        # print(aligner_params.asDict().keys())
        max_scaling_key = b'superimposer:max_scaling'
        # aligner_params.getEntry(max_scaling_key)
        aligner_params.setValue(max_scaling_key, default_max_scaling_value)
        max_num_peaks_key = b'max_num_peaks_considered'
        # aligner_params.getEntry(max_num_peaks_key)
        aligner_params.setValue(
            max_num_peaks_key,
            default_max_num_peaks_considered)  # default = 1000
        # need higher default for algae

        # decrease runtime by removing weak signals
        # print(aligner_params.asDict())
        num_used_points_key = b'superimposer:num_used_points'
        # aligner_params.getEntry(num_used_points_key)
        aligner_params.setValue(
            num_used_points_key,
            1000)  # half the default parameter, speed up alignment

        aligner.setParameters(aligner_params)
        aligner.setReference(reference_map)

        try:
            # run alignment
            aligner.align(current_map, transformation_description)
        # FIX: was "as re", which shadowed the stdlib ``re`` module name.
        except RuntimeError as err:
            if 'max_num_peaks_considered' in str(err):
                # retry with higher threshold - required for algae dataset
                default_max_num_peaks_considered = 15000  # 15 fold - makes it a lot slower but less error prone
                aligner_params.setValue(max_num_peaks_key,
                                        default_max_num_peaks_considered)
                default_max_scaling_value = 20.0  # need to increase to 20
                aligner_params.setValue(max_scaling_key,
                                        default_max_scaling_value)

                # max shift could also be off - issue for ckd dataset
                default_max_shift_value = 2000.0  # need to increase from 1000 to 2000
                max_shift_key = b'superimposer:max_shift'
                aligner_params.setValue(max_shift_key, default_max_shift_value)

                print(
                    f"Encountered GC/MS Clustering issue - setting 'max_num_peaks_considered' to {default_max_num_peaks_considered}, 'superimposer:max_scaling' to {default_max_scaling_value} and 'superimposer:max_shift' to {default_max_shift_value}"
                )
                aligner.setParameters(aligner_params)
                aligner.setReference(reference_map)
                aligner.align(current_map, transformation_description)
            else:
                # FIX: previously any other RuntimeError was silently swallowed
                # and processing continued with an UNALIGNED map; re-raise it.
                raise

        current_map.updateRanges()
        reference_map.updateRanges()

        # update feature XML files - both reference and current
        updated_current_map_path = default_store_aligned_feature_xml(
            current_map, current_feature_xml_path)
        updated_reference_path = default_store_aligned_feature_xml(
            reference_map, reference_map_path)
        reference_map_path = updated_reference_path

        aligned_paths.append(updated_current_map_path)
        print(f"Finished alignment of {i}/{len(feature_xml_lis)-1}")

    # also replace here with new reference we updated the reference map to
    aligned_paths[max_index] = reference_map_path

    # link/group them across features to create consensus map
    grouper = oms.FeatureGroupingAlgorithmUnlabeled()
    # leave parameters default

    # according to openms documentation:
    # b) Call "setReference", "addToGroup" (n times), "getResultMap" in that order.
    for i, current_feature_map_path in enumerate(aligned_paths):
        print(f"Grouping features {i}/{len(aligned_paths)-1}")
        current_map = oms.FeatureMap()
        oms.FeatureXMLFile().load(current_feature_map_path, current_map)

        if not i:
            # first iteration - use as reference
            grouper.setReference(i, current_map)
        else:
            grouper.addToGroup(i, current_map)

    # get consensus map
    consensus_map = grouper.getResultMap()

    # consensus map requires some mapping between ids and filenames - otherwise will complain
    print("Mapping aligned results back to class labels")
    class_label_fns = list(class_label_dict.keys())
    fds = {i: oms.ColumnHeader() for i, _ in enumerate(aligned_paths)}
    measurement_names = []
    for i, aligned_path in enumerate(aligned_paths):
        # fds[i].filename = b"file0"
        current_fn = f"{str(Path(aligned_path).stem)}{str(Path(aligned_path).suffix)}"

        # this is where we need to replace the feature_xml filenames with the ones from class_labels
        if class_label_dict:
            # could do longest substring match with each of the fns in class_label dict to find matching filename
            # django will rename duplicate filenames instead of overwriting
            # or we expect both featureXML input and class_label_dict to be ordered - which they should be when using the getter
            fds[i].filename = class_label_fns[i]
        else:
            fds[i].filename = current_fn.encode(
                "UTF8")  # needs bytestring representation

        measurement_names.append(current_fn)
    consensus_map.setColumnHeaders(fds)

    # cleanup aligned_feature_xmls - can be >30mb per file - so better remove them
    for ap in aligned_paths:
        os.remove(ap)

    # do consensus map normalization and export to consensus files
    # using median normalization, also available are Quantile and "robust regression"
    normalizer = oms.ConsensusMapNormalizerAlgorithmMedian()

    # ConsensusMapNormalizerAlgorithmMedian
    # signature of class is more than incomplete ... *args **kwargs for required parameters is not the best implementation choice...
    # but gives TypeError requiring int when calling with
    # normalizer.normalizeMaps(consensus_map, "NM_SCALE", "", "") #
    """
    normalizer.normalizeMaps(map, method, acc_filter, desc_filter)
    map	ConsensusMap
    method	whether to use scaling or shifting to same median
    acc_filter	string describing the regular expression for filtering accessions
    desc_filter	string describing the regular expression for filtering descriptions
    """
    """
    method: probably 0 / 1 - referenced as Enumerator in OpenMS documentation
    from shell output can deduce normalization methods are
    0: NM_SCALE 	scale to same median using division/multiplication
    1: NM_SHIFT 	shift using subtraction/addition
    """
    normalizer.normalizeMaps(consensus_map, 0, "", "")

    # don't export if not required - requires more file management
    # now export
    if consensus_map_out_path:
        print("Storing consensus xml")
        oms.ConsensusXMLFile().store(str(consensus_map_out_path),
                                     consensus_map)

    return consensus_map, measurement_names