Example #1
import json

import spectrum_alignment


def calc_cos_scores(spectra, masses):
    if len(spectra) != len(masses):
        # A length mismatch is a bad argument, not an indexing failure
        raise ValueError('spectra and masses must be lists of the same length')
    cosScores = []
    for i, rSpec in enumerate(spectra):
        temp = []
        for j, cSpec in enumerate(spectra):
            if i == j:
                # A spectrum always has a cosine of 1.0 with itself
                temp.append(1.0)
            else:
                temp.append(spectrum_alignment.score_alignment(
                    rSpec, cSpec, masses[i], masses[j], 0.02)[0])
        cosScores.append(temp)
    # Serialize the matrix as JSON instead of hand-writing brackets; the
    # default separators reproduce the original bracket/comma layout.
    with open('cos_score_data', 'w') as cosScoreTxt:
        json.dump(cosScores, cosScoreTxt)
    return cosScores
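# A minimal usage sketch for calc_cos_scores (not from the original
# source). The (m/z, intensity) peak-tuple format is an assumption about
# what spectrum_alignment.score_alignment accepts.
toy_spectra = [
    [(100.0, 1.0), (150.0, 0.5)],  # hypothetical spectrum 1
    [(100.0, 0.9), (150.1, 0.4)],  # hypothetical spectrum 2
]
toy_masses = [300.0, 300.1]        # hypothetical precursor masses
cos_matrix = calc_cos_scores(toy_spectra, toy_masses)
print(cos_matrix)                  # nested lists with 1.0 on the diagonal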
from spectrum_alignment import score_alignment


def process_spectra_similarity(metadata, spectral_data, path):
    with open(path, "w") as fileout:
        fileout.write("The format is: ID, ID, similarity score\n\n")

        keys = list(metadata.keys())

        # Compare each unordered pair exactly once; requiring i < j skips
        # self-comparisons and reverse-order duplicates.
        for i, x in enumerate(keys):
            for j, y in enumerate(keys):
                if i >= j:
                    continue

                # Each metadata value is a list of tuples; the first tuple
                # holds the parent-mass info and index 1 of that tuple is
                # the parent mass value.
                parent_mass1 = float(metadata[x][0][1])
                parent_mass2 = float(metadata[y][0][1])

                score, reported_alignments = score_alignment(
                    spectral_data[x], spectral_data[y],
                    parent_mass1, parent_mass2, 0.3)

                fileout.write(x + " " + y + " " + str(score) + "\n")
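# A hedged usage sketch for process_spectra_similarity (not from the
# original source). The metadata layout, a dict mapping an ID to a list
# of tuples whose first tuple carries the parent mass at index 1, is
# inferred from the comments above; all values are made up.
example_metadata = {
    "spec_a": [("PEPMASS", "301.12")],
    "spec_b": [("PEPMASS", "301.15")],
}
example_spectra = {
    "spec_a": [(100.0, 1.0), (150.0, 0.5)],
    "spec_b": [(100.0, 0.9), (150.1, 0.4)],
}
process_spectra_similarity(example_metadata, example_spectra, "similarity.txt")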
def choose_representative_spectrum_most_similary_combination_score(
        spectrum_list, minimum_spectra_for_combination=4):
    #Pick the real spectrum whose average similarity to all other spectra is highest
    best_spectrum = None
    most_similar_score = -1000.0

    #for efficiency
    #spectrum_list = sorted(spectrum_list, key=lambda spectrum: spectrum["score"], reverse=True)
    for spectrum in spectrum_list:
        spectrum_unique_key = spectrum["filename"] + ":" + str(
            spectrum["scan"])
        new_score_dict = {}
        existing_score_dict = {}

        if "score_dict" in spectrum:
            existing_score_dict = spectrum["score_dict"]

        all_scores = []
        for other_spectrum in spectrum_list:
            other_spectrum_unique_key = other_spectrum["filename"] + ":" + str(
                other_spectrum["scan"])

            if spectrum_unique_key == other_spectrum_unique_key:
                continue

            total_score = 0.0

            if other_spectrum_unique_key in existing_score_dict:
                total_score = existing_score_dict[other_spectrum_unique_key]
            else:
                total_score, reported_alignments = spectrum_alignment.score_alignment(
                    spectrum["peaks"], other_spectrum["peaks"], spectrum["mz"],
                    other_spectrum["mz"], 0.1)

            new_score_dict[other_spectrum_unique_key] = total_score
            all_scores.append(total_score)

        # Average over the comparisons actually made (each spectrum is
        # scored against the N - 1 others; max() guards a singleton list)
        average_score = sum(all_scores) / max(len(all_scores), 1)

        # With few spectra the cosine average is noisy, so weight it by
        # how well the spectrum itself is annotated
        if len(spectrum_list) < minimum_spectra_for_combination:
            explained_intensity = spectrum["explained_intensity"]
            annotated_ions = spectrum["number_of_ions_annotated_above_SNR"]
            average_score = average_score * explained_intensity * annotated_ions

        if average_score > most_similar_score:
            most_similar_score = average_score
            best_spectrum = spectrum

        #Saving the new score matrix
        spectrum["score_dict"] = new_score_dict

    #print(best_spectrum)
    return best_spectrum
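# Hedged usage sketch (not from the original source): each spectrum dict
# needs "filename", "scan", "peaks", and "mz"; "explained_intensity" and
# "number_of_ions_annotated_above_SNR" are only consulted when fewer than
# minimum_spectra_for_combination spectra are given. Values are made up.
candidate_spectra = [
    {"filename": "run1.mzML", "scan": 10, "mz": 301.1,
     "peaks": [(100.0, 1.0), (150.0, 0.5)],
     "explained_intensity": 0.8, "number_of_ions_annotated_above_SNR": 6},
    {"filename": "run2.mzML", "scan": 22, "mz": 301.1,
     "peaks": [(100.0, 0.9), (150.1, 0.4)],
     "explained_intensity": 0.7, "number_of_ions_annotated_above_SNR": 5},
]
best = choose_representative_spectrum_most_similary_combination_score(
    candidate_spectra)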
    def cosine_spectrum(self, other_spectrum, peak_tolerance):
        # Charge-scaled variant: multiplies each precursor m/z by its
        # charge before aligning and forwards the charge to score_alignment
        total_score, reported_alignments = spectrum_alignment.score_alignment(
            self.peaks, other_spectrum.peaks,
            self.mz * self.charge, other_spectrum.mz * other_spectrum.charge,
            peak_tolerance, self.charge)
        return total_score, len(reported_alignments)
import requests
import pandas as pd
import networkx as nx

import ming_gnps_library


def create_masst_network(spectra_matches_df,
                         output_graphml,
                         output_image=None):
    # Loading all Datasets Information
    dataset_matches = list(set(spectra_matches_df["dataset_id"]))
    all_datasets = requests.get(
        "https://massive.ucsd.edu/ProteoSAFe/datasets_json.jsp#%7B%22query%22%3A%7B%7D%2C%22table_sort_history%22%3A%22createdMillis_dsc%22%7D"
    ).json()["datasets"]

    all_node_usi_list = []

    # Source MASST USI
    output_dict = {}
    output_dict[
        "usi"] = "mzspec:GNPS:TASK-c6b2797224f34d819d20dd7af622bc6b-spectra/:scan:1"
    output_dict["dataset"] = "QUERY"
    output_dict["scan"] = 1

    all_node_usi_list.append(output_dict)

    # Getting all the MASST data
    for dataset in dataset_matches:
        filtered_dataset = [
            current_dataset for current_dataset in all_datasets
            if current_dataset["dataset"] == dataset
        ]
        dataset_task = filtered_dataset[0]["task"]
        continuous_id = requests.get(
            "http://gnps.ucsd.edu/ProteoSAFe/ContinuousIDServlet?task={}".
            format(dataset_task)).json()

        network_url = "https://gnps.ucsd.edu/ProteoSAFe/result_json.jsp?task={}&view=clusters_network_pairs".format(
            continuous_id["jobs"][0]["task"])
        data = requests.get(network_url).json()['blockData']
        network_df = pd.DataFrame(data)

        dataset_spectra_matches = spectra_matches_df[
            spectra_matches_df["dataset_id"] == dataset]
        clusters_matched = list(set(dataset_spectra_matches["cluster_scan"]))

        # Grabbing identification information; fall back to an empty frame
        # when the dataset has no identifications available
        dataset_identifications_df = pd.DataFrame()
        try:
            dataset_identifications = ming_gnps_library.get_dataset_current_continuous_identifications(
                dataset_task)
            dataset_identifications_df = pd.DataFrame(dataset_identifications)
            #print(dataset_identifications_df.columns)
        except Exception:
            pass

        network_df["Node1"] = network_df["Node1"].astype(int)
        filtered_edges = network_df[network_df["Node1"].isin(clusters_matched)]

        for edge in filtered_edges.to_dict(orient="records"):
            cluster = edge["Node2"]
            usi = "mzspec:GNPS:TASK-{}-speccontinuous/speccontinuous-00000.mgf:scan:{}".format(
                continuous_id["jobs"][0]["task"], cluster)
            output_dict = {}
            output_dict["usi"] = usi
            output_dict["dataset"] = filtered_dataset[0]["dataset"]
            output_dict["scan"] = cluster

            try:
                filtered_identifications_df = dataset_identifications_df[
                    dataset_identifications_df["#Scan#"] == cluster]
                identification_dict = filtered_identifications_df.to_dict(
                    orient="records")[0]

                output_dict["Compound_Name"] = identification_dict[
                    "Compound_Name"]
                output_dict["Smiles"] = identification_dict["Smiles"]
                output_dict["INCHI"] = identification_dict["INCHI"]
                output_dict["MQScore"] = identification_dict["MQScore"]
                output_dict["SpectrumID"] = identification_dict["SpectrumID"]
            except Exception:
                # No identification row for this cluster scan
                pass

            all_node_usi_list.append(output_dict)

    # Now we will load up all the spectra and do stuff with it
    from ming_spectrum_library import Spectrum
    import spectrum_alignment
    all_spectra_list = []
    for usi_dict in all_node_usi_list:
        usi = usi_dict["usi"]
        display_information = "{}:{}".format(usi_dict["dataset"],
                                             usi_dict["scan"])

        url = "https://metabolomics-usi.ucsd.edu/json/?usi={}".format(usi)
        spectrum_json = requests.get(url).json()

        spectrum = Spectrum("", display_information, display_information,
                            spectrum_json["peaks"],
                            spectrum_json["precursor_mz"], 1, 2)

        spectrum.dataset = usi_dict["dataset"]
        spectrum.usi = usi_dict["usi"]
        spectrum.Compound_Name = usi_dict.get("Compound_Name", "N/A")
        spectrum.Smiles = usi_dict.get("Smiles", "N/A")
        spectrum.INCHI = usi_dict.get("INCHI", "N/A")
        spectrum.MQScore = usi_dict.get("MQScore", "N/A")
        spectrum.SpectrumID = usi_dict.get("SpectrumID", "N/A")

        all_spectra_list.append(spectrum)

    min_score = 0.7
    min_matched_peaks = 5

    # Let's create a network now
    G = nx.Graph()
    from tqdm import tqdm
    for i, spectrum1 in tqdm(enumerate(all_spectra_list)):
        for j, spectrum2 in enumerate(all_spectra_list):
            if i <= j:
                continue

            if spectrum1.usi == spectrum2.usi:
                continue

            # Doing a network here
            total_score, reported_alignments = spectrum_alignment.score_alignment(
                spectrum1.peaks,
                spectrum2.peaks,
                spectrum1.mz,
                spectrum2.mz,
                0.5,
                max_charge_consideration=1)
            if total_score < min_score:
                continue

            if len(reported_alignments) < min_matched_peaks:
                continue

            G.add_edge(spectrum1.scan,
                       spectrum2.scan,
                       cosine_score=total_score,
                       matched_peaks=len(reported_alignments))

            # Adding node attributes for both edge endpoints
            for s in (spectrum1, spectrum2):
                G.nodes[s.scan]["mz"] = s.mz
                G.nodes[s.scan]["dataset"] = s.dataset
                G.nodes[s.scan]["Compound_Name"] = s.Compound_Name
                G.nodes[s.scan]["Smiles"] = s.Smiles
                G.nodes[s.scan]["INCHI"] = s.INCHI
                G.nodes[s.scan]["MQScore"] = s.MQScore

    import matplotlib.pyplot as plt
    import molecular_network_filtering_library

    molecular_network_filtering_library.filter_top_k(G, 10)

    nx.write_graphml(G, output_graphml)
    if output_image is not None:
        # Only render the figure when an image file is requested
        nx.draw(G, with_labels=True, font_weight='bold')
        plt.savefig(output_image, format="PNG")
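# Hedged usage sketch for create_masst_network (not from the original
# source). It expects a DataFrame with at least "dataset_id" and
# "cluster_scan" columns; the accession below is a placeholder, and the
# function performs live GNPS/MassIVE requests, so it only runs online.
example_matches = pd.DataFrame([
    {"dataset_id": "MSV000000000", "cluster_scan": 123},  # placeholder
])
create_masst_network(example_matches,
                     "masst_network.graphml",
                     output_image="masst_network.png")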
from collections import defaultdict

import ming_spectrum_library
import ming_ambiguity_library
import ming_psm_library
import spectrum_alignment


def calculated_ambiguity(parameter_map, peak_tolerance):
    filename = parameter_map["filename"]
    scan_mapping = parameter_map["scan_mapping"]

    spectrum_collection = ming_spectrum_library.SpectrumCollection(filename)
    spectrum_collection.load_from_file()

    return_ambiguity_mapping = defaultdict(dict)

    for scan in scan_mapping:
        spectrum_obj = spectrum_collection.scandict[int(scan)]
        #Lets determine if the strings are actually ambiguous
        ambiguous_list = ming_ambiguity_library.collapse_ambiguous_from_annotations_list(
            scan_mapping[scan])
        #print(ambiguous_list)
        if len(ambiguous_list) == 1:
            score_summary = {}
            score_summary["ambiguity_total_score"] = -1
            score_summary["first_unique_count"] = -1
            score_summary["second_unique_count"] = -1
            score_summary["first_unique_intensity"] = -1
            score_summary["second_unique_intensity"] = -1
            score_summary["first_second_unique_ratio"] = -1

            return_ambiguity_mapping[scan] = score_summary

            continue

        if len(ambiguous_list) > 2:
            score_summary = {}
            score_summary["ambiguity_total_score"] = 10
            score_summary["first_unique_count"] = 10
            score_summary["second_unique_count"] = 10
            score_summary["first_unique_intensity"] = 10
            score_summary["second_unique_intensity"] = 10
            score_summary["first_second_unique_ratio"] = -1

            return_ambiguity_mapping[scan] = score_summary
            continue

        peptide_to_extracted_peaks_mapping = {}
        for peptide in ambiguous_list:
            theoreteical_peaks = ming_psm_library.create_theoretical_peak_map(
                peptide, ["b", "y"])
            original_peaks = spectrum_obj.peaks
            extracted_peaks = extract_annotated_peaks(theoreteical_peaks,
                                                      original_peaks,
                                                      peak_tolerance)
            peptide_to_extracted_peaks_mapping[peptide] = extracted_peaks

            #print("Original:\t%d\tExtracted:\t%d" % (len(original_peaks), len(extracted_peaks)))
            #print(original_peaks)
            #print(extracted_peaks)
            #print(theoreteical_peaks)

        #Checkout overlap of stuff
        first_peaks = peptide_to_extracted_peaks_mapping[list(
            peptide_to_extracted_peaks_mapping.keys())[0]]
        second_peaks = peptide_to_extracted_peaks_mapping[list(
            peptide_to_extracted_peaks_mapping.keys())[1]]
        total_score, reported_alignments = spectrum_alignment.score_alignment(
            first_peaks, second_peaks, spectrum_obj.mz, spectrum_obj.mz,
            peak_tolerance)

        first_total = len(first_peaks)
        second_total = len(second_peaks)
        intersection_total = len(reported_alignments)
        first_unique_count = first_total - intersection_total
        second_unique_count = second_total - intersection_total

        #Calculating the explained intensity in each of these
        peaks_1_normed = spectrum_alignment.sqrt_normalize_spectrum(
            spectrum_alignment.convert_to_peaks(first_peaks))
        peaks_2_normed = spectrum_alignment.sqrt_normalize_spectrum(
            spectrum_alignment.convert_to_peaks(second_peaks))

        first_aligned_index = []
        second_aligned_index = []

        for alignment in reported_alignments:
            first_aligned_index.append(alignment.peak1)
            second_aligned_index.append(alignment.peak2)

        #intensity values
        first_unique = []
        second_unique = []

        for i in range(len(peaks_1_normed)):
            if i not in first_aligned_index:
                first_unique.append(peaks_1_normed[i][1])

        for i in range(len(peaks_2_normed)):
            if i not in second_aligned_index:
                second_unique.append(peaks_2_normed[i][1])

        first_unique_intensity = sum(x * x for x in first_unique)
        second_unique_intensity = sum(x * x for x in second_unique)

        first_second_unique_ratio = 0
        try:
            first_second_unique_ratio = min(
                first_unique_intensity, second_unique_intensity) / max(
                    first_unique_intensity, second_unique_intensity)
        except ZeroDivisionError:
            # Both unique intensity sums are zero; cap the ratio
            first_second_unique_ratio = 10

        if first_second_unique_ratio > 10:
            first_second_unique_ratio = 10

        #print(reported_alignments)
        #print(peaks_1_normed)
        #print("FirstCount\t%d\tSecondCount\t%d\tFirstInt\t%f\tSecondInt\t%f" % (first_unique_count, second_unique_count, first_unique_intensity, second_unique_intensity))

        score_summary = {}
        score_summary["ambiguity_total_score"] = total_score
        score_summary["first_unique_count"] = first_unique_count
        score_summary["second_unique_count"] = second_unique_count
        score_summary["first_unique_intensity"] = first_unique_intensity
        score_summary["second_unique_intensity"] = second_unique_intensity
        score_summary["first_second_unique_ratio"] = first_second_unique_ratio

        return_ambiguity_mapping[scan] = score_summary

    return return_ambiguity_mapping
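# Hedged usage sketch for calculated_ambiguity (not from the original
# source). parameter_map bundles a spectrum file path with a
# scan -> annotation-list mapping; the shapes are inferred from the code
# above and the values are illustrative only.
params = {
    "filename": "spectra.mzXML",                 # hypothetical file
    "scan_mapping": {5: ["PEPTIDE", "PEPTIDE"]}, # annotations for scan 5
}
ambiguity_by_scan = calculated_ambiguity(params, peak_tolerance=0.05)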
Example #8
    def cosine_spectrum(self, other_spectrum, peak_tolerance):
        # Simpler variant: aligns on the raw precursor m/z values and
        # returns only the cosine score
        total_score, reported_alignments = spectrum_alignment.score_alignment(
            self.peaks, other_spectrum.peaks, self.mz, other_spectrum.mz,
            peak_tolerance)
        return total_score
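# Hedged usage note (not from the original source): both cosine_spectrum
# variants above wrap spectrum_alignment.score_alignment; the Example #8
# version compares raw precursor m/z values, while the charge-scaled
# variant earlier multiplies m/z by charge. Assuming two spectrum objects
# with peaks/mz attributes:
#
#     score = spectrum1.cosine_spectrum(spectrum2, peak_tolerance=0.1)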